Make string-limit with encoding return complete glyphs

* lisp/emacs-lisp/subr-x.el (string-limit): Return more correct results in the CODING-SYSTEM case for coding systems with BOM and charset designations (bug#48324). Also amend the algorithm to return complete glyphs, not just complete code points.
2024-11-26 07:33:47 +00:00 · 2022-07-03 14:05:01 +02:00 · 2022-07-03 14:05:01 +02:00 · b72e4b1493
commit b72e4b1493
parent cfee07d4dd
2 changed files with 71 additions and 35 deletions
--- a/lisp/emacs-lisp/subr-x.el
+++ b/lisp/emacs-lisp/subr-x.el
@ -167,9 +167,9 @@ non-nil, return the last LENGTH characters instead.
 If CODING-SYSTEM is non-nil, STRING will be encoded before
 limiting, and LENGTH is interpreted as the number of bytes to
 limit the string to.  The result will be a unibyte string that is
-shorter than LENGTH, but will not contain \"partial\" characters,
-even if CODING-SYSTEM encodes characters with several bytes per
-character.
+shorter than LENGTH, but will not contain \"partial\"
+characters (or glyphs), even if CODING-SYSTEM encodes characters
+with several bytes per character.

 When shortening strings for display purposes,
 `truncate-string-to-width' is almost always a better alternative
@ -177,34 +177,55 @@ than this function."
  (unless (natnump length)
    (signal 'wrong-type-argument (list 'natnump length)))
  (if coding-system
-      (let ((result nil)
-            (result-length 0)
-            (index (if end (1- (length string)) 0)))
-        ;; FIXME: This implementation, which uses encode-coding-char
-        ;; to encode the string one character at a time, is in general
-        ;; incorrect: coding-systems that produce prefix or suffix
-        ;; bytes, such as ISO-2022-based or UTF-8/16 with BOM, will
-        ;; produce those bytes for each character, instead of just
-        ;; once for the entire string.  encode-coding-char attempts to
-        ;; remove those extra bytes at least in some situations, but
-        ;; it cannot do that in all cases.  And in any case, producing
-        ;; what is supposed to be a UTF-16 or ISO-2022-CN encoded
-        ;; string which lacks the BOM bytes at the beginning and the
-        ;; charset designation sequences at the head and tail of the
-        ;; result will definitely surprise the callers in some cases.
-        (while (let ((encoded (encode-coding-char
-                               (aref string index) coding-system)))
-                 (and (<= (+ (length encoded) result-length) length)
-                      (progn
-                        (push encoded result)
-                        (cl-incf result-length (length encoded))
-                        (setq index (if end (1- index)
-                                      (1+ index))))
-                      (if end (> index -1)
-                        (< index (length string)))))
-          ;; No body.
-          )
-        (apply #'concat (if end result (nreverse result))))
+      ;; The previous implementation here tried to encode char by
+      ;; char, and then adding up the length of the encoded octets,
+      ;; but that's not reliably in the presence of BOM marks and
+      ;; ISO-2022-CN which may add charset designations at the
+      ;; start/end of each encoded char (which we don't want).  So
+      ;; iterate (with a binary search) instead to find the desired
+      ;; length.
+      (let* ((glyphs (string-glyph-split string))
+             (nglyphs (length glyphs))
+             (too-long (1+ nglyphs))
+             (stop (max (/ nglyphs 2) 1))
+             (gap stop)
+             candidate encoded found candidate-stop)
+        ;; We're returning the end of the string.
+        (when end
+          (setq glyphs (nreverse glyphs)))
+        (while (and (not found)
+                    (< stop too-long))
+          (setq encoded
+                (encode-coding-string (string-join (seq-take glyphs stop))
+                                      coding-system))
+          (cond
+           ((= (length encoded) length)
+            (setq found encoded
+                  candidate-stop stop))
+           ;; Too long; try shortening.
+           ((> (length encoded) length)
+            (setq too-long stop
+                  stop (max (- stop gap) 1)))
+           ;; Too short; try lengthening.
+           (t
+            (setq candidate encoded
+                  candidate-stop stop)
+            (setq stop
+                  (if (>= stop nglyphs)
+                      too-long
+                    (min (+ stop gap) nglyphs)))))
+          (setq gap (max (/ gap 2) 1)))
+        (cond
+         ((not (or found candidate))
+          "")
+         ;; We're returning the end, so redo the encoding.
+         (end
+          (encode-coding-string
+           (string-join (nreverse (seq-take glyphs candidate-stop)))
+           coding-system))
+         (t
+          (or found candidate))))
+    ;; Char-based version.
    (cond
     ((<= (length string) length) string)
     (end (substring string (- (length string) length)))
--- a/test/lisp/emacs-lisp/subr-x-tests.el
+++ b/test/lisp/emacs-lisp/subr-x-tests.el
@ -607,21 +607,36 @@
  (should (equal (string-limit "foó" 4 nil 'utf-8) "fo\303\263"))
  (should (equal (string-limit "foóa" 4 nil 'utf-8) "fo\303\263"))
  (should (equal (string-limit "foóá" 4 nil 'utf-8) "fo\303\263"))
+  (should (equal (string-limit "foóá" 2 nil 'utf-8-with-signature)
+                 ""))
  (should (equal (string-limit "foóá" 4 nil 'utf-8-with-signature)
-                 "fo\303\263"))
+                 "\357\273\277f"))
  (should (equal (string-limit "foóa" 4 nil 'iso-8859-1) "fo\363a"))
  (should (equal (string-limit "foóá" 4 nil 'iso-8859-1) "fo\363\341"))
-  (should (equal (string-limit "foóá" 4 nil 'utf-16) "\000f\000o"))
+  (should (equal (string-limit "foóá" 3 nil 'utf-16) ""))
+  (should (equal (string-limit "foóá" 6 nil 'utf-16) "\376\377\000f\000o"))

  (should (equal (string-limit "foó" 10 t 'utf-8) "fo\303\263"))
  (should (equal (string-limit "foó" 3 t 'utf-8) "o\303\263"))
  (should (equal (string-limit "foó" 4 t 'utf-8) "fo\303\263"))
  (should (equal (string-limit "foóa" 4 t 'utf-8) "o\303\263a"))
  (should (equal (string-limit "foóá" 4 t 'utf-8) "\303\263\303\241"))
-  (should (equal (string-limit "foóá" 2 t 'utf-8-with-signature) "\303\241"))
+  (should (equal (string-limit "foóá" 2 t 'utf-8-with-signature)
+                 ""))
  (should (equal (string-limit "foóa" 4 t 'iso-8859-1) "fo\363a"))
  (should (equal (string-limit "foóá" 4 t 'iso-8859-1) "fo\363\341"))
-  (should (equal (string-limit "foóá" 4 t 'utf-16) "\000\363\000\341")))
+  (should (equal (string-limit "foóá" 6 t 'utf-16) "\376\377\000\363\000\341")))
+
+(ert-deftest subr-string-limit-glyphs ()
+  (should (equal (encode-coding-string "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 'utf-8)
+                 "Hello, \360\237\221\274\360\237\217\273\360\237\247\221\360\237\217\274\342\200\215\360\237\244\235\342\200\215\360\237\247\221\360\237\217\273"))
+  (should (= (length (encode-coding-string "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 'utf-8)) 41))
+  (should (equal (string-limit "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 100 nil 'utf-8)
+                 "Hello, \360\237\221\274\360\237\217\273\360\237\247\221\360\237\217\274\342\200\215\360\237\244\235\342\200\215\360\237\247\221\360\237\217\273"))
+  (should (equal (string-limit "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 15 nil 'utf-8)
+                 "Hello, \360\237\221\274\360\237\217\273"))
+  (should (equal (string-limit "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 10 nil 'utf-8)
+                 "Hello, ")))

 (ert-deftest subr-string-lines ()
  (should (equal (string-lines "foo") '("foo")))