mirror of
https://git.savannah.gnu.org/git/emacs.git
synced 2024-11-26 07:33:47 +00:00
Make string-limit with encoding return complete glyphs
* lisp/emacs-lisp/subr-x.el (string-limit): Return more correct results in the CODING-SYSTEM case for coding systems with BOM and charset designations (bug#48324). Also amend the algorithm to return complete glyphs, not just complete code points.
This commit is contained in:
parent
cfee07d4dd
commit
b72e4b1493
@ -167,9 +167,9 @@ non-nil, return the last LENGTH characters instead.
|
||||
If CODING-SYSTEM is non-nil, STRING will be encoded before
|
||||
limiting, and LENGTH is interpreted as the number of bytes to
|
||||
limit the string to. The result will be a unibyte string that is
|
||||
shorter than LENGTH, but will not contain \"partial\" characters,
|
||||
even if CODING-SYSTEM encodes characters with several bytes per
|
||||
character.
|
||||
shorter than LENGTH, but will not contain \"partial\"
|
||||
characters (or glyphs), even if CODING-SYSTEM encodes characters
|
||||
with several bytes per character.
|
||||
|
||||
When shortening strings for display purposes,
|
||||
`truncate-string-to-width' is almost always a better alternative
|
||||
@ -177,34 +177,55 @@ than this function."
|
||||
(unless (natnump length)
|
||||
(signal 'wrong-type-argument (list 'natnump length)))
|
||||
(if coding-system
|
||||
(let ((result nil)
|
||||
(result-length 0)
|
||||
(index (if end (1- (length string)) 0)))
|
||||
;; FIXME: This implementation, which uses encode-coding-char
|
||||
;; to encode the string one character at a time, is in general
|
||||
;; incorrect: coding-systems that produce prefix or suffix
|
||||
;; bytes, such as ISO-2022-based or UTF-8/16 with BOM, will
|
||||
;; produce those bytes for each character, instead of just
|
||||
;; once for the entire string. encode-coding-char attempts to
|
||||
;; remove those extra bytes at least in some situations, but
|
||||
;; it cannot do that in all cases. And in any case, producing
|
||||
;; what is supposed to be a UTF-16 or ISO-2022-CN encoded
|
||||
;; string which lacks the BOM bytes at the beginning and the
|
||||
;; charset designation sequences at the head and tail of the
|
||||
;; result will definitely surprise the callers in some cases.
|
||||
(while (let ((encoded (encode-coding-char
|
||||
(aref string index) coding-system)))
|
||||
(and (<= (+ (length encoded) result-length) length)
|
||||
(progn
|
||||
(push encoded result)
|
||||
(cl-incf result-length (length encoded))
|
||||
(setq index (if end (1- index)
|
||||
(1+ index))))
|
||||
(if end (> index -1)
|
||||
(< index (length string)))))
|
||||
;; No body.
|
||||
)
|
||||
(apply #'concat (if end result (nreverse result))))
|
||||
;; The previous implementation here tried to encode char by
|
||||
;; char, and then adding up the length of the encoded octets,
|
||||
;; but that's not reliably in the presence of BOM marks and
|
||||
;; ISO-2022-CN which may add charset designations at the
|
||||
;; start/end of each encoded char (which we don't want). So
|
||||
;; iterate (with a binary search) instead to find the desired
|
||||
;; length.
|
||||
(let* ((glyphs (string-glyph-split string))
|
||||
(nglyphs (length glyphs))
|
||||
(too-long (1+ nglyphs))
|
||||
(stop (max (/ nglyphs 2) 1))
|
||||
(gap stop)
|
||||
candidate encoded found candidate-stop)
|
||||
;; We're returning the end of the string.
|
||||
(when end
|
||||
(setq glyphs (nreverse glyphs)))
|
||||
(while (and (not found)
|
||||
(< stop too-long))
|
||||
(setq encoded
|
||||
(encode-coding-string (string-join (seq-take glyphs stop))
|
||||
coding-system))
|
||||
(cond
|
||||
((= (length encoded) length)
|
||||
(setq found encoded
|
||||
candidate-stop stop))
|
||||
;; Too long; try shortening.
|
||||
((> (length encoded) length)
|
||||
(setq too-long stop
|
||||
stop (max (- stop gap) 1)))
|
||||
;; Too short; try lengthening.
|
||||
(t
|
||||
(setq candidate encoded
|
||||
candidate-stop stop)
|
||||
(setq stop
|
||||
(if (>= stop nglyphs)
|
||||
too-long
|
||||
(min (+ stop gap) nglyphs)))))
|
||||
(setq gap (max (/ gap 2) 1)))
|
||||
(cond
|
||||
((not (or found candidate))
|
||||
"")
|
||||
;; We're returning the end, so redo the encoding.
|
||||
(end
|
||||
(encode-coding-string
|
||||
(string-join (nreverse (seq-take glyphs candidate-stop)))
|
||||
coding-system))
|
||||
(t
|
||||
(or found candidate))))
|
||||
;; Char-based version.
|
||||
(cond
|
||||
((<= (length string) length) string)
|
||||
(end (substring string (- (length string) length)))
|
||||
|
@ -607,21 +607,36 @@
|
||||
(should (equal (string-limit "foó" 4 nil 'utf-8) "fo\303\263"))
|
||||
(should (equal (string-limit "foóa" 4 nil 'utf-8) "fo\303\263"))
|
||||
(should (equal (string-limit "foóá" 4 nil 'utf-8) "fo\303\263"))
|
||||
(should (equal (string-limit "foóá" 2 nil 'utf-8-with-signature)
|
||||
""))
|
||||
(should (equal (string-limit "foóá" 4 nil 'utf-8-with-signature)
|
||||
"fo\303\263"))
|
||||
"\357\273\277f"))
|
||||
(should (equal (string-limit "foóa" 4 nil 'iso-8859-1) "fo\363a"))
|
||||
(should (equal (string-limit "foóá" 4 nil 'iso-8859-1) "fo\363\341"))
|
||||
(should (equal (string-limit "foóá" 4 nil 'utf-16) "\000f\000o"))
|
||||
(should (equal (string-limit "foóá" 3 nil 'utf-16) ""))
|
||||
(should (equal (string-limit "foóá" 6 nil 'utf-16) "\376\377\000f\000o"))
|
||||
|
||||
(should (equal (string-limit "foó" 10 t 'utf-8) "fo\303\263"))
|
||||
(should (equal (string-limit "foó" 3 t 'utf-8) "o\303\263"))
|
||||
(should (equal (string-limit "foó" 4 t 'utf-8) "fo\303\263"))
|
||||
(should (equal (string-limit "foóa" 4 t 'utf-8) "o\303\263a"))
|
||||
(should (equal (string-limit "foóá" 4 t 'utf-8) "\303\263\303\241"))
|
||||
(should (equal (string-limit "foóá" 2 t 'utf-8-with-signature) "\303\241"))
|
||||
(should (equal (string-limit "foóá" 2 t 'utf-8-with-signature)
|
||||
""))
|
||||
(should (equal (string-limit "foóa" 4 t 'iso-8859-1) "fo\363a"))
|
||||
(should (equal (string-limit "foóá" 4 t 'iso-8859-1) "fo\363\341"))
|
||||
(should (equal (string-limit "foóá" 4 t 'utf-16) "\000\363\000\341")))
|
||||
(should (equal (string-limit "foóá" 6 t 'utf-16) "\376\377\000\363\000\341")))
|
||||
|
||||
(ert-deftest subr-string-limit-glyphs ()
|
||||
(should (equal (encode-coding-string "Hello, 👼🏻🧑🏼🤝🧑🏻" 'utf-8)
|
||||
"Hello, \360\237\221\274\360\237\217\273\360\237\247\221\360\237\217\274\342\200\215\360\237\244\235\342\200\215\360\237\247\221\360\237\217\273"))
|
||||
(should (= (length (encode-coding-string "Hello, 👼🏻🧑🏼🤝🧑🏻" 'utf-8)) 41))
|
||||
(should (equal (string-limit "Hello, 👼🏻🧑🏼🤝🧑🏻" 100 nil 'utf-8)
|
||||
"Hello, \360\237\221\274\360\237\217\273\360\237\247\221\360\237\217\274\342\200\215\360\237\244\235\342\200\215\360\237\247\221\360\237\217\273"))
|
||||
(should (equal (string-limit "Hello, 👼🏻🧑🏼🤝🧑🏻" 15 nil 'utf-8)
|
||||
"Hello, \360\237\221\274\360\237\217\273"))
|
||||
(should (equal (string-limit "Hello, 👼🏻🧑🏼🤝🧑🏻" 10 nil 'utf-8)
|
||||
"Hello, ")))
|
||||
|
||||
(ert-deftest subr-string-lines ()
|
||||
(should (equal (string-lines "foo") '("foo")))
|
||||
|
Loading…
Reference in New Issue
Block a user