1
0
mirror of https://git.savannah.gnu.org/git/emacs.git synced 2025-02-02 20:16:25 +00:00

Ensure ucs-names is consistent with Unicode names

* lisp/international/mule-cmds.el (ucs-names): Skip adding an old-name
if it conflicts with the offical name of a codepoint.  Adjust the
ranges iterated over to account for new Unicode codepoints.
* test/lisp/international/mule-tests.el
(mule-cmds-tests--ucs-names-old-name-override,
mule-cmds-tests--ucs-names-missing-names): New tests for checking
'ucs-names' consistency.

Bug#65997
This commit is contained in:
Robert Pluim 2023-09-18 10:41:01 +02:00
parent 93134bb9c2
commit 6bc3800000
2 changed files with 42 additions and 8 deletions

View File

@ -3094,6 +3094,10 @@ on encoding."
(defun ucs-names ()
"Return table of CHAR-NAME keys and CHAR-CODE values cached in `ucs-names'."
(or ucs-names
;; Sometimes these ranges will need adjusting as codepoints are
;; added to unicode. The test case
;; 'mule-cmds-tests--ucs-names-missing-names' will tell you
;; which are missing (Bug#65997).
(let ((ranges
'((#x0000 . #x33FF)
;; (#x3400 . #x4DBF) CJK Ideographs Extension A
@ -3106,14 +3110,16 @@ on encoding."
(#x14400 . #x14646)
;; (#x14647 . #x167FF) unused
(#x16800 . #x16F9F)
(#x16FE0 . #x16FE3)
(#x16FE0 . #x16FF1)
;; (#x17000 . #x187FF) Tangut Ideographs
;; (#x18800 . #x18AFF) Tangut Components
;; (#x18B00 . #x18CFF) Khitan Small Script
;; (#x18D00 . #x18D0F) Tangut Ideograph Supplement
;; (#x18D10 . #x1AFEF) unused
(#x1AFF0 . #x1B12F)
;; (#x1B130 . #x1B14F) unused
(#x1AFF0 . #x1B122)
;; (#x1B123 . #x1B131) unused
(#x1B132 . #x1B132)
;; (#x1B133 . #x1B14F) unused
(#x1B150 . #x1B16F)
(#x1B170 . #x1B2FF)
;; (#x1B300 . #x1BBFF) unused
@ -3130,12 +3136,16 @@ on encoding."
(while (<= c end)
(let ((new-name (get-char-code-property c 'name))
(old-name (get-char-code-property c 'old-name)))
;; In theory this code could end up pushing an "old-name" that
;; shadows a "new-name" but in practice every time an
;; `old-name' conflicts with a `new-name', the newer one has a
;; higher code, so it gets pushed later!
;; This code used to push both old-name and new-name
;; on the assumption that the new-name codepoint would
;; always be higher, which was true for a long time.
;; As of at latest 2023-09-15, this is no longer true,
;; so we now skip the old-name if it conflicts with an
;; existing new-name (Bug#65997).
(if new-name (puthash new-name c names))
(if old-name (puthash old-name c names))
(when (and old-name
(not (gethash old-name names)))
(puthash old-name c names))
;; Unicode uses the spelling "lamda" in character
;; names, instead of "lambda", due to "preferences
;; expressed by the Greek National Body" (Bug#30513).

View File

@ -49,6 +49,30 @@
(kbd "C-x RET c u t f - 8 RET C-u C-u c a b RET")
(read-string "prompt:"))))))
;;Bug#65997, ensure that old-names haven't overriden new names.
(ert-deftest mule-cmds-tests--ucs-names-old-name-override ()
(let (code-points)
(dotimes (u (1+ (max-char 'ucs)))
(when-let* ((name (get-char-code-property u 'name))
(c (char-from-name name)))
(when (and (not (<= #xD800 u #xDFFF))
(not (= c u)))
(push (format "%X" u) code-points))))
(setq code-points (nreverse code-points))
(should (null code-points))))
;; Bug#65997, ensure that all codepoints with names are in '(ucs-names)'.
(ert-deftest mule-cmds-tests--ucs-names-missing-names ()
(let (code-points)
(dotimes (u (1+ (max-char 'ucs)))
(when-let ((name (get-char-code-property u 'name)))
(when (and (not (<= #xD800 u #xDFFF))
(not (<= #x18800 u #x18AFF))
(not (char-from-name name)))
(push (format "%X" u) code-points))))
(setq code-points (nreverse code-points))
(should (null code-points))))
(ert-deftest mule-utf-7 ()
;; utf-7 and utf-7-imap are not ASCII-compatible.
(should-not (coding-system-get 'utf-7 :ascii-compatible-p))