1
0
mirror of https://git.savannah.gnu.org/git/emacs.git synced 2025-01-04 11:40:22 +00:00
emacs/test/lisp/char-fold-tests.el
Michal Nazarewicz b3b9b258c4 Support casing characters which map into multiple code points (bug#24603)
Implement unconditional special casing rules defined in Unicode standard.

Among other things, they deal with cases when a single code point is
replaced by multiple ones because a single character does not exist (e.g.
the ‘ﬁ’ ligature turning into ‘FI’) or is not commonly used (e.g. ß turning
into SS).

* admin/unidata/SpecialCasing.txt: New data file pulled from Unicode
standard distribution.
* admin/unidata/README: Mention SpecialCasing.txt.

* admin/unidata/unidata-gen.el (unidata-gen-table-special-casing,
unidata-gen-table-special-casing--do-load): New functions generating
‘special-uppercase’, ‘special-lowercase’ and ‘special-titlecase’
character Unicode properties built from the SpecialCasing.txt Unicode
data file.

* src/casefiddle.c (struct casing_str_buf): New structure for
representing short strings used to handle one-to-many character
mappings.

(case_character_impl): New function which can handle one-to-many
character mappings.
(case_character, case_single_character): Wrappers for the above
functions.  The former may map one character to multiple (or no)
code points while the latter does what the former used to do (i.e.
handles one-to-one mappings only).

(do_casify_natnum, do_casify_unibyte_string,
do_casify_unibyte_region): Use case_single_character.
(do_casify_multibyte_string, do_casify_multibyte_region): Support new
features of case_character.
(do_casify_region): Updated to reflect do_casify_multibyte_string
changes.

(casify_word): Handle the situation where the character length of a word
can change, affecting where the end of the word is.

(upcase, capitalize, upcase-initials): Update documentation to mention
limitations when working on characters.

* test/src/casefiddle-tests.el (casefiddle-tests-char-properties):
Add test cases for the newly introduced character properties.
(casefiddle-tests-casing): Update test cases which are now passing.

* test/lisp/char-fold-tests.el (char-fold--ascii-upcase,
char-fold--ascii-downcase): New functions which behave like old ‘upcase’
and ‘downcase’.
(char-fold--test-match-exactly): Use the new functions.  This is needed
because otherwise ﬁ and similar characters are turned into their
multi-character representation.

* doc/lispref/strings.texi: Describe issue with casing characters versus
strings.
* doc/lispref/nonascii.texi: Describe the new character properties.
2017-04-06 20:54:58 +02:00

133 lines
5.5 KiB
EmacsLisp
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

;;; char-fold-tests.el --- Tests for char-fold.el -*- lexical-binding: t; -*-
;; Copyright (C) 2013-2017 Free Software Foundation, Inc.
;; Author: Artur Malabarba <bruce.connor.am@gmail.com>
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;; Code:
(require 'ert)
(require 'char-fold)
(defun char-fold--random-word (n)
  "Return a string made of N random characters.
Each character code is computed as (+ 9 (random 117)), so it lies
between 9 and 125 inclusive."
  (let ((codes (mapcar (lambda (_slot) (+ 9 (random 117)))
                       (make-list n nil))))
    ;; Turn the list of character codes into a single string.
    (apply #'string codes)))
(defun char-fold--test-search-with-contents (contents string)
  "Assert that char-folding searches for STRING succeed in CONTENTS.
CONTENTS is inserted into a temporary buffer, which is then searched
three ways: with `search-forward-regexp' on the regexp returned by
`char-fold-to-regexp', with `char-fold-search-forward', and with
`char-fold-search-backward' starting from wherever the forward
search left point."
  (with-temp-buffer
    (insert contents)
    ;; Plain regexp search using the folded pattern.
    (goto-char (point-min))
    (let ((folded-re (char-fold-to-regexp string)))
      (should (search-forward-regexp folded-re nil 'noerror)))
    ;; Dedicated forward search, restarted from the top of the buffer ...
    (goto-char (point-min))
    (should (char-fold-search-forward string nil 'noerror))
    ;; ... then backward from where the forward search stopped.
    (should (char-fold-search-backward string nil 'noerror))))
(ert-deftest char-fold--test-consistency ()
  "Random words of length 0 through 29 must be found by their own fold."
  (dotimes (len 30)
    (let ((word (char-fold--random-word len)))
      ;; A folded string should always match the original string.
      (char-fold--test-search-with-contents word word))))
(ert-deftest char-fold--test-lax-whitespace ()
  "Search for space-separated words in text joined by mixed whitespace.
Runs with `search-spaces-regexp' bound to \"\\\\s-+\"."
  (dotimes (len 40)
    (let* ((head (char-fold--random-word len))
           (tail (char-fold--random-word len))
           ;; Both searches below run against the same buffer contents.
           (contents (concat head "\s\n\s\t\f\t\n\r\t" tail))
           (search-spaces-regexp "\\s-+"))
      ;; Search string with a single space between the words.
      (char-fold--test-search-with-contents
       contents
       (concat head " " tail))
      ;; Search string with a run of ten spaces between the words.
      (char-fold--test-search-with-contents
       contents
       (concat head (make-string 10 ?\s) tail)))))
(defun char-fold--ascii-upcase (string)
  "Like `upcase' but acts on ASCII characters only.
Return a copy of STRING in which every run of the letters a-z has
been upcased; all other characters are left untouched.  The result
does not depend on the caller's value of `case-fold-search'."
  ;; Bind `case-fold-search' to nil so the range matches exactly a-z
  ;; whatever the caller has bound it to; under case folding the range
  ;; could also match characters that merely fold to an ASCII letter.
  (let ((case-fold-search nil))
    ;; FIXEDCASE and LITERAL are t so the upcased text is inserted
    ;; verbatim, without `replace-match' adjusting its case.
    (replace-regexp-in-string "[a-z]+" #'upcase string t t)))
(defun char-fold--ascii-downcase (string)
  "Like `downcase' but acts on ASCII characters only.
Return a copy of STRING in which every run of the letters A-Z has
been downcased; all other characters are left untouched.  The result
does not depend on the caller's value of `case-fold-search'."
  ;; Match the uppercase range explicitly, with case folding off, so
  ;; that exactly A-Z is converted whatever `case-fold-search' is.
  (let ((case-fold-search nil))
    ;; FIXEDCASE must be t: otherwise `replace-match' sees that the
    ;; replaced text was capitalized and converts the downcased
    ;; replacement back to upper case.
    (replace-regexp-in-string "[A-Z]+" #'downcase string t t)))
(defun char-fold--test-match-exactly (string &rest strings-to-match)
  "Assert that folding STRING gives a regexp matching STRINGS-TO-MATCH.
The regexp returned by `char-fold-to-regexp' for STRING is anchored
at both ends and matched against every element of STRINGS-TO-MATCH.
The check is then repeated with `case-fold-search' bound to t:
the ASCII-upcased regexp against each downcased element, and the
ASCII-downcased regexp against each upcased element."
  (let ((anchored (concat "\\`" (char-fold-to-regexp string) "\\'")))
    (dolist (candidate strings-to-match)
      (should (string-match anchored candidate)))
    ;; Case folding
    (let* ((case-fold-search t)
           ;; The re-cased regexps are the same for every candidate, so
           ;; build them once, inside the `case-fold-search' binding.
           (upper-re (char-fold--ascii-upcase anchored))
           (lower-re (char-fold--ascii-downcase anchored)))
      (dolist (candidate strings-to-match)
        (should (string-match upper-re (downcase candidate)))
        (should (string-match lower-re (upcase candidate)))))))
(ert-deftest char-fold--test-some-defaults ()
  "Check a few equivalences using the default folding data.
Each pair is (SEARCH-STRING . CONTENTS).  The pair is first checked
with the default `char-fold-table', then with a fresh table that
keeps only the default table's extra slot 0."
  ;; The non-ASCII characters are spelled as \u escapes so they cannot
  ;; be lost or normalized by tools handling this file:
  ;;   U+FB04, U+FB03, U+FB01, U+FB00 -- the ffl, ffi, fi and ff ligatures;
  ;;   "a" followed by U+0308 (combining diaeresis) versus U+00E4.
  (dolist (it '(("ffl" . "\ufb04") ("ffi" . "\ufb03")
                ("fi" . "\ufb01") ("ff" . "\ufb00")
                ("a\u0308" . "\u00e4")))
    (char-fold--test-search-with-contents (cdr it) (car it))
    ;; `multi' is read from the default table before `char-fold-table'
    ;; is rebound to an otherwise empty table (parallel `let').
    (let ((multi (char-table-extra-slot char-fold-table 0))
          (char-fold-table (make-char-table 'char-fold-table)))
      (set-char-table-extra-slot char-fold-table 0 multi)
      (char-fold--test-match-exactly (car it) (cdr it)))))
(ert-deftest char-fold--test-fold-to-regexp ()
  "Check regexps built from hand-made fold tables.
`char-fold-table' is rebound to a fresh char-table whose extra slot 0
holds a second fresh char-table, and entries are added to both as
the test proceeds."
  (let* ((multi-table (make-char-table 'char-fold-table))
         (char-fold-table (make-char-table 'char-fold-table)))
    (set-char-table-extra-slot char-fold-table 0 multi-table)
    ;; Entries stored directly in `char-fold-table'.
    (dolist (entry '((?a . "xx")
                     (?1 . "44")
                     (?\s . "-!-")))
      (aset char-fold-table (car entry) (cdr entry)))
    (char-fold--test-match-exactly "a1a1" "xx44xx44")
    (char-fold--test-match-exactly "a1 a 1" "xx44-!--!-xx-!-44")
    ;; Entries for `a' in the extra-slot table.
    (aset multi-table ?a '(("1" . "99")
                           ("2" . "88")
                           ("12" . "77")))
    (char-fold--test-match-exactly "a" "xx")
    (char-fold--test-match-exactly "a1" "xx44" "99")
    (char-fold--test-match-exactly "a12" "77" "xx442" "992")
    (char-fold--test-match-exactly "a2" "88")
    ;; Add an entry for `1' as well and repeat two of the checks.
    (aset multi-table ?1 '(("2" . "yy")))
    (char-fold--test-match-exactly "a1" "xx44" "99")
    (char-fold--test-match-exactly "a12" "77" "xx442" "992")
    ;; Support for this case is disabled.  See function definition or:
    ;; https://lists.gnu.org/archive/html/emacs-devel/2015-11/msg02562.html
    ;; (char-fold--test-match-exactly "a12" "xxyy")
    ))
(ert-deftest char-fold--speed-test ()
  "A failing search for an almost-matching string must take under a second."
  (dolist (sample (append '("tty-set-up-initial-frame-face"
                            "tty-set-up-initial-frame-face-frame-faceframe-faceframe-faceframe-face")
                          (mapcar #'char-fold--random-word '(10 50 100
                                                             50 100))))
    (message "Testing %s" sample)
    ;; Make sure we didn't just fallback on the trivial search.
    (should-not (string= (regexp-quote sample)
                         (char-fold-to-regexp sample)))
    (with-temp-buffer
      ;; Insert the sample but leave point at the start of the buffer.
      (save-excursion (insert sample))
      (let ((started (float-time)))
        ;; Our initial implementation of case-folding in char-folding
        ;; created a lot of redundant paths in the regexp.  Because of
        ;; that, if a really long string "almost" matches, the regexp
        ;; engine took a long time to realize that it doesn't match.
        (should-not (char-fold-search-forward (concat sample "c") nil 'noerror))
        ;; Ensure it took less than a second.
        (let ((elapsed (- (float-time) started)))
          (should (< elapsed 1)))))))
(provide 'char-fold-tests)
;;; char-fold-tests.el ends here