1
0
mirror of https://git.savannah.gnu.org/git/emacs.git synced 2025-01-13 16:38:14 +00:00
emacs/lisp/char-fold.el

243 lines
11 KiB
EmacsLisp
Raw Normal View History

;;; char-fold.el --- match unicode to similar ASCII -*- lexical-binding: t; -*-
;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
;; Maintainer: emacs-devel@gnu.org
;; Keywords: matching
;; This file is part of GNU Emacs.
;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
;;; Code:
(eval-and-compile (put 'char-fold-table 'char-table-extra-slots 1))
(defconst char-fold-table
(eval-when-compile
(let ((equiv (make-char-table 'char-fold-table))
(equiv-multi (make-char-table 'char-fold-table))
(table (unicode-property-table-internal 'decomposition)))
(set-char-table-extra-slot equiv 0 equiv-multi)
;; Ensure the table is populated.
(let ((func (char-table-extra-slot table 1)))
(map-char-table (lambda (char v)
(when (consp char)
(funcall func (car char) v table)))
table))
;; Compile a list of all complex characters that each simple
;; character should match.
;; In summary this loop does 3 things:
;; - A complex character might be allowed to match its decomp.
;; - The decomp is allowed to match the complex character.
;; - A single char of the decomp might be allowed to match the
;; character.
;; Some examples in the comments below.
(map-char-table
(lambda (char decomp)
(when (consp decomp)
2015-07-01 02:45:56 +00:00
;; Skip trivial cases like ?a decomposing to (?a).
(unless (and (not (cdr decomp))
(eq char (car decomp)))
(if (symbolp (car decomp))
;; Discard a possible formatting tag.
(setq decomp (cdr decomp))
;; If there's no formatting tag, ensure that char matches
;; its decomp exactly. This is because we want 'ä' to
;; match 'ä', but we don't want '¹' to match '1'.
(aset equiv char
(cons (apply #'string decomp)
(aref equiv char))))
;; Allow the entire decomp to match char. If decomp has
;; multiple characters, this is done by adding an entry
;; to the alist of the first character in decomp. This
;; allows 'ff' to match 'ff', 'ä' to match 'ä', and '1' to
;; match '¹'.
(let ((make-decomp-match-char
(lambda (decomp char)
(if (cdr decomp)
(aset equiv-multi (car decomp)
(cons (cons (apply #'string (cdr decomp))
(regexp-quote (string char)))
(aref equiv-multi (car decomp))))
(aset equiv (car decomp)
(cons (char-to-string char)
(aref equiv (car decomp))))))))
(funcall make-decomp-match-char decomp char)
;; Do it again, without the non-spacing characters.
;; This allows 'a' to match 'ä'.
(let ((simpler-decomp nil)
(found-one nil))
(dolist (c decomp)
(if (> (get-char-code-property c 'canonical-combining-class) 0)
(setq found-one t)
(push c simpler-decomp)))
(when (and simpler-decomp found-one)
(funcall make-decomp-match-char simpler-decomp char)
;; Finally, if the decomp only had one spacing
;; character, we allow this character to match the
;; decomp. This is to let 'a' match 'ä'.
(unless (cdr simpler-decomp)
(aset equiv (car simpler-decomp)
(cons (apply #'string decomp)
(aref equiv (car simpler-decomp)))))))))))
table)
;; Add some manual entries.
(dolist (it '((?\" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "🙷" "🙶" "🙸" "«" "»")
(?' "" "" "" "" "" "" "" "" "󠀢" "" "" "" "")
(?` "" "" "" "󠀢" "" "")))
(let ((idx (car it))
(chars (cdr it)))
(aset equiv idx (append chars (aref equiv idx)))))
;; Convert the lists of characters we compiled into regexps.
(map-char-table
(lambda (char dec-list)
(let ((re (regexp-opt (cons (char-to-string char) dec-list))))
(if (consp char)
(set-char-table-range equiv char re)
(aset equiv char re))))
equiv)
equiv))
"Used for folding characters of the same group during search.
This is a char-table with the `char-fold-table' subtype.
Let us refer to the character in question by char-x.
Each entry is either nil (meaning char-x only matches literally)
or a regexp. This regexp should match anything that char-x can
match by itself \(including char-x). For instance, the default
regexp for the ?+ character is \"[+]\".
This table also has one extra slot which is also a char-table.
Each entry in the extra slot is an alist used for multi-character
matching (which may be nil). The elements of the alist should
have the form (SUFFIX . OTHER-REGEXP). If the characters after
char-x are equal to SUFFIX, then this combination of char-x +
SUFFIX is allowed to match OTHER-REGEXP. This is in addition to
char-x being allowed to match REGEXP.
For instance, the default alist for ?f includes:
\((\"fl\" . \"ffl\") (\"fi\" . \"ffi\")
(\"i\" . \"fi\") (\"f\" . \"ff\"))
Exceptionally for the space character (32), ALIST is ignored.")
(defun char-fold--make-space-string (n)
"Return a string that matches N spaces."
(format "\\(?:%s\\|%s\\)"
(make-string n ?\s)
(apply #'concat
(make-list n (or (aref char-fold-table ?\s) " ")))))
;;;###autoload
(defun char-fold-to-regexp (string &optional _lax from)
"Return a regexp matching anything that char-folds into STRING.
Any character in STRING that has an entry in
`char-fold-table' is replaced with that entry (which is a
regexp) and other characters are `regexp-quote'd.
If the resulting regexp would be too long for Emacs to handle,
just return the result of calling `regexp-quote' on STRING.
FROM is for internal use. It specifies an index in the STRING
from which to start."
(let* ((spaces 0)
(multi-char-table (char-table-extra-slot char-fold-table 0))
(i (or from 0))
(end (length string))
(out nil))
;; When the user types a space, we want to match the table entry
;; for ?\s, which is generally a regexp like "[ ...]". However,
;; the `search-spaces-regexp' variable doesn't "see" spaces inside
;; these regexp constructs, so we need to use "\\( \\|[ ...]\\)"
;; instead (to manually expose a space). Furthermore, the lax
;; search engine acts on a bunch of spaces, not on individual
;; spaces, so if the string contains sequential spaces like " ", we
;; need to keep them grouped together like this: "\\( \\|[ ...][ ...]\\)".
(while (< i end)
(pcase (aref string i)
(`?\s (setq spaces (1+ spaces)))
(c (when (> spaces 0)
(push (char-fold--make-space-string spaces) out)
(setq spaces 0))
(let ((regexp (or (aref char-fold-table c)
(regexp-quote (string c))))
;; Long string. The regexp would probably be too long.
(alist (unless (> end 50)
(aref multi-char-table c))))
(push (let ((matched-entries nil)
(max-length 0))
(dolist (entry alist)
(let* ((suffix (car entry))
(len-suf (length suffix)))
(when (eq (compare-strings suffix 0 nil
string (1+ i) (+ i 1 len-suf)
nil)
t)
(push (cons len-suf (cdr entry)) matched-entries)
(setq max-length (max max-length len-suf)))))
;; If no suffixes matched, just go on.
(if (not matched-entries)
regexp
;;; If N suffixes match, we "branch" out into N+1 executions for the
;;; length of the longest match. This means "fix" will match "fix" but
;;; not "fⅸ", but it's necessary to keep the regexp size from scaling
;;; exponentially. See https://lists.gnu.org/archive/html/emacs-devel/2015-11/msg02562.html
(let ((subs (substring string (1+ i) (+ i 1 max-length))))
;; `i' is still going to inc by 1 below.
(setq i (+ i max-length))
(concat
"\\(?:"
(mapconcat (lambda (entry)
(let ((length (car entry))
(suffix-regexp (cdr entry)))
(concat suffix-regexp
(char-fold-to-regexp subs nil length))))
`((0 . ,regexp) . ,matched-entries) "\\|")
"\\)"))))
out))))
(setq i (1+ i)))
(when (> spaces 0)
(push (char-fold--make-space-string spaces) out))
(let ((regexp (apply #'concat (nreverse out))))
;; Limited by `MAX_BUF_SIZE' in `regex.c'.
(if (> (length regexp) 5000)
(regexp-quote string)
regexp))))
;;; Commands provided for completeness.
(defun char-fold-search-forward (string &optional bound noerror count)
"Search forward for a char-folded version of STRING.
STRING is converted to a regexp with `char-fold-to-regexp',
which is searched for with `re-search-forward'.
BOUND NOERROR COUNT are passed to `re-search-forward'."
(interactive "sSearch: ")
(re-search-forward (char-fold-to-regexp string) bound noerror count))
(defun char-fold-search-backward (string &optional bound noerror count)
"Search backward for a char-folded version of STRING.
STRING is converted to a regexp with `char-fold-to-regexp',
which is searched for with `re-search-backward'.
BOUND NOERROR COUNT are passed to `re-search-backward'."
(interactive "sSearch: ")
(re-search-backward (char-fold-to-regexp string) bound noerror count))
(provide 'char-fold)
;;; char-fold.el ends here