mirror of
https://git.savannah.gnu.org/git/emacs.git
synced 2024-11-27 07:37:33 +00:00
1500 lines
59 KiB
EmacsLisp
1500 lines
59 KiB
EmacsLisp
;;; rx.el --- S-exp notation for regexps --*- lexical-binding: t -*-
|
|
|
|
;; Copyright (C) 2001-2023 Free Software Foundation, Inc.
|
|
|
|
;; This file is part of GNU Emacs.
|
|
|
|
;; GNU Emacs is free software: you can redistribute it and/or modify
|
|
;; it under the terms of the GNU General Public License as published by
|
|
;; the Free Software Foundation, either version 3 of the License, or
|
|
;; (at your option) any later version.
|
|
|
|
;; GNU Emacs is distributed in the hope that it will be useful,
|
|
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
;; GNU General Public License for more details.
|
|
|
|
;; You should have received a copy of the GNU General Public License
|
|
;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
;;; Commentary:
|
|
|
|
;; This facility allows writing regexps in a sexp-based language
|
|
;; instead of strings. Regexps in the `rx' notation are easier to
|
|
;; read, write and maintain; they can be indented and commented in a
|
|
;; natural way, and are easily composed by program code.
|
|
;; The translation to string regexp is done by a macro and does not
|
|
;; incur any extra processing during run time. Example:
|
|
;;
|
|
;; (rx bos (or (not (any "^"))
|
|
;; (seq "^" (or " *" "["))))
|
|
;;
|
|
;; => "\\`\\(?:[^^]\\|\\^\\(?: \\*\\|\\[\\)\\)"
|
|
;;
|
|
;; The notation is much influenced by and retains some compatibility with
|
|
;; Olin Shivers's SRE, with concessions to Emacs regexp peculiarities,
|
|
;; and the older Emacs package Sregex.
|
|
|
|
;;; Code:
|
|
|
|
;; The `rx--translate...' functions below return (REGEXP . PRECEDENCE),
|
|
;; where REGEXP is a list of string expressions that will be
|
|
;; concatenated into a regexp, and PRECEDENCE is one of
|
|
;;
|
|
;; t -- can be used as argument to postfix operators (eg. "a")
|
|
;; seq -- can be concatenated in sequence with other seq or higher (eg. "ab")
|
|
;; lseq -- can be concatenated to the left of rseq or higher (eg. "^a")
|
|
;; rseq -- can be concatenated to the right of lseq or higher (eg. "a$")
|
|
;; nil -- can only be used in alternatives (eg. "a\\|b")
|
|
;;
|
|
;; They form a lattice:
|
|
;;
|
|
;; t highest precedence
|
|
;; |
|
|
;; seq
|
|
;; / \
|
|
;; lseq rseq
|
|
;; \ /
|
|
;; nil lowest precedence
|
|
|
|
|
|
(defconst rx--char-classes
|
|
'((digit . digit)
|
|
(numeric . digit)
|
|
(num . digit)
|
|
(control . cntrl)
|
|
(cntrl . cntrl)
|
|
(hex-digit . xdigit)
|
|
(hex . xdigit)
|
|
(xdigit . xdigit)
|
|
(blank . blank)
|
|
(graphic . graph)
|
|
(graph . graph)
|
|
(printing . print)
|
|
(print . print)
|
|
(alphanumeric . alnum)
|
|
(alnum . alnum)
|
|
(letter . alpha)
|
|
(alphabetic . alpha)
|
|
(alpha . alpha)
|
|
(ascii . ascii)
|
|
(nonascii . nonascii)
|
|
(lower . lower)
|
|
(lower-case . lower)
|
|
(punctuation . punct)
|
|
(punct . punct)
|
|
(space . space)
|
|
(whitespace . space)
|
|
(white . space)
|
|
(upper . upper)
|
|
(upper-case . upper)
|
|
(word . word)
|
|
(wordchar . word)
|
|
(unibyte . unibyte)
|
|
(multibyte . multibyte))
|
|
"Alist mapping rx symbols to character classes.
|
|
Most of the names are from SRE.")
|
|
|
|
(defvar rx-constituents nil
|
|
"Alist of old-style rx extensions, for compatibility.
|
|
For new code, use `rx-define', `rx-let' or `rx-let-eval'.
|
|
|
|
Each element is (SYMBOL . DEF).
|
|
|
|
If DEF is a symbol, then SYMBOL is an alias of DEF.
|
|
|
|
If DEF is a string, then SYMBOL is a plain rx symbol defined as the
|
|
regexp string DEF.
|
|
|
|
If DEF is a list on the form (FUN MIN-ARGS MAX-ARGS PRED), then
|
|
SYMBOL is an rx form with at least MIN-ARGS and at most
|
|
MAX-ARGS arguments. If MAX-ARGS is nil, then there is no upper limit.
|
|
FUN is a function taking the entire rx form as single argument
|
|
and returning the translated regexp string.
|
|
If PRED is non-nil, it is a predicate that all actual arguments must
|
|
satisfy.")
|
|
|
|
(defvar rx--local-definitions nil
|
|
"Alist of dynamic local rx definitions.
|
|
Each entry is:
|
|
(NAME DEF) -- NAME is an rx symbol defined as the rx form DEF.
|
|
(NAME ARGS DEF) -- NAME is an rx form with arglist ARGS, defined
|
|
as the rx form DEF (which can contain members of ARGS).")
|
|
|
|
(defsubst rx--lookup-def (name)
|
|
"Current definition of NAME: (DEF) or (ARGS DEF), or nil if none."
|
|
(or (cdr (assq name rx--local-definitions))
|
|
(get name 'rx-definition)))
|
|
|
|
(defun rx--expand-def (form)
|
|
"FORM expanded (once) if a user-defined construct; otherwise nil."
|
|
(cond ((symbolp form)
|
|
(let ((def (rx--lookup-def form)))
|
|
(and def
|
|
(if (cdr def)
|
|
(error "Not an `rx' symbol definition: %s" form)
|
|
(car def)))))
|
|
((and (consp form) (symbolp (car form)))
|
|
(let* ((op (car form))
|
|
(def (rx--lookup-def op)))
|
|
(and def
|
|
(if (cdr def)
|
|
(rx--expand-template
|
|
op (cdr form) (nth 0 def) (nth 1 def))
|
|
(error "Not an `rx' form definition: %s" op)))))))
|
|
|
|
;; TODO: Additions to consider:
|
|
;; - A construct like `or' but without the match order guarantee,
|
|
;; maybe `unordered-or'. Useful for composition or generation of
|
|
;; alternatives; permits more effective use of regexp-opt.
|
|
|
|
(defun rx--translate-symbol (sym)
|
|
"Translate an rx symbol. Return (REGEXP . PRECEDENCE)."
|
|
(pcase sym
|
|
;; Use `list' instead of a quoted list to wrap the strings here,
|
|
;; since the return value may be mutated.
|
|
((or 'nonl 'not-newline 'any) (cons (list ".") t))
|
|
((or 'anychar 'anything) (cons (list "[^z-a]") t))
|
|
('unmatchable (rx--empty))
|
|
((or 'bol 'line-start) (cons (list "^") 'lseq))
|
|
((or 'eol 'line-end) (cons (list "$") 'rseq))
|
|
((or 'bos 'string-start 'bot 'buffer-start) (cons (list "\\`") t))
|
|
((or 'eos 'string-end 'eot 'buffer-end) (cons (list "\\'") t))
|
|
('point (cons (list "\\=") t))
|
|
((or 'bow 'word-start) (cons (list "\\<") t))
|
|
((or 'eow 'word-end) (cons (list "\\>") t))
|
|
('word-boundary (cons (list "\\b") t))
|
|
('not-word-boundary (cons (list "\\B") t))
|
|
('symbol-start (cons (list "\\_<") t))
|
|
('symbol-end (cons (list "\\_>") t))
|
|
('not-wordchar (cons (list "\\W") t))
|
|
(_
|
|
(cond
|
|
((let ((class (cdr (assq sym rx--char-classes))))
|
|
(and class (cons (list (concat "[[:" (symbol-name class) ":]]")) t))))
|
|
|
|
((let ((expanded (rx--expand-def sym)))
|
|
(and expanded (rx--translate expanded))))
|
|
|
|
;; For compatibility with old rx.
|
|
((let ((entry (assq sym rx-constituents)))
|
|
(and (progn
|
|
(while (and entry (not (stringp (cdr entry))))
|
|
(setq entry
|
|
(if (symbolp (cdr entry))
|
|
;; Alias for another entry.
|
|
(assq (cdr entry) rx-constituents)
|
|
;; Wrong type, try further down the list.
|
|
(assq (car entry)
|
|
(cdr (memq entry rx-constituents))))))
|
|
entry)
|
|
(cons (list (cdr entry)) nil))))
|
|
(t (error "Unknown rx symbol `%s'" sym))))))
|
|
|
|
(defun rx--enclose (left-str rexp right-str)
|
|
"Bracket REXP by LEFT-STR and RIGHT-STR."
|
|
(append (list left-str) rexp (list right-str)))
|
|
|
|
(defun rx--bracket (rexp)
|
|
(rx--enclose "\\(?:" rexp "\\)"))
|
|
|
|
(defun rx--sequence (left right)
|
|
"Return the sequence (concatenation) of two translated items,
|
|
each on the form (REGEXP . PRECEDENCE), returning (REGEXP . PRECEDENCE)."
|
|
;; Concatenation rules:
|
|
;; seq ++ seq -> seq
|
|
;; lseq ++ seq -> lseq
|
|
;; seq ++ rseq -> rseq
|
|
;; lseq ++ rseq -> nil
|
|
(cond ((not (car left)) right)
|
|
((not (car right)) left)
|
|
(t
|
|
(let ((l (if (memq (cdr left) '(nil rseq))
|
|
(cons (rx--bracket (car left)) t)
|
|
left))
|
|
(r (if (memq (cdr right) '(nil lseq))
|
|
(cons (rx--bracket (car right)) t)
|
|
right)))
|
|
(cons (append (car l) (car r))
|
|
(if (eq (cdr l) 'lseq)
|
|
(if (eq (cdr r) 'rseq)
|
|
nil ; lseq ++ rseq
|
|
'lseq) ; lseq ++ seq
|
|
(if (eq (cdr r) 'rseq)
|
|
'rseq ; seq ++ rseq
|
|
'seq))))))) ; seq ++ seq
|
|
|
|
(defun rx--translate-seq (body)
|
|
"Translate a sequence of zero or more rx items.
|
|
Return (REGEXP . PRECEDENCE)."
|
|
(if body
|
|
(let* ((items (mapcar #'rx--translate body))
|
|
(result (car items)))
|
|
(dolist (item (cdr items))
|
|
(setq result (rx--sequence result item)))
|
|
result)
|
|
(cons nil 'seq)))
|
|
|
|
(defun rx--empty ()
|
|
"Regexp that never matches anything."
|
|
(cons (list regexp-unmatchable) 'seq))
|
|
|
|
;; `cl-every' replacement to avoid bootstrapping problems.
|
|
(defun rx--every (pred list)
|
|
"Whether PRED is true for every element of LIST."
|
|
(while (and list (funcall pred (car list)))
|
|
(setq list (cdr list)))
|
|
(null list))
|
|
|
|
(defun rx--foldl (f x l)
|
|
"(F (F (F X L0) L1) L2) ...
|
|
Left-fold the list L, starting with X, by the binary function F."
|
|
(while l
|
|
(setq x (funcall f x (car l)))
|
|
(setq l (cdr l)))
|
|
x)
|
|
|
|
(defun rx--normalise-or-arg (form)
|
|
"Normalize the `or' argument FORM.
|
|
Characters become strings, user-definitions and `eval' forms are expanded,
|
|
and `or' forms are normalized recursively."
|
|
(cond ((characterp form)
|
|
(char-to-string form))
|
|
((and (consp form) (memq (car form) '(or |)))
|
|
(cons (car form) (mapcar #'rx--normalise-or-arg (cdr form))))
|
|
((and (consp form) (eq (car form) 'eval))
|
|
(rx--normalise-or-arg (rx--expand-eval (cdr form))))
|
|
(t
|
|
(let ((expanded (rx--expand-def form)))
|
|
(if expanded
|
|
(rx--normalise-or-arg expanded)
|
|
form)))))
|
|
|
|
(defun rx--all-string-or-args (body)
|
|
"If BODY only consists of strings or such `or' forms, return all the strings.
|
|
Otherwise throw `rx--nonstring'."
|
|
(mapcan (lambda (form)
|
|
(cond ((stringp form) (list form))
|
|
((and (consp form) (memq (car form) '(or |)))
|
|
(rx--all-string-or-args (cdr form)))
|
|
(t (throw 'rx--nonstring nil))))
|
|
body))
|
|
|
|
(defun rx--translate-or (body)
|
|
"Translate an or-pattern of zero or more rx items.
|
|
Return (REGEXP . PRECEDENCE)."
|
|
;; FIXME: Possible improvements:
|
|
;;
|
|
;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D)
|
|
;; Then call regexp-opt on runs of string arguments. Example:
|
|
;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
|
|
;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
|
|
;;
|
|
;; - Optimize single-character alternatives better:
|
|
;; * classes: space, alpha, ...
|
|
;; * (syntax S), for some S (whitespace, word)
|
|
;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
|
|
;; -> (any "@" "%" digit "A-Z" space word)
|
|
;; -> "[A-Z@%[:digit:][:space:][:word:]]"
|
|
(cond
|
|
((null body) ; No items: a never-matching regexp.
|
|
(rx--empty))
|
|
((null (cdr body)) ; Single item.
|
|
(rx--translate (car body)))
|
|
(t
|
|
(let* ((args (mapcar #'rx--normalise-or-arg body))
|
|
(all-strings (catch 'rx--nonstring (rx--all-string-or-args args))))
|
|
(cond
|
|
(all-strings ; Only strings.
|
|
(cons (list (regexp-opt all-strings nil))
|
|
t))
|
|
((rx--every #'rx--charset-p args) ; All charsets.
|
|
(rx--translate-union nil args))
|
|
(t
|
|
(cons (append (car (rx--translate (car args)))
|
|
(mapcan (lambda (item)
|
|
(cons "\\|" (car (rx--translate item))))
|
|
(cdr args)))
|
|
nil)))))))
|
|
|
|
(defun rx--charset-p (form)
|
|
"Whether FORM looks like a charset, only consisting of character intervals
|
|
and set operations."
|
|
(or (and (consp form)
|
|
(or (and (memq (car form) '(any in char))
|
|
(rx--every (lambda (x) (not (symbolp x))) (cdr form)))
|
|
(and (memq (car form) '(not or | intersection))
|
|
(rx--every #'rx--charset-p (cdr form)))))
|
|
(characterp form)
|
|
(and (stringp form) (= (length form) 1))
|
|
(and (or (symbolp form) (consp form))
|
|
(let ((expanded (rx--expand-def form)))
|
|
(and expanded
|
|
(rx--charset-p expanded))))))
|
|
|
|
(defun rx--string-to-intervals (str)
|
|
"Decode STR as intervals: A-Z becomes (?A . ?Z), and the single
|
|
character X becomes (?X . ?X). Return the intervals in a list."
|
|
;; We could just do string-to-multibyte on the string and work with
|
|
;; that instead of this `decode-char' workaround.
|
|
(let ((decode-char
|
|
(if (multibyte-string-p str)
|
|
#'identity
|
|
#'unibyte-char-to-multibyte))
|
|
(len (length str))
|
|
(i 0)
|
|
(intervals nil))
|
|
(while (< i len)
|
|
(cond ((and (< i (- len 2))
|
|
(= (aref str (1+ i)) ?-))
|
|
;; Range.
|
|
(let ((start (funcall decode-char (aref str i)))
|
|
(end (funcall decode-char (aref str (+ i 2)))))
|
|
(cond ((and (<= start #x7f) (>= end #x3fff80))
|
|
;; Ranges between ASCII and raw bytes are split to
|
|
;; avoid having them absorb Unicode characters
|
|
;; caught in-between.
|
|
(push (cons start #x7f) intervals)
|
|
(push (cons #x3fff80 end) intervals))
|
|
((<= start end)
|
|
(push (cons start end) intervals))
|
|
(t
|
|
(error "Invalid rx `any' range: %s"
|
|
(substring str i (+ i 3)))))
|
|
(setq i (+ i 3))))
|
|
(t
|
|
;; Single character.
|
|
(let ((char (funcall decode-char (aref str i))))
|
|
(push (cons char char) intervals))
|
|
(setq i (+ i 1)))))
|
|
intervals))
|
|
|
|
(defun rx--condense-intervals (intervals)
|
|
"Merge adjacent and overlapping intervals by mutation, preserving the order.
|
|
INTERVALS is a list of (START . END) with START ≤ END, sorted by START."
|
|
(let ((tail intervals)
|
|
d)
|
|
(while (setq d (cdr tail))
|
|
(if (>= (cdar tail) (1- (caar d)))
|
|
(progn
|
|
(setcdr (car tail) (max (cdar tail) (cdar d)))
|
|
(setcdr tail (cdr d)))
|
|
(setq tail d)))
|
|
intervals))
|
|
|
|
(defun rx--parse-any (body)
|
|
"Parse arguments of an (any ...) construct.
|
|
Return (INTERVALS . CLASSES), where INTERVALS is a sorted list of
|
|
disjoint intervals (each a cons of chars), and CLASSES
|
|
a list of named character classes in the order they occur in BODY."
|
|
(let ((classes nil)
|
|
(strings nil)
|
|
(conses nil))
|
|
;; Collect strings, conses and characters, and classes in separate bins.
|
|
(dolist (arg body)
|
|
(cond ((stringp arg)
|
|
(push arg strings))
|
|
((and (consp arg)
|
|
(characterp (car arg))
|
|
(characterp (cdr arg))
|
|
(<= (car arg) (cdr arg)))
|
|
;; Copy the cons, in case we need to modify it.
|
|
(push (cons (car arg) (cdr arg)) conses))
|
|
((characterp arg)
|
|
(push (cons arg arg) conses))
|
|
((and (symbolp arg)
|
|
(let ((class (cdr (assq arg rx--char-classes))))
|
|
(and class
|
|
(or (memq class classes)
|
|
(progn (push class classes) t))))))
|
|
(t (error "Invalid rx `any' argument: %s" arg))))
|
|
(cons (rx--condense-intervals
|
|
(sort (append conses
|
|
(mapcan #'rx--string-to-intervals strings))
|
|
#'car-less-than-car))
|
|
(reverse classes))))
|
|
|
|
(defun rx--generate-alt (negated intervals classes)
|
|
"Generate a character alternative. Return (REGEXP . PRECEDENCE).
|
|
If NEGATED is non-nil, negate the result; INTERVALS is a sorted
|
|
list of disjoint intervals and CLASSES a list of named character
|
|
classes."
|
|
(let ((items (append intervals classes)))
|
|
;; Move lone ] and range ]-x to the start.
|
|
(let ((rbrac-l (assq ?\] items)))
|
|
(when rbrac-l
|
|
(setq items (cons rbrac-l (delq rbrac-l items)))))
|
|
|
|
;; Split x-] and move the lone ] to the start.
|
|
(let ((rbrac-r (rassq ?\] items)))
|
|
(when (and rbrac-r (not (eq (car rbrac-r) ?\])))
|
|
(setcdr rbrac-r ?\\)
|
|
(setq items (cons '(?\] . ?\]) items))))
|
|
|
|
;; Split ,-- (which would end up as ,- otherwise).
|
|
(let ((dash-r (rassq ?- items)))
|
|
(when (eq (car dash-r) ?,)
|
|
(setcdr dash-r ?,)
|
|
(setq items (nconc items '((?- . ?-))))))
|
|
|
|
;; Remove - (lone or at start of interval)
|
|
(let ((dash-l (assq ?- items)))
|
|
(when dash-l
|
|
(if (eq (cdr dash-l) ?-)
|
|
(setq items (delq dash-l items)) ; Remove lone -
|
|
(setcar dash-l ?.)) ; Reduce --x to .-x
|
|
(setq items (nconc items '((?- . ?-))))))
|
|
|
|
;; Deal with leading ^ and range ^-x.
|
|
(when (and (consp (car items))
|
|
(eq (caar items) ?^)
|
|
(cdr items))
|
|
;; Move ^ and ^-x to second place.
|
|
(setq items (cons (cadr items)
|
|
(cons (car items) (cddr items)))))
|
|
|
|
(cond
|
|
;; Empty set: if negated, any char, otherwise match-nothing.
|
|
((null items)
|
|
(if negated
|
|
(rx--translate-symbol 'anything)
|
|
(rx--empty)))
|
|
;; Single non-negated character.
|
|
((and (null (cdr items))
|
|
(consp (car items))
|
|
(eq (caar items) (cdar items))
|
|
(not negated))
|
|
(cons (list (regexp-quote (char-to-string (caar items))))
|
|
t))
|
|
;; Negated newline.
|
|
((and (equal items '((?\n . ?\n)))
|
|
negated)
|
|
(rx--translate-symbol 'nonl))
|
|
;; At least one character or class, possibly negated.
|
|
(t
|
|
(cons
|
|
(list
|
|
(concat
|
|
"["
|
|
(and negated "^")
|
|
(mapconcat (lambda (item)
|
|
(cond ((symbolp item)
|
|
(format "[:%s:]" item))
|
|
((eq (car item) (cdr item))
|
|
(char-to-string (car item)))
|
|
((eq (1+ (car item)) (cdr item))
|
|
(string (car item) (cdr item)))
|
|
(t
|
|
(string (car item) ?- (cdr item)))))
|
|
items nil)
|
|
"]"))
|
|
t)))))
|
|
|
|
(defun rx--translate-any (negated body)
|
|
"Translate an (any ...) construct. Return (REGEXP . PRECEDENCE).
|
|
If NEGATED, negate the sense."
|
|
(let ((parsed (rx--parse-any body)))
|
|
(rx--generate-alt negated (car parsed) (cdr parsed))))
|
|
|
|
(defun rx--intervals-to-alt (negated intervals)
|
|
"Generate a character alternative from an interval set.
|
|
Return (REGEXP . PRECEDENCE).
|
|
INTERVALS is a sorted list of disjoint intervals.
|
|
If NEGATED, negate the sense."
|
|
;; Detect whether the interval set is better described in
|
|
;; complemented form. This is not just a matter of aesthetics: any
|
|
;; range from ASCII to raw bytes will automatically exclude the
|
|
;; entire non-ASCII Unicode range by the regexp engine.
|
|
(if (rx--every (lambda (iv) (not (<= (car iv) #x3ffeff (cdr iv))))
|
|
intervals)
|
|
(rx--generate-alt negated intervals nil)
|
|
(rx--generate-alt
|
|
(not negated) (rx--complement-intervals intervals) nil)))
|
|
|
|
;; FIXME: Consider turning `not' into a variadic operator, following SRE:
|
|
;; (not A B) = (not (or A B)) = (intersection (not A) (not B)), and
|
|
;; (not) = anychar.
|
|
;; Maybe allow singleton characters as arguments.
|
|
|
|
(defun rx--translate-not (negated body)
|
|
"Translate a (not ...) construct. Return (REGEXP . PRECEDENCE).
|
|
If NEGATED, negate the sense (thus making it positive)."
|
|
(unless (and body (null (cdr body)))
|
|
(error "rx `not' form takes exactly one argument"))
|
|
(let ((arg (car body)))
|
|
(cond
|
|
((and (consp arg)
|
|
(pcase (car arg)
|
|
((or 'any 'in 'char)
|
|
(rx--translate-any (not negated) (cdr arg)))
|
|
('syntax
|
|
(rx--translate-syntax (not negated) (cdr arg)))
|
|
('category
|
|
(rx--translate-category (not negated) (cdr arg)))
|
|
('not
|
|
(rx--translate-not (not negated) (cdr arg)))
|
|
((or 'or '|)
|
|
(rx--translate-union (not negated) (cdr arg)))
|
|
('intersection
|
|
(rx--translate-intersection (not negated) (cdr arg))))))
|
|
((let ((class (cdr (assq arg rx--char-classes))))
|
|
(and class
|
|
(rx--generate-alt (not negated) nil (list class)))))
|
|
((eq arg 'word-boundary)
|
|
(rx--translate-symbol
|
|
(if negated 'word-boundary 'not-word-boundary)))
|
|
((characterp arg)
|
|
(rx--generate-alt (not negated) (list (cons arg arg)) nil))
|
|
((and (stringp arg) (= (length arg) 1))
|
|
(let ((char (string-to-char arg)))
|
|
(rx--generate-alt (not negated) (list (cons char char)) nil)))
|
|
((let ((expanded (rx--expand-def arg)))
|
|
(and expanded
|
|
(rx--translate-not negated (list expanded)))))
|
|
(t (error "Illegal argument to rx `not': %S" arg)))))
|
|
|
|
(defun rx--complement-intervals (intervals)
|
|
"Complement of the interval list INTERVALS."
|
|
(let ((compl nil)
|
|
(c 0))
|
|
(dolist (iv intervals)
|
|
(when (< c (car iv))
|
|
(push (cons c (1- (car iv))) compl))
|
|
(setq c (1+ (cdr iv))))
|
|
(when (< c (max-char))
|
|
(push (cons c (max-char)) compl))
|
|
(nreverse compl)))
|
|
|
|
(defun rx--intersect-intervals (ivs-a ivs-b)
|
|
"Intersection of the interval lists IVS-A and IVS-B."
|
|
(let ((isect nil))
|
|
(while (and ivs-a ivs-b)
|
|
(let ((a (car ivs-a))
|
|
(b (car ivs-b)))
|
|
(cond
|
|
((< (cdr a) (car b)) (setq ivs-a (cdr ivs-a)))
|
|
((> (car a) (cdr b)) (setq ivs-b (cdr ivs-b)))
|
|
(t
|
|
(push (cons (max (car a) (car b))
|
|
(min (cdr a) (cdr b)))
|
|
isect)
|
|
(setq ivs-a (cdr ivs-a))
|
|
(setq ivs-b (cdr ivs-b))
|
|
(cond ((< (cdr a) (cdr b))
|
|
(push (cons (1+ (cdr a)) (cdr b))
|
|
ivs-b))
|
|
((> (cdr a) (cdr b))
|
|
(push (cons (1+ (cdr b)) (cdr a))
|
|
ivs-a)))))))
|
|
(nreverse isect)))
|
|
|
|
(defun rx--union-intervals (ivs-a ivs-b)
|
|
"Union of the interval lists IVS-A and IVS-B."
|
|
(rx--complement-intervals
|
|
(rx--intersect-intervals
|
|
(rx--complement-intervals ivs-a)
|
|
(rx--complement-intervals ivs-b))))
|
|
|
|
(defun rx--charset-intervals (charset)
|
|
"Return a sorted list of non-adjacent disjoint intervals from CHARSET.
|
|
CHARSET is any expression allowed in a character set expression:
|
|
characters, single-char strings, `any' forms (no classes permitted),
|
|
or `not', `or' or `intersection' forms whose arguments are charsets."
|
|
(pcase charset
|
|
(`(,(or 'any 'in 'char) . ,body)
|
|
(let ((parsed (rx--parse-any body)))
|
|
(when (cdr parsed)
|
|
(error
|
|
"Character class not permitted in set operations: %S"
|
|
(cadr parsed)))
|
|
(car parsed)))
|
|
(`(not ,x) (rx--complement-intervals (rx--charset-intervals x)))
|
|
(`(,(or 'or '|) . ,body) (rx--charset-union body))
|
|
(`(intersection . ,body) (rx--charset-intersection body))
|
|
((pred characterp)
|
|
(list (cons charset charset)))
|
|
((guard (and (stringp charset) (= (length charset) 1)))
|
|
(let ((char (string-to-char charset)))
|
|
(list (cons char char))))
|
|
(_ (let ((expanded (rx--expand-def charset)))
|
|
(if expanded
|
|
(rx--charset-intervals expanded)
|
|
(error "Bad character set: %S" charset))))))
|
|
|
|
(defun rx--charset-union (charsets)
|
|
"Union of CHARSETS, as a set of intervals."
|
|
(rx--foldl #'rx--union-intervals nil
|
|
(mapcar #'rx--charset-intervals charsets)))
|
|
|
|
(defconst rx--charset-all (list (cons 0 (max-char))))
|
|
|
|
(defun rx--charset-intersection (charsets)
|
|
"Intersection of CHARSETS, as a set of intervals."
|
|
(rx--foldl #'rx--intersect-intervals rx--charset-all
|
|
(mapcar #'rx--charset-intervals charsets)))
|
|
|
|
(defun rx--translate-union (negated body)
|
|
"Translate an (or ...) construct of charsets. Return (REGEXP . PRECEDENCE).
|
|
If NEGATED, negate the sense."
|
|
(rx--intervals-to-alt negated (rx--charset-union body)))
|
|
|
|
(defun rx--translate-intersection (negated body)
|
|
"Translate an (intersection ...) construct. Return (REGEXP . PRECEDENCE).
|
|
If NEGATED, negate the sense."
|
|
(rx--intervals-to-alt negated (rx--charset-intersection body)))
|
|
|
|
(defun rx--atomic-regexp (item)
|
|
"ITEM is (REGEXP . PRECEDENCE); return a regexp of precedence t."
|
|
(if (eq (cdr item) t)
|
|
(car item)
|
|
(rx--bracket (car item))))
|
|
|
|
(defun rx--translate-counted-repetition (min-count max-count body)
|
|
(let ((operand (rx--translate-seq body)))
|
|
(if (car operand)
|
|
(cons (append
|
|
(rx--atomic-regexp operand)
|
|
(list (concat "\\{"
|
|
(number-to-string min-count)
|
|
(cond ((null max-count) ",")
|
|
((< min-count max-count)
|
|
(concat "," (number-to-string max-count))))
|
|
"\\}")))
|
|
t)
|
|
operand)))
|
|
|
|
(defun rx--check-repeat-arg (name min-args body)
|
|
(unless (>= (length body) min-args)
|
|
(error "rx `%s' requires at least %d argument%s"
|
|
name min-args (if (= min-args 1) "" "s")))
|
|
;; There seems to be no reason to disallow zero counts.
|
|
(unless (natnump (car body))
|
|
(error "rx `%s' first argument must be nonnegative" name)))
|
|
|
|
(defun rx--translate-bounded-repetition (name body)
|
|
(let ((min-count (car body))
|
|
(max-count (cadr body))
|
|
(items (cddr body)))
|
|
(unless (and (natnump min-count)
|
|
(natnump max-count)
|
|
(<= min-count max-count))
|
|
(error "rx `%s' range error" name))
|
|
(rx--translate-counted-repetition min-count max-count items)))
|
|
|
|
(defun rx--translate-repeat (body)
|
|
(rx--check-repeat-arg 'repeat 2 body)
|
|
(if (= (length body) 2)
|
|
(rx--translate-counted-repetition (car body) (car body) (cdr body))
|
|
(rx--translate-bounded-repetition 'repeat body)))
|
|
|
|
(defun rx--translate-** (body)
|
|
(rx--check-repeat-arg '** 2 body)
|
|
(rx--translate-bounded-repetition '** body))
|
|
|
|
(defun rx--translate->= (body)
|
|
(rx--check-repeat-arg '>= 1 body)
|
|
(rx--translate-counted-repetition (car body) nil (cdr body)))
|
|
|
|
(defun rx--translate-= (body)
|
|
(rx--check-repeat-arg '= 1 body)
|
|
(rx--translate-counted-repetition (car body) (car body) (cdr body)))
|
|
|
|
(defvar rx--greedy t)
|
|
|
|
(defun rx--translate-rep (op-string greedy body)
|
|
"Translate a repetition; OP-STRING is one of \"*\", \"+\" or \"?\".
|
|
GREEDY is a boolean. Return (REGEXP . PRECEDENCE)."
|
|
(let ((operand (rx--translate-seq body)))
|
|
(if (car operand)
|
|
(cons (append (rx--atomic-regexp operand)
|
|
(list (concat op-string (unless greedy "?"))))
|
|
;; The result has precedence seq to avoid (? (* "a")) -> "a*?"
|
|
'seq)
|
|
operand)))
|
|
|
|
(defun rx--control-greedy (greedy body)
|
|
"Translate the sequence BODY with greediness GREEDY.
|
|
Return (REGEXP . PRECEDENCE)."
|
|
(let ((rx--greedy greedy))
|
|
(rx--translate-seq body)))
|
|
|
|
(defun rx--translate-group (body)
|
|
"Translate the `group' form. Return (REGEXP . PRECEDENCE)."
|
|
(cons (rx--enclose "\\("
|
|
(car (rx--translate-seq body))
|
|
"\\)")
|
|
t))
|
|
|
|
(defun rx--translate-group-n (body)
|
|
"Translate the `group-n' form. Return (REGEXP . PRECEDENCE)."
|
|
(unless (and (integerp (car body)) (> (car body) 0))
|
|
(error "rx `group-n' requires a positive number as first argument"))
|
|
(cons (rx--enclose (concat "\\(?" (number-to-string (car body)) ":")
|
|
(car (rx--translate-seq (cdr body)))
|
|
"\\)")
|
|
t))
|
|
|
|
(defun rx--translate-backref (body)
|
|
"Translate the `backref' form. Return (REGEXP . PRECEDENCE)."
|
|
(unless (and (= (length body) 1) (integerp (car body)) (<= 1 (car body) 9))
|
|
(error "rx `backref' requires an argument in the range 1..9"))
|
|
(cons (list "\\" (number-to-string (car body))) t))
|
|
|
|
(defconst rx--syntax-codes
|
|
'((whitespace . ?-) ; SPC also accepted
|
|
(punctuation . ?.)
|
|
(word . ?w) ; W also accepted
|
|
(symbol . ?_)
|
|
(open-parenthesis . ?\()
|
|
(close-parenthesis . ?\))
|
|
(expression-prefix . ?\')
|
|
(string-quote . ?\")
|
|
(paired-delimiter . ?$)
|
|
(escape . ?\\)
|
|
(character-quote . ?/)
|
|
(comment-start . ?<)
|
|
(comment-end . ?>)
|
|
(string-delimiter . ?|)
|
|
(comment-delimiter . ?!)))
|
|
|
|
(defun rx--translate-syntax (negated body)
|
|
"Translate the `syntax' form. Return (REGEXP . PRECEDENCE)."
|
|
(unless (and body (null (cdr body)))
|
|
(error "rx `syntax' form takes exactly one argument"))
|
|
(let* ((sym (car body))
|
|
(syntax (cdr (assq sym rx--syntax-codes))))
|
|
(unless syntax
|
|
(cond
|
|
;; Syntax character directly (sregex compatibility)
|
|
((and (characterp sym) (rassq sym rx--syntax-codes))
|
|
(setq syntax sym))
|
|
;; Syntax character as symbol (sregex compatibility)
|
|
((symbolp sym)
|
|
(let ((name (symbol-name sym)))
|
|
(when (= (length name) 1)
|
|
(let ((char (string-to-char name)))
|
|
(when (rassq char rx--syntax-codes)
|
|
(setq syntax char)))))))
|
|
(unless syntax
|
|
(error "Unknown rx syntax name `%s'" sym)))
|
|
(cons (list (string ?\\ (if negated ?S ?s) syntax))
|
|
t)))
|
|
|
|
(defconst rx--categories
|
|
'((space-for-indent . ?\s)
|
|
(base . ?.)
|
|
(consonant . ?0)
|
|
(base-vowel . ?1)
|
|
(upper-diacritical-mark . ?2)
|
|
(lower-diacritical-mark . ?3)
|
|
(tone-mark . ?4)
|
|
(symbol . ?5)
|
|
(digit . ?6)
|
|
(vowel-modifying-diacritical-mark . ?7)
|
|
(vowel-sign . ?8)
|
|
(semivowel-lower . ?9)
|
|
(not-at-end-of-line . ?<)
|
|
(not-at-beginning-of-line . ?>)
|
|
(alpha-numeric-two-byte . ?A)
|
|
(chinese-two-byte . ?C)
|
|
(chinse-two-byte . ?C) ; A typo in Emacs 21.1-24.3.
|
|
(greek-two-byte . ?G)
|
|
(japanese-hiragana-two-byte . ?H)
|
|
(indian-two-byte . ?I)
|
|
(japanese-katakana-two-byte . ?K)
|
|
(strong-left-to-right . ?L)
|
|
(korean-hangul-two-byte . ?N)
|
|
(strong-right-to-left . ?R)
|
|
(cyrillic-two-byte . ?Y)
|
|
(combining-diacritic . ?^)
|
|
(ascii . ?a)
|
|
(arabic . ?b)
|
|
(chinese . ?c)
|
|
(ethiopic . ?e)
|
|
(greek . ?g)
|
|
(korean . ?h)
|
|
(indian . ?i)
|
|
(japanese . ?j)
|
|
(japanese-katakana . ?k)
|
|
(latin . ?l)
|
|
(lao . ?o)
|
|
(tibetan . ?q)
|
|
(japanese-roman . ?r)
|
|
(thai . ?t)
|
|
(vietnamese . ?v)
|
|
(hebrew . ?w)
|
|
(cyrillic . ?y)
|
|
(can-break . ?|)))
|
|
|
|
(defun rx--translate-category (negated body)
|
|
"Translate the `category' form. Return (REGEXP . PRECEDENCE)."
|
|
(unless (and body (null (cdr body)))
|
|
(error "rx `category' form takes exactly one argument"))
|
|
(let* ((arg (car body))
|
|
(category
|
|
(cond ((symbolp arg)
|
|
(let ((cat (assq arg rx--categories)))
|
|
(unless cat
|
|
(error "Unknown rx category `%s'" arg))
|
|
(cdr cat)))
|
|
((characterp arg) arg)
|
|
(t (error "Invalid rx `category' argument `%s'" arg)))))
|
|
(cons (list (string ?\\ (if negated ?C ?c) category))
|
|
t)))
|
|
|
|
(defvar rx--delayed-evaluation nil
|
|
"Whether to allow certain forms to be evaluated at runtime.")
|
|
|
|
(defun rx--translate-literal (body)
|
|
"Translate the `literal' form. Return (REGEXP . PRECEDENCE)."
|
|
(unless (and body (null (cdr body)))
|
|
(error "rx `literal' form takes exactly one argument"))
|
|
(let ((arg (car body)))
|
|
(cond ((stringp arg)
|
|
(cons (list (regexp-quote arg)) (if (= (length arg) 1) t 'seq)))
|
|
(rx--delayed-evaluation
|
|
(cons (list (list 'regexp-quote arg)) 'seq))
|
|
(t (error "rx `literal' form with non-string argument")))))
|
|
|
|
(defun rx--expand-eval (body)
|
|
"Expand `eval' arguments. Return a new rx form."
|
|
(unless (and body (null (cdr body)))
|
|
(error "rx `eval' form takes exactly one argument"))
|
|
(eval (car body)))
|
|
|
|
(defun rx--translate-eval (body)
|
|
"Translate the `eval' form. Return (REGEXP . PRECEDENCE)."
|
|
(rx--translate (rx--expand-eval body)))
|
|
|
|
(defvar rx--regexp-atomic-regexp nil)
|
|
|
|
(defun rx--translate-regexp (body)
|
|
"Translate the `regexp' form. Return (REGEXP . PRECEDENCE)."
|
|
(unless (and body (null (cdr body)))
|
|
(error "rx `regexp' form takes exactly one argument"))
|
|
(let ((arg (car body)))
|
|
(cond ((stringp arg)
|
|
;; Generate the regexp when needed, since rx isn't
|
|
;; necessarily present in the byte-compilation environment.
|
|
(unless rx--regexp-atomic-regexp
|
|
(setq rx--regexp-atomic-regexp
|
|
;; Match atomic (precedence t) regexps: may give
|
|
;; false negatives but no false positives, assuming
|
|
;; the target string is syntactically correct.
|
|
(rx-to-string
|
|
'(seq
|
|
bos
|
|
(or (seq "["
|
|
(opt "^")
|
|
(opt "]")
|
|
(* (or (seq "[:" (+ (any "a-z")) ":]")
|
|
(not (any "]"))))
|
|
"]")
|
|
(not (any "*+?^$[\\"))
|
|
(seq "\\"
|
|
(or anything
|
|
(seq (any "sScC_") anything)
|
|
(seq "("
|
|
(* (or (not (any "\\"))
|
|
(seq "\\" (not (any ")")))))
|
|
"\\)"))))
|
|
eos)
|
|
t)))
|
|
(cons (list arg)
|
|
(if (string-match-p rx--regexp-atomic-regexp arg) t nil)))
|
|
(rx--delayed-evaluation
|
|
(cons (list arg) nil))
|
|
(t (error "rx `regexp' form with non-string argument")))))
|
|
|
|
(defun rx--translate-compat-form (def form)
|
|
"Translate a compatibility form from `rx-constituents'.
|
|
DEF is the definition tuple. Return (REGEXP . PRECEDENCE)."
|
|
(let* ((fn (nth 0 def))
|
|
(min-args (nth 1 def))
|
|
(max-args (nth 2 def))
|
|
(predicate (nth 3 def))
|
|
(nargs (1- (length form))))
|
|
(when (< nargs min-args)
|
|
(error "The `%s' form takes at least %d argument(s)"
|
|
(car form) min-args))
|
|
(when (and max-args (> nargs max-args))
|
|
(error "The `%s' form takes at most %d argument(s)"
|
|
(car form) max-args))
|
|
(when (and predicate (not (rx--every predicate (cdr form))))
|
|
(error "The `%s' form requires arguments satisfying `%s'"
|
|
(car form) predicate))
|
|
(let ((regexp (funcall fn form)))
|
|
(unless (stringp regexp)
|
|
(error "The `%s' form did not expand to a string" (car form)))
|
|
(cons (list regexp) nil))))
|
|
|
|
(defun rx--substitute (bindings form)
|
|
"Substitute BINDINGS in FORM. BINDINGS is an alist of (NAME . VALUES)
|
|
where VALUES is a list to splice into FORM wherever NAME occurs.
|
|
Return the substitution result wrapped in a list, since a single value
|
|
can expand to any number of values."
|
|
(cond ((symbolp form)
|
|
(let ((binding (assq form bindings)))
|
|
(if binding
|
|
(cdr binding)
|
|
(list form))))
|
|
((consp form)
|
|
(if (listp (cdr form))
|
|
;; Proper list. We substitute variables even in the head
|
|
;; position -- who knows, might be handy one day.
|
|
(list (mapcan (lambda (x) (copy-sequence
|
|
(rx--substitute bindings x)))
|
|
form))
|
|
;; Cons pair (presumably an interval).
|
|
(let ((first (rx--substitute bindings (car form)))
|
|
(second (rx--substitute bindings (cdr form))))
|
|
(if (and first (not (cdr first))
|
|
second (not (cdr second)))
|
|
(list (cons (car first) (car second)))
|
|
(error
|
|
"Cannot substitute a &rest parameter into a dotted pair")))))
|
|
(t (list form))))
|
|
|
|
;; FIXME: Consider adding extensions in Lisp macro style, where
|
|
;; arguments are passed unevaluated to code that returns the rx form
|
|
;; to use. Example:
|
|
;;
|
|
;; (rx-let ((radix-digit (radix)
|
|
;; :lisp (list 'any (cons ?0 (+ ?0 (eval radix) -1)))))
|
|
;; (rx (radix-digit (+ 5 3))))
|
|
;; =>
|
|
;; "[0-7]"
|
|
;;
|
|
;; While this would permit more powerful extensions, it's unclear just
|
|
;; how often they would be used in practice. Let's wait until there is
|
|
;; demand for it.
|
|
|
|
;; FIXME: An alternative binding syntax would be
|
|
;;
|
|
;; (NAME RXs...)
|
|
;; and
|
|
;; ((NAME ARGS...) RXs...)
|
|
;;
|
|
;; which would have two minor advantages: multiple RXs with implicit
|
|
;; `seq' in the definition, and the arglist is no longer an optional
|
|
;; element in the middle of the list. On the other hand, it's less
|
|
;; like traditional lisp arglist constructs (defun, defmacro).
|
|
;; Since it's a Scheme-like syntax, &rest parameters could be done using
|
|
;; dotted lists:
|
|
;; (rx-let (((name arg1 arg2 . rest) ...definition...)) ...)
|
|
|
|
(defun rx--expand-template (op values arglist template)
|
|
"Return TEMPLATE with variables in ARGLIST replaced with VALUES."
|
|
(let ((bindings nil)
|
|
(value-tail values)
|
|
(formals arglist))
|
|
(while formals
|
|
(pcase (car formals)
|
|
('&rest
|
|
(unless (cdr formals)
|
|
(error
|
|
"Expanding rx def `%s': missing &rest parameter name" op))
|
|
(push (cons (cadr formals) value-tail) bindings)
|
|
(setq formals nil)
|
|
(setq value-tail nil))
|
|
(name
|
|
(unless value-tail
|
|
(error
|
|
"Expanding rx def `%s': too few arguments (got %d, need %s%d)"
|
|
op (length values)
|
|
(if (memq '&rest arglist) "at least " "")
|
|
(- (length arglist) (length (memq '&rest arglist)))))
|
|
(push (cons name (list (car value-tail))) bindings)
|
|
(setq value-tail (cdr value-tail))))
|
|
(setq formals (cdr formals)))
|
|
(when value-tail
|
|
(error
|
|
"Expanding rx def `%s': too many arguments (got %d, need %d)"
|
|
op (length values) (length arglist)))
|
|
(let ((subst (rx--substitute bindings template)))
|
|
(if (and subst (not (cdr subst)))
|
|
(car subst)
|
|
(error "Expanding rx def `%s': must result in a single value" op)))))
|
|
|
|
(defun rx--translate-form (form)
|
|
"Translate an rx form (list structure). Return (REGEXP . PRECEDENCE)."
|
|
(let ((body (cdr form)))
|
|
(pcase (car form)
|
|
((or 'seq : 'and 'sequence) (rx--translate-seq body))
|
|
((or 'or '|) (rx--translate-or body))
|
|
((or 'any 'in 'char) (rx--translate-any nil body))
|
|
('not-char (rx--translate-any t body))
|
|
('not (rx--translate-not nil body))
|
|
('intersection (rx--translate-intersection nil body))
|
|
|
|
('repeat (rx--translate-repeat body))
|
|
('= (rx--translate-= body))
|
|
('>= (rx--translate->= body))
|
|
('** (rx--translate-** body))
|
|
|
|
((or 'zero-or-more '0+) (rx--translate-rep "*" rx--greedy body))
|
|
((or 'one-or-more '1+) (rx--translate-rep "+" rx--greedy body))
|
|
((or 'zero-or-one 'opt 'optional) (rx--translate-rep "?" rx--greedy body))
|
|
|
|
('* (rx--translate-rep "*" t body))
|
|
('+ (rx--translate-rep "+" t body))
|
|
((or '\? ?\s) (rx--translate-rep "?" t body))
|
|
|
|
('*? (rx--translate-rep "*" nil body))
|
|
('+? (rx--translate-rep "+" nil body))
|
|
((or '\?? ??) (rx--translate-rep "?" nil body))
|
|
|
|
('minimal-match (rx--control-greedy nil body))
|
|
('maximal-match (rx--control-greedy t body))
|
|
|
|
((or 'group 'submatch) (rx--translate-group body))
|
|
((or 'group-n 'submatch-n) (rx--translate-group-n body))
|
|
('backref (rx--translate-backref body))
|
|
|
|
('syntax (rx--translate-syntax nil body))
|
|
('not-syntax (rx--translate-syntax t body))
|
|
('category (rx--translate-category nil body))
|
|
|
|
('literal (rx--translate-literal body))
|
|
('eval (rx--translate-eval body))
|
|
((or 'regexp 'regex) (rx--translate-regexp body))
|
|
|
|
(op
|
|
(cond
|
|
((not (symbolp op)) (error "Bad rx operator `%S'" op))
|
|
|
|
((let ((expanded (rx--expand-def form)))
|
|
(and expanded
|
|
(rx--translate expanded))))
|
|
|
|
;; For compatibility with old rx.
|
|
((let ((entry (assq op rx-constituents)))
|
|
(and (progn
|
|
(while (and entry (not (consp (cdr entry))))
|
|
(setq entry
|
|
(if (symbolp (cdr entry))
|
|
;; Alias for another entry.
|
|
(assq (cdr entry) rx-constituents)
|
|
;; Wrong type, try further down the list.
|
|
(assq (car entry)
|
|
(cdr (memq entry rx-constituents))))))
|
|
entry)
|
|
(rx--translate-compat-form (cdr entry) form))))
|
|
|
|
(t (error "Unknown rx form `%s'" op)))))))
|
|
|
|
(defconst rx--builtin-forms
|
|
'(seq sequence : and or | any in char not-char not intersection
|
|
repeat = >= **
|
|
zero-or-more 0+ *
|
|
one-or-more 1+ +
|
|
zero-or-one opt optional \?
|
|
*? +? \??
|
|
minimal-match maximal-match
|
|
group submatch group-n submatch-n backref
|
|
syntax not-syntax category
|
|
literal eval regexp regex)
|
|
"List of built-in rx function-like symbols.")
|
|
|
|
(defconst rx--builtin-symbols
|
|
(append '(nonl not-newline any anychar anything unmatchable
|
|
bol eol line-start line-end
|
|
bos eos string-start string-end
|
|
bow eow word-start word-end
|
|
symbol-start symbol-end
|
|
point word-boundary not-word-boundary not-wordchar)
|
|
(mapcar #'car rx--char-classes))
|
|
"List of built-in rx variable-like symbols.")
|
|
|
|
(defconst rx--builtin-names
|
|
(append rx--builtin-forms rx--builtin-symbols)
|
|
"List of built-in rx names. These cannot be redefined by the user.")
|
|
|
|
;; Declare Lisp indentation rules for constructs that take 1 or 2
|
|
;; parameters before a body of RX forms.
|
|
;; (`>=' and `=' are omitted because they are more likely to be used
|
|
;; as Lisp functions than RX constructs; `repeat' is a `defcustom' type.)
|
|
(put 'group-n 'lisp-indent-function 1)
|
|
(put 'submatch-n 'lisp-indent-function 1)
|
|
(put '** 'lisp-indent-function 2)
|
|
|
|
|
|
(defun rx--translate (item)
|
|
"Translate the rx-expression ITEM. Return (REGEXP . PRECEDENCE)."
|
|
(cond
|
|
((stringp item)
|
|
(if (= (length item) 0)
|
|
(cons nil 'seq)
|
|
(cons (list (regexp-quote item)) (if (= (length item) 1) t 'seq))))
|
|
((characterp item)
|
|
(cons (list (regexp-quote (char-to-string item))) t))
|
|
((symbolp item)
|
|
(rx--translate-symbol item))
|
|
((consp item)
|
|
(rx--translate-form item))
|
|
(t (error "Bad rx expression: %S" item))))
|
|
|
|
|
|
;;;###autoload
|
|
(defun rx-to-string (form &optional no-group)
|
|
"Translate FORM from `rx' sexp syntax into a string regexp.
|
|
The arguments to `literal' and `regexp' forms inside FORM must be
|
|
constant strings.
|
|
If NO-GROUP is non-nil, don't bracket the result in a non-capturing
|
|
group.
|
|
|
|
For extending the `rx' notation in FORM, use `rx-define' or `rx-let-eval'."
|
|
(let* ((item (rx--translate form))
|
|
(exprs (if no-group
|
|
(car item)
|
|
(rx--atomic-regexp item))))
|
|
(apply #'concat exprs)))
|
|
|
|
(defun rx--to-expr (form)
|
|
"Translate the rx-expression FORM to a Lisp expression yielding a regexp."
|
|
(let* ((rx--local-definitions
|
|
;; Retrieve local definitions from the macroexpansion environment.
|
|
;; (It's unclear whether the previous value of `rx--local-definitions'
|
|
;; should be included, and if so, in which order.)
|
|
(cdr (assq :rx-locals macroexpand-all-environment)))
|
|
(rx--delayed-evaluation t)
|
|
(elems (car (rx--translate form)))
|
|
(args nil))
|
|
;; Merge adjacent strings.
|
|
(while elems
|
|
(let ((strings nil))
|
|
(while (and elems (stringp (car elems)))
|
|
(push (car elems) strings)
|
|
(setq elems (cdr elems)))
|
|
(let ((s (apply #'concat (nreverse strings))))
|
|
(unless (zerop (length s))
|
|
(push s args))))
|
|
(when elems
|
|
(push (car elems) args)
|
|
(setq elems (cdr elems))))
|
|
(cond ((null args) "") ; 0 args
|
|
((cdr args) (cons 'concat (nreverse args))) ; ≥2 args
|
|
(t (car args))))) ; 1 arg
|
|
|
|
|
|
;;;###autoload
|
|
(defmacro rx (&rest regexps)
|
|
"Translate regular expressions REGEXPS in sexp form to a regexp string.
|
|
Each argument is one of the forms below; RX is a subform, and RX... stands
|
|
for zero or more RXs. For details, see Info node `(elisp) Rx Notation'.
|
|
See `rx-to-string' for the corresponding function.
|
|
|
|
STRING Match a literal string.
|
|
CHAR Match a literal character.
|
|
|
|
(seq RX...) Match the RXs in sequence. Alias: :, sequence, and.
|
|
(or RX...) Match one of the RXs. Alias: |.
|
|
|
|
(zero-or-more RX...) Match RXs zero or more times. Alias: 0+.
|
|
(one-or-more RX...) Match RXs one or more times. Alias: 1+.
|
|
(zero-or-one RX...) Match RXs or the empty string. Alias: opt, optional.
|
|
(* RX...) Match RXs zero or more times; greedy.
|
|
(+ RX...) Match RXs one or more times; greedy.
|
|
(? RX...) Match RXs or the empty string; greedy.
|
|
(*? RX...) Match RXs zero or more times; non-greedy.
|
|
(+? RX...) Match RXs one or more times; non-greedy.
|
|
(?? RX...) Match RXs or the empty string; non-greedy.
|
|
(= N RX...) Match RXs exactly N times.
|
|
(>= N RX...) Match RXs N or more times.
|
|
(** N M RX...) Match RXs N to M times. Alias: repeat.
|
|
(minimal-match RX) Match RX, with zero-or-more, one-or-more, zero-or-one
|
|
and aliases using non-greedy matching.
|
|
(maximal-match RX) Match RX, with zero-or-more, one-or-more, zero-or-one
|
|
and aliases using greedy matching, which is the default.
|
|
|
|
(any SET...) Match a character from one of the SETs. Each SET is a
|
|
character, a string, a range as string \"A-Z\" or cons
|
|
(?A . ?Z), or a character class (see below). Alias: in, char.
|
|
(not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC
|
|
can be a character, single-char string, (any ...), (or ...),
|
|
(intersection ...), (syntax ...), (category ...),
|
|
or a character class.
|
|
(intersection CHARSET...) Match all CHARSETs.
|
|
CHARSET is (any...), (not...), (or...) or (intersection...),
|
|
a character or a single-char string.
|
|
not-newline Match any character except a newline. Alias: nonl.
|
|
anychar Match any character. Alias: anything.
|
|
unmatchable Never match anything at all.
|
|
|
|
CHARCLASS Match a character from a character class. One of:
|
|
alpha, alphabetic, letter Alphabetic characters (defined by Unicode).
|
|
alnum, alphanumeric Alphabetic or decimal digit chars (Unicode).
|
|
digit, numeric, num 0-9.
|
|
xdigit, hex-digit, hex 0-9, A-F, a-f.
|
|
cntrl, control ASCII codes 0-31.
|
|
blank Horizontal whitespace (Unicode).
|
|
space, whitespace, white Chars with whitespace syntax.
|
|
lower, lower-case Lower-case chars, from current case table.
|
|
upper, upper-case Upper-case chars, from current case table.
|
|
graph, graphic Graphic characters (Unicode).
|
|
print, printing Whitespace or graphic (Unicode).
|
|
punct, punctuation Not control, space, letter or digit (ASCII);
|
|
not word syntax (non-ASCII).
|
|
word, wordchar Characters with word syntax.
|
|
ascii ASCII characters (codes 0-127).
|
|
nonascii Non-ASCII characters (but not raw bytes).
|
|
|
|
(syntax SYNTAX) Match a character with syntax SYNTAX, being one of:
|
|
whitespace, punctuation, word, symbol, open-parenthesis,
|
|
close-parenthesis, expression-prefix, string-quote,
|
|
paired-delimiter, escape, character-quote, comment-start,
|
|
comment-end, string-delimiter, comment-delimiter
|
|
|
|
(category CAT) Match a character in category CAT, being one of:
|
|
space-for-indent, base, consonant, base-vowel,
|
|
upper-diacritical-mark, lower-diacritical-mark, tone-mark, symbol,
|
|
digit, vowel-modifying-diacritical-mark, vowel-sign,
|
|
semivowel-lower, not-at-end-of-line, not-at-beginning-of-line,
|
|
alpha-numeric-two-byte, chinese-two-byte, greek-two-byte,
|
|
japanese-hiragana-two-byte, indian-two-byte,
|
|
japanese-katakana-two-byte, strong-left-to-right,
|
|
korean-hangul-two-byte, strong-right-to-left, cyrillic-two-byte,
|
|
combining-diacritic, ascii, arabic, chinese, ethiopic, greek,
|
|
korean, indian, japanese, japanese-katakana, latin, lao,
|
|
tibetan, japanese-roman, thai, vietnamese, hebrew, cyrillic,
|
|
can-break
|
|
|
|
Zero-width assertions: these all match the empty string in specific places.
|
|
line-start At the beginning of a line. Alias: bol.
|
|
line-end At the end of a line. Alias: eol.
|
|
string-start At the start of the string or buffer.
|
|
Alias: buffer-start, bos, bot.
|
|
string-end At the end of the string or buffer.
|
|
Alias: buffer-end, eos, eot.
|
|
point At point.
|
|
word-start At the beginning of a word. Alias: bow.
|
|
word-end At the end of a word. Alias: eow.
|
|
word-boundary At the beginning or end of a word.
|
|
not-word-boundary Not at the beginning or end of a word.
|
|
symbol-start At the beginning of a symbol.
|
|
symbol-end At the end of a symbol.
|
|
|
|
(group RX...) Match RXs and define a capture group. Alias: submatch.
|
|
(group-n N RX...) Match RXs and define capture group N. Alias: submatch-n.
|
|
(backref N) Match the text that capture group N matched.
|
|
|
|
(literal EXPR) Match the literal string from evaluating EXPR at run time.
|
|
(regexp EXPR) Match the string regexp from evaluating EXPR at run time.
|
|
(eval EXPR) Match the rx sexp from evaluating EXPR at macro-expansion
|
|
(compile) time.
|
|
|
|
Additional constructs can be defined using `rx-define' and `rx-let',
|
|
which see.
|
|
|
|
\(fn REGEXPS...)"
|
|
(rx--to-expr (cons 'seq regexps)))
|
|
|
|
(defun rx--make-binding (name tail)
|
|
"Make a definitions entry out of TAIL.
|
|
TAIL is on the form ([ARGLIST] DEFINITION)."
|
|
(unless (symbolp name)
|
|
(error "Bad `rx' definition name: %S" name))
|
|
;; FIXME: Consider using a hash table or symbol property, for speed.
|
|
(when (memq name rx--builtin-names)
|
|
(error "Cannot redefine built-in rx name `%s'" name))
|
|
(pcase tail
|
|
(`(,def)
|
|
(list def))
|
|
(`(,args ,def)
|
|
(unless (and (listp args) (rx--every #'symbolp args))
|
|
(error "Bad argument list for `rx' definition %s: %S" name args))
|
|
(list args def))
|
|
(_ (error "Bad `rx' definition of %s: %S" name tail))))
|
|
|
|
(defun rx--make-named-binding (bindspec)
|
|
"Make a definitions entry out of BINDSPEC.
|
|
BINDSPEC is on the form (NAME [ARGLIST] DEFINITION)."
|
|
(unless (consp bindspec)
|
|
(error "Bad `rx-let' binding: %S" bindspec))
|
|
(cons (car bindspec)
|
|
(rx--make-binding (car bindspec) (cdr bindspec))))
|
|
|
|
(defun rx--extend-local-defs (bindspecs)
|
|
(append (mapcar #'rx--make-named-binding bindspecs)
|
|
rx--local-definitions))
|
|
|
|
;;;###autoload
|
|
(defmacro rx-let-eval (bindings &rest body)
|
|
"Evaluate BODY with local BINDINGS for `rx-to-string'.
|
|
BINDINGS, after evaluation, is a list of definitions each on the form
|
|
(NAME [(ARGS...)] RX), in effect for calls to `rx-to-string'
|
|
in BODY.
|
|
|
|
For bindings without an ARGS list, NAME is defined as an alias
|
|
for the `rx' expression RX. Where ARGS is supplied, NAME is
|
|
defined as an `rx' form with ARGS as argument list. The
|
|
parameters are bound from the values in the (NAME ...) form and
|
|
are substituted in RX. ARGS can contain `&rest' parameters,
|
|
whose values are spliced into RX where the parameter name occurs.
|
|
|
|
Any previous definitions with the same names are shadowed during
|
|
the expansion of BODY only.
|
|
For extensions when using the `rx' macro, use `rx-let'.
|
|
To make global rx extensions, use `rx-define'.
|
|
For more details, see Info node `(elisp) Extending Rx'.
|
|
|
|
\(fn BINDINGS BODY...)"
|
|
(declare (indent 1) (debug (form body)))
|
|
;; FIXME: this way, `rx--extend-local-defs' may need to be autoloaded.
|
|
`(let ((rx--local-definitions (rx--extend-local-defs ,bindings)))
|
|
,@body))
|
|
|
|
;;;###autoload
|
|
(defmacro rx-let (bindings &rest body)
|
|
"Evaluate BODY with local BINDINGS for `rx'.
|
|
BINDINGS is an unevaluated list of bindings each on the form
|
|
(NAME [(ARGS...)] RX).
|
|
They are bound lexically and are available in `rx' expressions in
|
|
BODY only.
|
|
|
|
For bindings without an ARGS list, NAME is defined as an alias
|
|
for the `rx' expression RX. Where ARGS is supplied, NAME is
|
|
defined as an `rx' form with ARGS as argument list. The
|
|
parameters are bound from the values in the (NAME ...) form and
|
|
are substituted in RX. ARGS can contain `&rest' parameters,
|
|
whose values are spliced into RX where the parameter name occurs.
|
|
|
|
Any previous definitions with the same names are shadowed during
|
|
the expansion of BODY only.
|
|
For local extensions to `rx-to-string', use `rx-let-eval'.
|
|
To make global rx extensions, use `rx-define'.
|
|
For more details, see Info node `(elisp) Extending Rx'.
|
|
|
|
\(fn BINDINGS BODY...)"
|
|
(declare (indent 1) (debug (sexp body)))
|
|
(let ((prev-locals (cdr (assq :rx-locals macroexpand-all-environment)))
|
|
(new-locals (mapcar #'rx--make-named-binding bindings)))
|
|
(macroexpand-all (cons 'progn body)
|
|
(cons (cons :rx-locals (append new-locals prev-locals))
|
|
macroexpand-all-environment))))
|
|
|
|
;;;###autoload
|
|
(defmacro rx-define (name &rest definition)
|
|
"Define NAME as a global `rx' definition.
|
|
If the ARGS list is omitted, define NAME as an alias for the `rx'
|
|
expression RX.
|
|
|
|
If the ARGS list is supplied, define NAME as an `rx' form with
|
|
ARGS as argument list. The parameters are bound from the values
|
|
in the (NAME ...) form and are substituted in RX.
|
|
ARGS can contain `&rest' parameters, whose values are spliced
|
|
into RX where the parameter name occurs.
|
|
|
|
Any previous global definition of NAME is overwritten with the new one.
|
|
To make local rx extensions, use `rx-let' for `rx',
|
|
`rx-let-eval' for `rx-to-string'.
|
|
For more details, see Info node `(elisp) Extending Rx'.
|
|
|
|
\(fn NAME [(ARGS...)] RX)"
|
|
(declare (indent defun))
|
|
`(eval-and-compile
|
|
(put ',name 'rx-definition ',(rx--make-binding name definition))
|
|
',name))
|
|
|
|
;; During `rx--pcase-transform', list of defined variables in right-to-left
|
|
;; order.
|
|
(defvar rx--pcase-vars)
|
|
|
|
;; FIXME: The rewriting strategy for pcase works so-so with extensions;
|
|
;; definitions cannot expand to `let' or named `backref'. If this ever
|
|
;; becomes a problem, we can handle those forms in the ordinary parser,
|
|
;; using a dynamic variable for activating the augmented forms.
|
|
|
|
(defun rx--pcase-transform (rx)
|
|
"Transform RX, an rx-expression augmented with `let' and named `backref',
|
|
into a plain rx-expression, collecting names into `rx--pcase-vars'."
|
|
(pcase rx
|
|
(`(let ,name . ,body)
|
|
(let* ((index (length (memq name rx--pcase-vars)))
|
|
(i (if (zerop index)
|
|
(length (push name rx--pcase-vars))
|
|
index)))
|
|
`(group-n ,i ,(rx--pcase-transform (cons 'seq body)))))
|
|
((and `(backref ,ref)
|
|
(guard (symbolp ref)))
|
|
(let ((index (length (memq ref rx--pcase-vars))))
|
|
(when (zerop index)
|
|
(error "rx `backref' variable must be one of: %s"
|
|
(mapconcat #'symbol-name rx--pcase-vars " ")))
|
|
`(backref ,index)))
|
|
((and `(,head . ,rest)
|
|
(guard (and (or (symbolp head) (memq head '(?\s ??)))
|
|
(not (memq head '(literal regexp regex eval))))))
|
|
(cons head (mapcar #'rx--pcase-transform rest)))
|
|
(_ rx)))
|
|
|
|
(defun rx--reduce-right (f l)
|
|
"Right-reduction on L by F. L must be non-empty."
|
|
(if (cdr l)
|
|
(funcall f (car l) (rx--reduce-right f (cdr l)))
|
|
(car l)))
|
|
|
|
;;;###autoload
|
|
(pcase-defmacro rx (&rest regexps)
|
|
"A pattern that matches strings against `rx' REGEXPS in sexp form.
|
|
REGEXPS are interpreted as in `rx'. The pattern matches any
|
|
string that is a match for REGEXPS, as if by `string-match'.
|
|
|
|
In addition to the usual `rx' syntax, REGEXPS can contain the
|
|
following constructs:
|
|
|
|
(let REF RX...) binds the symbol REF to a submatch that matches
|
|
the regular expressions RX. REF is bound in
|
|
CODE to the string of the submatch or nil, but
|
|
can also be used in `backref'.
|
|
(backref REF) matches whatever the submatch REF matched.
|
|
REF can be a number, as usual, or a name
|
|
introduced by a previous (let REF ...)
|
|
construct."
|
|
(rx--pcase-expand regexps))
|
|
|
|
;; Autoloaded because it's referred to by the pcase rx macro above,
|
|
;; whose body ends up in loaddefs.el.
|
|
;;;###autoload
|
|
(defun rx--pcase-expand (regexps)
|
|
(let* ((rx--pcase-vars nil)
|
|
(regexp (rx--to-expr (rx--pcase-transform (cons 'seq regexps)))))
|
|
`(and (pred stringp)
|
|
,(pcase (length rx--pcase-vars)
|
|
(0
|
|
;; No variables bound: a single predicate suffices.
|
|
`(pred (string-match ,regexp)))
|
|
(1
|
|
;; Create a match value that on a successful regexp match
|
|
;; is the submatch value, 0 on failure. We can't use nil
|
|
;; for failure because it is a valid submatch value.
|
|
`(app (lambda (s)
|
|
(if (string-match ,regexp s)
|
|
(match-string 1 s)
|
|
0))
|
|
(and ,(car rx--pcase-vars) (pred (not numberp)))))
|
|
(nvars
|
|
;; Pack the submatches into a dotted list which is then
|
|
;; immediately destructured into individual variables again.
|
|
;; This is of course slightly inefficient.
|
|
;; A dotted list is used to reduce the number of conses
|
|
;; to create and take apart.
|
|
`(app (lambda (s)
|
|
(and (string-match ,regexp s)
|
|
,(rx--reduce-right
|
|
(lambda (a b) `(cons ,a ,b))
|
|
(mapcar (lambda (i) `(match-string ,i s))
|
|
(number-sequence 1 nvars)))))
|
|
,(list '\`
|
|
(rx--reduce-right
|
|
#'cons
|
|
(mapcar (lambda (name) (list '\, name))
|
|
(reverse rx--pcase-vars))))))))))
|
|
|
|
;; Obsolete internal symbol, used in old versions of the `flycheck' package.
|
|
(define-obsolete-function-alias 'rx-submatch-n 'rx-to-string "27.1")
|
|
|
|
(provide 'rx)
|
|
|
|
;;; rx.el ends here
|