;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX) ;; Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Sofware Foundation, Inc. ;; Author: Vinicius Jose Latorre ;; Maintainer: Vinicius Jose Latorre ;; Time-stamp: <2004/04/03 16:45:34 vinicius> ;; Keywords: wp, ebnf, PostScript ;; Version: 1.1 ;; This file is part of GNU Emacs. ;; GNU Emacs is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 2, or (at your option) ;; any later version. ;; GNU Emacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs; see the file COPYING. If not, write to the ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ;; Boston, MA 02110-1301, USA. ;;; Commentary: ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; ;; This is part of ebnf2ps package. ;; ;; This package defines a parser for EBNF used to specify XML (EBNFX). ;; ;; See ebnf2ps.el for documentation. ;; ;; ;; EBNFX Syntax ;; ------------ ;; ;; See the URL: ;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation' ;; (Extensible Markup Language (XML) 1.0 (Third Edition)) ;; ;; ;; rule ::= symbol '::=' expression ;; /* rules are separated by at least one blank line. */ ;; ;; expression ::= concatenation ('|' concatenation)* ;; ;; concatenation ::= exception* ;; ;; exception ::= term ('-' term)? ;; ;; term ::= factor ('*' | '+' | '?')? ;; ;; factor ::= hex-char+ ;; | '[' '^'? ( char ( '-' char )? )+ ']' ;; | '"' 'string' '"' ;; | "'" "string" "'" ;; | '(' expression ')' ;; | symbol ;; ;; symbol ::= 'upper or lower case letter' ;; ('upper or lower case letter' | '-' | '_')* ;; /* upper and lower 8-bit accentuated characters are included */ ;; ;; hex-char ::= '#x' [0-9A-Fa-f]+ ;; ;; char ::= hex-char | 'any character except control characters' ;; /* 8-bit accentuated characters are included */ ;; ;; any-char ::= char | 'newline' | 'tab' ;; ;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']' ;; ;; comment ::= '/*' ( any-char - '*/' ) '*/' ;; ;; ;; Below is the Notation section extracted from the URL cited above. ;; ;; 6 Notation ;; ;; The formal grammar of XML is given in this specification using a simple ;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines ;; one symbol, in the form ;; ;; symbol ::= expression ;; ;; Symbols are written with an initial capital letter if they are the start ;; symbol of a regular language, otherwise with an initial lowercase letter. ;; Literal strings are quoted. ;; ;; Within the expression on the right-hand side of a rule, the following ;; expressions are used to match strings of one or more characters: ;; ;; #xN ;; ;; where N is a hexadecimal integer, the expression matches the character ;; whose number (code point) in ISO/IEC 10646 is N. The number of leading ;; zeros in the #xN form is insignificant. ;; ;; [a-zA-Z], [#xN-#xN] ;; ;; matches any Char with a value in the range(s) indicated (inclusive). ;; ;; [abc], [#xN#xN#xN] ;; ;; matches any Char with a value among the characters enumerated. ;; Enumerations and ranges can be mixed in one set of brackets. ;; ;; [^a-z], [^#xN-#xN] ;; ;; matches any Char with a value outside the range indicated. ;; ;; [^abc], [^#xN#xN#xN] ;; ;; matches any Char with a value not among the characters given. ;; Enumerations and ranges of forbidden values can be mixed in one set of ;; brackets. ;; ;; "string" ;; ;; matches a literal string matching that given inside the double quotes. ;; ;; 'string' ;; ;; matches a literal string matching that given inside the single quotes. ;; ;; These symbols may be combined to match more complex patterns as follows, ;; where A and B represent simple expressions: ;; ;; (expression) ;; ;; expression is treated as a unit and may be combined as described in this ;; list. ;; ;; A? ;; ;; matches A or nothing; optional A. ;; ;; A B ;; ;; matches A followed by B. This operator has higher precedence than ;; alternation; thus A B | C D is identical to (A B) | (C D). ;; ;; A | B ;; ;; matches A or B. ;; ;; A - B ;; ;; matches any string that matches A but does not match B. ;; ;; A+ ;; ;; matches one or more occurrences of A. Concatenation has higher ;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+). ;; ;; A* ;; ;; matches zero or more occurrences of A. Concatenation has higher ;; precedence than alternation; thus A* | B* is identical to (A*) | (B*). ;; ;; Other notations used in the productions are: ;; ;; /* ... */ ;; ;; comment. ;; ;; [ wfc: ... ] ;; ;; well-formedness constraint; this identifies by name a constraint on ;; well-formed documents associated with a production. ;; ;; [ vc: ... ] ;; ;; validity constraint; this identifies by name a constraint on valid ;; documents associated with a production. ;; ;; ;; Differences Between EBNFX And ebnf2ps EBNFX ;; ------------------------------------------- ;; ;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX accepts also the ;; underscore (_) and minus (-) for rule name and european 8-bit accentuated ;; characters (from \240 to \377) for rule name, string and comment. Also ;; rule name can start with upper case letter. ;; ;; ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Code: (require 'ebnf-otz) (defvar ebnf-ebx-lex nil "Value returned by `ebnf-ebx-lex' function.") ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Syntactic analyzer ;;; rulelist ::= rule+ (defun ebnf-ebx-parser (start) "EBNFX parser." (let ((total (+ (- ebnf-limit start) 1)) (bias (1- start)) (origin (point)) rule-list token rule) (goto-char start) (setq token (ebnf-ebx-lex)) (and (eq token 'end-of-input) (error "Invalid EBNFX file format")) (and (eq token 'end-of-rule) (setq token (ebnf-ebx-lex))) (while (not (eq token 'end-of-input)) (ebnf-message-float "Parsing...%s%%" (/ (* (- (point) bias) 100.0) total)) (setq token (ebnf-ebx-rule token) rule (cdr token) token (car token)) (or (ebnf-add-empty-rule-list rule) (setq rule-list (cons rule rule-list)))) (goto-char origin) rule-list)) ;;; rule ::= symbol '::=' expression (defun ebnf-ebx-rule (token) (let ((name ebnf-ebx-lex) (action ebnf-action) elements) (setq ebnf-action nil) (or (eq token 'non-terminal) (error "Invalid rule name")) (setq token (ebnf-ebx-lex)) (or (eq token 'production) (error "Invalid rule: missing `::='")) (setq elements (ebnf-ebx-expression)) (or (memq (car elements) '(end-of-rule end-of-input)) (error "Invalid rule: there is no end of rule")) (setq elements (cdr elements)) (ebnf-eps-add-production name) (cons (ebnf-ebx-lex) (ebnf-make-production name elements action)))) ;; expression ::= concatenation ('|' concatenation)* (defun ebnf-ebx-expression () (let (body concatenation) (while (eq (car (setq concatenation (ebnf-ebx-concatenation (ebnf-ebx-lex)))) 'alternative) (setq body (cons (cdr concatenation) body))) (ebnf-token-alternative body concatenation))) ;; concatenation ::= exception* (defun ebnf-ebx-concatenation (token) (let ((term (ebnf-ebx-exception token)) seq) (or (setq token (car term) term (cdr term)) (error "Empty element")) (setq seq (cons term seq)) (while (setq term (ebnf-ebx-exception token) token (car term) term (cdr term)) (setq seq (cons term seq))) (cons token (ebnf-token-sequence seq)))) ;;; exception ::= term ('-' term)? (defun ebnf-ebx-exception (token) (let ((term (ebnf-ebx-term token))) (if (eq (car term) 'exception) (let ((except (ebnf-ebx-term (ebnf-ebx-lex)))) (cons (car except) (ebnf-make-except (cdr term) (cdr except)))) term))) ;;; term ::= factor ('*' | '+' | '?')? (defun ebnf-ebx-term (token) (let ((factor (ebnf-ebx-factor token))) (when factor (setq token (ebnf-ebx-lex)) (cond ((eq token 'zero-or-more) (setq factor (ebnf-make-zero-or-more factor) token (ebnf-ebx-lex))) ((eq token 'one-or-more) (setq factor (ebnf-make-one-or-more factor) token (ebnf-ebx-lex))) ((eq token 'optional) (setq factor (ebnf-token-optional factor) token (ebnf-ebx-lex))))) (cons token factor))) ;;; factor ::= hex-char+ ;;; | '[' '^'? ( char ( '-' char )? )+ ']' ;;; | '"' 'string' '"' ;;; | "'" "string" "'" ;;; | '(' expression ')' ;;; | symbol ;;; ;;; symbol ::= 'upper or lower case letter' ;;; ('upper or lower case letter' | '-' | '_')* ;;; /* upper and lower 8-bit accentuated characters are included */ ;;; ;;; hex-char ::= '#x' [0-9A-Fa-f]+ ;;; ;;; char ::= hex-char | 'any character except control characters' ;;; /* 8-bit accentuated characters are included */ ;;; ;;; any-char ::= char | 'newline' | 'tab' (defun ebnf-ebx-factor (token) (cond ;; terminal ((eq token 'terminal) (ebnf-make-terminal ebnf-ebx-lex)) ;; non-terminal ((eq token 'non-terminal) (ebnf-make-non-terminal ebnf-ebx-lex)) ;; group ((eq token 'begin-group) (let ((body (ebnf-ebx-expression))) (or (eq (car body) 'end-group) (error "Missing `)'")) (cdr body))) ;; no element (t nil) )) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Lexical analyzer (defconst ebnf-ebx-token-table (make-vector 256 'error) "Vector used to map characters to a lexical token.") (defun ebnf-ebx-initialize () "Initialize EBNFX token table." ;; control character & control 8-bit character are set to `error' (let ((char ?\101)) ;; printable character: A-Z (while (< char ?\133) (aset ebnf-ebx-token-table char 'non-terminal) (setq char (1+ char))) ;; printable character: a-z (setq char ?\141) (while (< char ?\173) (aset ebnf-ebx-token-table char 'non-terminal) (setq char (1+ char))) ;; European 8-bit accentuated characters: (setq char ?\240) (while (< char ?\400) (aset ebnf-ebx-token-table char 'non-terminal) (setq char (1+ char))) ;; Override end of line characters: (aset ebnf-ebx-token-table ?\n 'end-of-rule) ; [NL] linefeed (aset ebnf-ebx-token-table ?\r 'end-of-rule) ; [CR] carriage return ;; Override space characters: (aset ebnf-ebx-token-table ?\013 'space) ; [VT] vertical tab (aset ebnf-ebx-token-table ?\t 'space) ; [HT] horizontal tab (aset ebnf-ebx-token-table ?\ 'space) ; [SP] space ;; Override form feed character: (aset ebnf-ebx-token-table ?\f 'form-feed) ; [FF] form feed ;; Override other lexical characters: (aset ebnf-ebx-token-table ?# 'hash) (aset ebnf-ebx-token-table ?\" 'double-quote) (aset ebnf-ebx-token-table ?\' 'single-quote) (aset ebnf-ebx-token-table ?\( 'begin-group) (aset ebnf-ebx-token-table ?\) 'end-group) (aset ebnf-ebx-token-table ?- 'exception) (aset ebnf-ebx-token-table ?: 'colon) (aset ebnf-ebx-token-table ?\[ 'begin-square) (aset ebnf-ebx-token-table ?| 'alternative) (aset ebnf-ebx-token-table ?* 'zero-or-more) (aset ebnf-ebx-token-table ?+ 'one-or-more) (aset ebnf-ebx-token-table ?\? 'optional) ;; Override comment character: (aset ebnf-ebx-token-table ?/ 'comment))) ;; replace the range "\240-\377" (see `ebnf-range-regexp'). (defconst ebnf-ebx-non-terminal-chars (ebnf-range-regexp "-_A-Za-z" ?\240 ?\377)) (defconst ebnf-ebx-non-terminal-letter-chars (ebnf-range-regexp "A-Za-z" ?\240 ?\377)) (defun ebnf-ebx-lex () "Lexical analyzer for EBNFX. Return a lexical token. See documentation for variable `ebnf-ebx-lex'." (if (>= (point) ebnf-limit) 'end-of-input (let (token) ;; skip spaces and comments (while (if (> (following-char) 255) (progn (setq token 'error) nil) (setq token (aref ebnf-ebx-token-table (following-char))) (cond ((eq token 'space) (skip-chars-forward " \013\t" ebnf-limit) (< (point) ebnf-limit)) ((eq token 'comment) (ebnf-ebx-skip-comment)) ((eq token 'form-feed) (forward-char) (setq ebnf-action 'form-feed)) ((eq token 'end-of-rule) (ebnf-ebx-skip-end-of-rule)) ((and (eq token 'begin-square) (let ((case-fold-search t)) (looking-at "\\[\\(wfc\\|vc\\):"))) (ebnf-ebx-skip-constraint)) (t nil) ))) (cond ;; end of input ((>= (point) ebnf-limit) 'end-of-input) ;; error ((eq token 'error) (error "Invalid character")) ;; end of rule ((eq token 'end-of-rule) 'end-of-rule) ;; terminal: #x [0-9A-Fa-f]+ ((eq token 'hash) (setq ebnf-ebx-lex (ebnf-ebx-character)) 'terminal) ;; terminal: "string" ((eq token 'double-quote) (setq ebnf-ebx-lex (ebnf-ebx-string ?\")) 'terminal) ;; terminal: 'string' ((eq token 'single-quote) (setq ebnf-ebx-lex (ebnf-ebx-string ?\')) 'terminal) ;; terminal: [ ^? ( char ( - char )? )+ ] ((eq token 'begin-square) (setq ebnf-ebx-lex (ebnf-ebx-range)) 'terminal) ;; non-terminal: NAME ((eq token 'non-terminal) (setq ebnf-ebx-lex (ebnf-buffer-substring ebnf-ebx-non-terminal-chars)) 'non-terminal) ;; colon: ::= ((eq token 'colon) (or (looking-at "::=") (error "Missing `::=' token")) (forward-char 3) 'production) ;; miscellaneous: (, ), *, +, ?, |, - (t (forward-char) token) )))) ;; replace the range "\177-\237" (see `ebnf-range-regexp'). (defconst ebnf-ebx-constraint-chars (ebnf-range-regexp "^\000-\010\016-\037]" ?\177 ?\237)) (defun ebnf-ebx-skip-constraint () (or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit) 0) (error "Invalid character")) (or (= (following-char) ?\]) (error "Missing end of constraint `]'")) (forward-char) t) (defun ebnf-ebx-skip-end-of-rule () (let (eor-p) (while (progn ;; end of rule ==> 2 or more consecutive end of lines (setq eor-p (or (> (skip-chars-forward "\r\n" ebnf-limit) 1) eor-p)) ;; skip spaces (skip-chars-forward " \013\t" ebnf-limit) ;; skip comments (and (= (following-char) ?/) (ebnf-ebx-skip-comment)))) (not eor-p))) ;; replace the range "\177-\237" (see `ebnf-range-regexp'). (defconst ebnf-ebx-comment-chars (ebnf-range-regexp "^\000-\010\016-\037\\*" ?\177 ?\237)) (defconst ebnf-ebx-filename-chars (ebnf-range-regexp "^\000-\037\\*" ?\177 ?\237)) (defun ebnf-ebx-skip-comment () (forward-char) (or (= (following-char) ?*) (error "Invalid beginning of comment")) (forward-char) (cond ;; open EPS file ((and ebnf-eps-executing (= (following-char) ?\[)) (ebnf-eps-add-context (ebnf-ebx-eps-filename))) ;; close EPS file ((and ebnf-eps-executing (= (following-char) ?\])) (ebnf-eps-remove-context (ebnf-ebx-eps-filename))) ;; any other action in comment (t (setq ebnf-action (aref ebnf-comment-table (following-char)))) ) (while (progn (skip-chars-forward ebnf-ebx-comment-chars ebnf-limit) (or (= (following-char) ?*) (error "Missing end of comment")) (forward-char) (and (/= (following-char) ?/) (< (point) ebnf-limit)))) ;; check for a valid end of comment (and (>= (point) ebnf-limit) (error "Missing end of comment")) (forward-char) t) (defun ebnf-ebx-eps-filename () (forward-char) (let (fname nchar) (while (progn (setq fname (concat fname (ebnf-buffer-substring ebnf-ebx-filename-chars))) (and (< (point) ebnf-limit) (> (setq nchar (skip-chars-forward "*" ebnf-limit)) 0) (< (point) ebnf-limit) (/= (following-char) ?/))) (setq fname (concat fname (make-string nchar ?*)) nchar nil)) (if (or (not nchar) (= nchar 0)) fname (and (< (point) ebnf-limit) (= (following-char) ?/) (setq nchar (1- nchar))) (concat fname (make-string nchar ?*))))) ;; replace the range "\240-\377" (see `ebnf-range-regexp'). (defconst ebnf-ebx-double-string-chars (ebnf-range-regexp "\t -!#-~" ?\240 ?\377)) (defconst ebnf-ebx-single-string-chars (ebnf-range-regexp "\t -&(-~" ?\240 ?\377)) (defun ebnf-ebx-string (delim) (buffer-substring-no-properties (progn (forward-char) (point)) (progn (skip-chars-forward (if (= delim ?\") ebnf-ebx-double-string-chars ebnf-ebx-single-string-chars) ebnf-limit) (or (= (following-char) delim) (error "Missing string delimiter `%c'" delim)) (prog1 (point) (forward-char))))) (defun ebnf-ebx-character () ;; #x [0-9A-Fa-f]+ (buffer-substring-no-properties (point) (progn (ebnf-ebx-hex-character) (point)))) (defun ebnf-ebx-range () ;; [ ^? ( char ( - char )? )+ ] (buffer-substring-no-properties (point) (progn (forward-char) (and (= (following-char) ?^) (forward-char)) (and (= (following-char) ?-) (forward-char)) (while (progn (ebnf-ebx-any-character) (when (= (following-char) ?-) (forward-char) (ebnf-ebx-any-character)) (and (/= (following-char) ?\]) (< (point) ebnf-limit)))) (and (>= (point) ebnf-limit) (error "Missing end of character range `]'")) (forward-char) (point)))) (defun ebnf-ebx-any-character () (let ((char (following-char))) (cond ((= char ?#) (ebnf-ebx-hex-character t)) ((or (and (<= ?\ char) (<= char ?\")) ; # (and (<= ?$ char) (<= char ?,)) ; - (and (<= ?. char) (<= char ?\\)) ; ] (and (<= ?^ char) (<= char ?~)) (and (<= ?\240 char) (<= char ?\377))) (forward-char)) (t (error "Invalid character `%c'" char))))) (defun ebnf-ebx-hex-character (&optional no-error) ;; #x [0-9A-Fa-f]+ (forward-char) (if (/= (following-char) ?x) (or no-error (error "Invalid hexadecimal character")) (forward-char) (or (> (skip-chars-forward "0-9A-Fa-f" ebnf-limit) 0) (error "Invalid hexadecimal character")))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (provide 'ebnf-ebx) ;;; arch-tag: bfe2f95b-66bc-4dc6-8b7e-b7831e68f5fb ;;; ebnf-ebx.el ends here