From a158ff811faadd2891b3db1f9b204388ac640c60 Mon Sep 17 00:00:00 2001 From: Juanma Barranquero Date: Sun, 16 Mar 2003 10:48:34 +0000 Subject: [PATCH] (xml-ucs-to-string): New function to convert Unicode codepoints to strings. Uses decode-char (mule.el) if available. (xml-parse-tag, xml-parse-attlist, xml-skip-dtd, xml-parse-dtd, xml-parse-elem-type): Use ' \t\n\r' instead of '[:space:]'. (xml-parse-attlist): Added attribute normalization. (xml-parse-tag): Replace "\r\n" and "\r" with "\n". --- lisp/ChangeLog | 9 ++++ lisp/xml.el | 122 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 93 insertions(+), 38 deletions(-) diff --git a/lisp/ChangeLog b/lisp/ChangeLog index 91dafdfbcd4..895ef1bd5bb 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog @@ -1,3 +1,12 @@ +2003-03-15 Mark A. Hershberger + + * xml.el (xml-ucs-to-string): New function to convert Unicode + codepoints to strings. Uses decode-char (mule.el) if available. + (xml-parse-tag, xml-parse-attlist, xml-skip-dtd, xml-parse-dtd) + (xml-parse-elem-type): Use ' \t\n\r' instead of '[:space:]'. + (xml-parse-attlist): Added attribute normalization. + (xml-parse-tag): Replace "\r\n" and "\r" with "\n". + 2003-03-14 John Paul Wallington * files.el (recover-session): Error if there are no previous diff --git a/lisp/xml.el b/lisp/xml.el index d6a0bc74b45..fc6365b50a0 100644 --- a/lisp/xml.el +++ b/lisp/xml.el @@ -184,7 +184,7 @@ Returns one of: ;; beginning of a document) ((looking-at "<\\?") (search-forward "?>" end) - (goto-char (- (re-search-forward "[^[:space:]]") 1)) + (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) (xml-parse-tag end)) ;; Character data (CDATA) sections, in which no tag should be interpreted ((looking-at "[:space:]]+\\)") + ((looking-at "<\\([^/> \t\n\r]+\\)") (goto-char (match-end 1)) (let* ((case-fold-search nil) ;; XML is case-sensitive. (node-name (match-string 1)) @@ -219,7 +219,7 @@ Returns one of: pos) ;; is this an empty element ? - (if (looking-at "/[[:space:]]*>") + (if (looking-at "/[ \t\n\r]*>") (progn (forward-char 2) (nreverse (cons '("") children))) @@ -230,7 +230,7 @@ Returns one of: (forward-char 1) ;; Now check that we have the right end-tag. Note that this ;; one might contain spaces after the tag name - (while (not (looking-at (concat ""))) + (while (not (looking-at (concat ""))) (cond ((looking-at " (point) end) - (error "XML: end of attribute list not found before end of region")) - ) + (error "XML: end of attribute list not found before end of region"))) (nreverse attlist))) ;;******************************************************************* @@ -318,15 +336,15 @@ The DTD must end before the position END in the current buffer. The point must be just before the starting tag of the DTD. This follows the rule [28] in the XML specifications." (forward-char (length "") + (if (looking-at "[ \t\n\r]*>") (error "XML: invalid DTD (excepting name of the document)")) (condition-case nil (progn - (forward-word 1) ;; name of the document - (goto-char (- (re-search-forward "[[:space:]]") 1)) - (goto-char (- (re-search-forward "[^[:space:]]") 1)) + (forward-word 1) + (goto-char (- (re-search-forward "[ \t\n\r]") 1)) + (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) (if (looking-at "\\[") - (re-search-forward "\\][[:space:]]*>" end) + (re-search-forward "\\][ \t\n\r]*>" end) (search-forward ">" end))) (error (error "XML: No end to the DTD")))) @@ -334,7 +352,7 @@ This follows the rule [28] in the XML specifications." "Parse the DTD that point is looking at. The DTD must end before the position END in the current buffer." (forward-char (length "") (error "XML: invalid DTD (excepting name of the document)")) @@ -344,24 +362,24 @@ The DTD must end before the position END in the current buffer." type element end-pos) (goto-char (match-end 0)) - (goto-char (- (re-search-forward "[^[:space:]]") 1)) + (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) - ;; External DTDs => don't know how to handle them yet + ;; External DTDs => don't know how to handle them yet (if (looking-at "SYSTEM") (error "XML: Don't know how to handle external DTDs")) (if (not (= (char-after) ?\[)) (error "XML: Unknown declaration in the DTD")) - ;; Parse the rest of the DTD + ;; Parse the rest of the DTD (forward-char 1) - (while (and (not (looking-at "[[:space:]]*\\]")) + (while (and (not (looking-at "[ \t\n\r]*\\]")) (<= (point) end)) (cond ;; Translation of rule [45] of XML specifications ((looking-at - "[[:space:]]*]+\\)>") + "[ \t\n\r]*]+\\)>") (setq element (intern (match-string-no-properties 1)) type (match-string-no-properties 2)) @@ -369,13 +387,13 @@ The DTD must end before the position END in the current buffer." ;; Translation of rule [46] of XML specifications (cond - ((string-match "^EMPTY[[:space:]]*$" type) ;; empty declaration + ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration (setq type 'empty)) - ((string-match "^ANY[[:space:]]*$" type) ;; any type of contents + ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents (setq type 'any)) - ((string-match "^(\\(.*\\))[[:space:]]*$" type) ;; children ([47]) + ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47]) (setq type (xml-parse-elem-type (match-string-no-properties 1 type)))) - ((string-match "^%[^;]+;[[:space:]]*$" type) ;; substitution + ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution nil) (t (error "XML: Invalid element type in the DTD"))) @@ -417,8 +435,8 @@ The DTD must end before the position END in the current buffer." (mapcar 'xml-parse-elem-type (split-string elem ",")))) ))) - (if (string-match "[[:space:]]*\\([^+*?]+\\)\\([+*?]?\\)" string) - (setq elem (match-string 1 string) + (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string) + (setq elem (match-string 1 string) modifier (match-string 2 string)))) (if (and (stringp elem) (string= elem "#PCDATA")) @@ -434,6 +452,22 @@ The DTD must end before the position END in the current buffer." (t elem)))) +;;******************************************************************* +;;** +;;** Converting code points to strings +;;** +;;******************************************************************* + +(defun xml-ucs-to-string (codepoint) + "Return a string representation of CODEPOINT. If it can't be +converted, return '?'." + (cond ((boundp 'decode-char) + (char-to-string (decode-char 'ucs codepoint))) + ((and (< codepoint 128) + (> codepoint 31)) + (char-to-string codepoint)) + (t "?"))) ; FIXME: There's gotta be a better way to + ; designate an unknown character. ;;******************************************************************* ;;** @@ -451,6 +485,18 @@ The DTD must end before the position END in the current buffer." (setq string (replace-match "'" t nil string))) (while (string-match """ string) (setq string (replace-match "\"" t nil string))) + (while (string-match "&#\\([0-9]+\\);" string) + (setq string (replace-match (xml-ucs-to-string + (string-to-number + (match-string-no-properties 1 string))) + t nil string))) + (while (string-match "&#x\\([0-9a-fA-F]+\\);" string) + (setq string (replace-match (xml-ucs-to-string + (string-to-number + (match-string-no-properties 1 string) + 16)) + t nil string))) + ;; This goes last so it doesn't confuse the matches above. (while (string-match "&" string) (setq string (replace-match "&" t nil string)))