1
0
mirror of https://git.savannah.gnu.org/git/emacs.git synced 2024-11-28 07:45:00 +00:00

Improve RFC 3986 conformance of url package.

Fix 2012-04-10 change to url.el.

* url-http.el (url-http-create-request): Ignore obsolete
attributes slot of url-object.

* url-parse.el: Improve RFC 3986 conformance.
(url-generic-parse-url): Do not populate the ATTRIBUTES slot,
since this is not reliable for general RFC 3986 URIs.  Keep the
whole path and query inside the FILENAME slot.  Improve docstring.
(url-recreate-url-attributes): Mark as obsolete.
(url-recreate-url): Handle missing scheme and userinfo.

* url-util.el (url-encode-url): New function for URL quoting.
(url-encoding-table, url-host-allowed-chars)
(url-path-allowed-chars): New constants.
(url--allowed-chars): New helper function.
(url-hexify-string): Use them.

* url-vars.el (url-nonrelative-link): Make the regexp stricter.

* url.el (url-retrieve-internal): Use url-encode-url.

Fixes: debbugs:7017
This commit is contained in:
Chong Yidong 2012-05-09 16:33:48 +08:00
parent 66b03a53a3
commit ce7b18ec41
7 changed files with 269 additions and 123 deletions

View File

@ -202,6 +202,18 @@ server properties.
*** New command `tabulated-list-sort', bound to `S', sorts the column *** New command `tabulated-list-sort', bound to `S', sorts the column
at point, or the Nth column if a numeric prefix argument is given. at point, or the Nth column if a numeric prefix argument is given.
** URL
*** Structs made by `url-generic-parse-url' have nil `attributes' slot.
Previously, this slot stored semicolon-separated attribute-value pairs
appended to some imap URLs, but this is not compatible with RFC 3986.
So now the `filename' slot stores the entire path and query components
and the `attributes' slot is always nil.
*** New function `url-encode-url' for encoding a URI string.
The `url-retrieve' function now uses this to encode its URL argument,
in case that is not properly encoded.
** Obsolete packages: ** Obsolete packages:
*** assoc.el *** assoc.el

View File

@ -1,3 +1,25 @@
2012-05-09 Chong Yidong <cyd@gnu.org>
* url-util.el (url-encode-url): New function for URL quoting.
(url-encoding-table, url-host-allowed-chars)
(url-path-allowed-chars): New constants.
(url--allowed-chars): New helper function.
(url-hexify-string): Use them.
* url-parse.el: Improve RFC 3986 conformance.
(url-generic-parse-url): Do not populate the ATTRIBUTES slot,
since this is not reliable for general RFC 3986 URIs. Keep the
whole path and query inside the FILENAME slot. Improve docstring.
(url-recreate-url-attributes): Mark as obsolete.
(url-recreate-url): Handle missing scheme and userinfo.
* url-http.el (url-http-create-request): Ignore obsolete
attributes slot of url-object.
* url-vars.el (url-nonrelative-link): Make the regexp stricter.
* url.el (url-retrieve-internal): Use url-encode-url (Bug#7017).
2012-04-26 Stefan Monnier <monnier@iro.umontreal.ca> 2012-04-26 Stefan Monnier <monnier@iro.umontreal.ca>
* url.el (url-retrieve-synchronously): Replace lexical-let by * url.el (url-retrieve-synchronously): Replace lexical-let by

View File

@ -223,8 +223,7 @@ request.")
(let ((url-basic-auth-storage (let ((url-basic-auth-storage
'url-http-proxy-basic-auth-storage)) 'url-http-proxy-basic-auth-storage))
(url-get-authentication url-http-target-url nil 'any nil)))) (url-get-authentication url-http-target-url nil 'any nil))))
(real-fname (concat (url-filename url-http-target-url) (real-fname (url-filename url-http-target-url))
(url-recreate-url-attributes url-http-target-url)))
(host (url-host url-http-target-url)) (host (url-host url-http-target-url))
(auth (if (cdr-safe (assoc "Authorization" url-http-extra-headers)) (auth (if (cdr-safe (assoc "Authorization" url-http-extra-headers))
nil nil

View File

@ -48,21 +48,31 @@
;;;###autoload ;;;###autoload
(defun url-recreate-url (urlobj) (defun url-recreate-url (urlobj)
"Recreate a URL string from the parsed URLOBJ." "Recreate a URL string from the parsed URLOBJ."
(concat (url-type urlobj) ":" (if (url-host urlobj) "//" "") (let ((type (url-type urlobj))
(if (url-user urlobj) (user (url-user urlobj))
(concat (url-user urlobj) (pass (url-password urlobj))
(if (url-password urlobj) (host (url-host urlobj))
(concat ":" (url-password urlobj))) (port (url-portspec urlobj))
"@")) (file (url-filename urlobj))
(url-host urlobj) (frag (url-target urlobj)))
(if (and (url-port urlobj) (concat (if type (concat type ":"))
(not (equal (url-port urlobj) (if (url-fullness urlobj) "//")
(url-scheme-get-property (url-type urlobj) 'default-port)))) (if (or user pass)
(format ":%d" (url-port urlobj))) (concat user
(or (url-filename urlobj) "/") (if pass (concat ":" pass))
(url-recreate-url-attributes urlobj) "@"))
(if (url-target urlobj) host
(concat "#" (url-target urlobj))))) ;; RFC 3986: "omit the port component and its : delimiter
;; if port is empty or if its value would be the same as
;; that of the scheme's default."
(and port
(or (null type)
(not (equal port
(url-scheme-get-property type
'default-port))))
(format ":%d" (url-port urlobj)))
(or file "/")
(if frag (concat "#" frag)))))
(defun url-recreate-url-attributes (urlobj) (defun url-recreate-url-attributes (urlobj)
"Recreate the attributes of an URL string from the parsed URLOBJ." "Recreate the attributes of an URL string from the parsed URLOBJ."
@ -73,107 +83,129 @@
(concat (car x) "=" (cdr x)) (concat (car x) "=" (cdr x))
(car x))) (car x)))
(url-attributes urlobj) ";")))) (url-attributes urlobj) ";"))))
(make-obsolete 'url-recreate-url-attributes nil "24.2")
;;;###autoload ;;;###autoload
(defun url-generic-parse-url (url) (defun url-generic-parse-url (url)
"Return an URL-struct of the parts of URL. "Return an URL-struct of the parts of URL.
The CL-style struct contains the following fields: The CL-style struct contains the following fields:
TYPE USER PASSWORD HOST PORTSPEC FILENAME TARGET ATTRIBUTES FULLNESS."
;; See RFC 3986. TYPE is the URI scheme (string or nil).
(cond USER is the user name (string or nil).
((null url) PASSWORD is the password (string [deprecated] or nil).
(url-parse-make-urlobj)) HOST is the host (a registered name, IP literal in square
((or (not (string-match url-nonrelative-link url)) brackets, or IPv4 address in dotted-decimal form).
(= ?/ (string-to-char url))) PORTSPEC is the specified port (a number), or nil.
;; This isn't correct, as a relative URL can be a fragment link FILENAME is the path AND the query component of the URI.
;; (e.g. "#foo") and many other things (see section 4.2). TARGET is the fragment identifier component (used to refer to a
;; However, let's not fix something that isn't broken, especially subordinate resource, e.g. a part of a webpage).
;; when close to a release. ATTRIBUTES is nil; this slot originally stored the attribute and
(url-parse-make-urlobj nil nil nil nil nil url)) value alists for IMAP URIs, but this feature was removed
(t since it conflicts with RFC 3986.
FULLNESS is non-nil iff the authority component of the URI is
present.
The parser follows RFC 3986, except that it also tries to handle
URIs that are not fully specified (e.g. lacking TYPE), and it
does not check for or perform %-encoding.
Here is an example. The URL
foo://bob:pass@example.com:42/a/b/c.dtb?type=animal&name=narwhal#nose
parses to
TYPE = \"foo\"
USER = \"bob\"
PASSWORD = \"pass\"
HOST = \"example.com\"
PORTSPEC = 42
FILENAME = \"/a/b/c.dtb?type=animal&name=narwhal\"
TARGET = \"nose\"
ATTRIBUTES = nil
FULLNESS = t"
(if (null url)
(url-parse-make-urlobj)
(with-temp-buffer (with-temp-buffer
;; Don't let those temp-buffer modifications accidentally ;; Don't let those temp-buffer modifications accidentally
;; deactivate the mark of the current-buffer. ;; deactivate the mark of the current-buffer.
(let ((deactivate-mark nil)) (let ((deactivate-mark nil))
(set-syntax-table url-parse-syntax-table) (set-syntax-table url-parse-syntax-table)
(let ((save-pos nil) (erase-buffer)
(prot nil) (insert url)
(user nil) (goto-char (point-min))
(pass nil) (let ((save-pos (point))
(host nil) scheme user pass host port file fragment full
(port nil)
(file nil)
(refs nil)
(attr nil)
(full nil)
(inhibit-read-only t)) (inhibit-read-only t))
(erase-buffer)
(insert url)
(goto-char (point-min))
(setq save-pos (point))
;; 3.1. Scheme ;; 3.1. Scheme
(unless (looking-at "//") ;; This is nil for a URI that is not fully specified.
(skip-chars-forward "a-zA-Z+.\\-") (when (looking-at "\\([a-zA-Z][-a-zA-Z0-9+.]*\\):")
(downcase-region save-pos (point)) (goto-char (match-end 0))
(setq prot (buffer-substring save-pos (point))) (setq save-pos (point))
(skip-chars-forward ":") (setq scheme (downcase (match-string 1))))
(setq save-pos (point)))
;; 3.2. Authority ;; 3.2. Authority
(when (looking-at "//") (when (looking-at "//")
(setq full t) (setq full t)
(forward-char 2) (forward-char 2)
(setq save-pos (point)) (setq save-pos (point))
(skip-chars-forward "^/") (skip-chars-forward "^/?#")
(setq host (buffer-substring save-pos (point))) (setq host (buffer-substring save-pos (point)))
;; 3.2.1 User Information
(if (string-match "^\\([^@]+\\)@" host) (if (string-match "^\\([^@]+\\)@" host)
(setq user (match-string 1 host) (setq user (match-string 1 host)
host (substring host (match-end 0) nil))) host (substring host (match-end 0))))
(if (and user (string-match "\\([^:]+\\):\\(.*\\)" user)) (if (and user (string-match "\\`\\([^:]*\\):\\(.*\\)" user))
(setq pass (match-string 2 user) (setq pass (match-string 2 user)
user (match-string 1 user))) user (match-string 1 user)))
;; This gives wrong results for IPv6 literal addresses. (cond
(if (string-match ":\\([0-9+]+\\)" host) ;; IPv6 literal address.
(setq port (string-to-number (match-string 1 host)) ((string-match "^\\(\\[[^]]+\\]\\)\\(?::\\([0-9]*\\)\\)?$" host)
host (substring host 0 (match-beginning 0)))) (setq port (match-string 2 host)
(if (string-match ":$" host) host (match-string 1 host)))
(setq host (substring host 0 (match-beginning 0)))) ;; Registered name or IPv4 address.
(setq host (downcase host) ((string-match ":\\([0-9]*\\)$" host)
save-pos (point))) (setq port (match-string 1 host)
host (substring host 0 (match-beginning 0)))))
(cond ((equal port "")
(setq port nil))
(port
(setq port (string-to-number port))))
(setq host (downcase host)))
(if (not port) (and (null port)
(setq port (url-scheme-get-property prot 'default-port))) scheme
(setq port (url-scheme-get-property scheme 'default-port)))
;; Now point is on the / ? or # which terminates the
;; authority, or at the end of the URI, or (if there is no
;; authority) at the beginning of the absolute path.
;; 3.3. Path
;; Gross hack to preserve ';' in data URLs
(setq save-pos (point)) (setq save-pos (point))
(if (string= "data" scheme)
;; For the "data" URI scheme, all the rest is the FILE.
(setq file (buffer-substring save-pos (point-max)))
;; For hysterical raisins, our data structure returns the
;; path and query components together in one slot.
;; 3.3. Path
(skip-chars-forward "^?#")
;; 3.4. Query
(when (looking-at "?")
(skip-chars-forward "^#"))
(setq file (buffer-substring save-pos (point)))
;; 3.5 Fragment
(when (looking-at "#")
(let ((opoint (point)))
(forward-char 1)
(unless (eobp)
(setq fragment (buffer-substring (point) (point-max))))
(delete-region opoint (point-max)))))
;; 3.4. Query
(if (string= "data" prot)
(goto-char (point-max))
;; Now check for references
(skip-chars-forward "^#")
(if (eobp)
nil
(delete-region
(point)
(progn
(skip-chars-forward "#")
(setq refs (buffer-substring (point) (point-max)))
(point-max))))
(goto-char save-pos)
(skip-chars-forward "^;")
(unless (eobp)
(setq attr (url-parse-args (buffer-substring (point) (point-max))
t)
attr (nreverse attr))))
(setq file (buffer-substring save-pos (point)))
(if (and host (string-match "%[0-9][0-9]" host)) (if (and host (string-match "%[0-9][0-9]" host))
(setq host (url-unhex-string host))) (setq host (url-unhex-string host)))
(url-parse-make-urlobj (url-parse-make-urlobj scheme user pass host port file
prot user pass host port file refs attr full))))))) fragment nil full))))))
(defmacro url-bit-for-url (method lookfor url) (defmacro url-bit-for-url (method lookfor url)
`(let* ((urlobj (url-generic-parse-url url)) `(let* ((urlobj (url-generic-parse-url url))

View File

@ -333,40 +333,117 @@ forbidden in URL encoding."
(concat tmp str))) (concat tmp str)))
(defconst url-unreserved-chars (defconst url-unreserved-chars
'( '(?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z
?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z
?A ?B ?C ?D ?E ?F ?G ?H ?I ?J ?K ?L ?M ?N ?O ?P ?Q ?R ?S ?T ?U ?V ?W ?X ?Y ?Z ?A ?B ?C ?D ?E ?F ?G ?H ?I ?J ?K ?L ?M ?N ?O ?P ?Q ?R ?S ?T ?U ?V ?W ?X ?Y ?Z
?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9
?- ?_ ?. ?! ?~ ?* ?' ?\( ?\)) ?- ?_ ?. ?~)
"A list of characters that are _NOT_ reserved in the URL spec. "List of characters that are unreserved in the URL spec.
This is taken from RFC 2396.") This is taken from RFC 3986 (section 2.3).")
(defconst url-encoding-table
(let ((vec (make-vector 256 nil)))
(dotimes (byte 256)
(aset vec byte (format "%%%02x" byte)))
vec)
"Vector translating bytes to URI-encoded %-sequences.")
(defun url--allowed-chars (char-list)
"Return an \"allowed character\" mask (a 256-slot vector).
The Nth element is non-nil if character N is in CHAR-LIST. The
result can be passed as the second arg to `url-hexify-string'."
(let ((vec (make-vector 256 nil)))
(dolist (byte char-list)
(ignore-errors (aset vec byte t)))
vec))
;;;###autoload ;;;###autoload
(defun url-hexify-string (string) (defun url-hexify-string (string &optional allowed-chars)
"Return a new string that is STRING URI-encoded. "URI-encode STRING and return the result.
First, STRING is converted to utf-8, if necessary. Then, for each If STRING is multibyte, it is first converted to a utf-8 byte
character in the utf-8 string, those found in `url-unreserved-chars' string. Each byte corresponding to an allowed character is left
are left as-is, all others are represented as a three-character as-is, while all other bytes are converted to a three-character
string: \"%\" followed by two lowercase hex digits." string: \"%\" followed by two lowercase hex digits.
;; To go faster and avoid a lot of consing, we could do:
;; The allowed characters are specified by ALLOWED-CHARS. If this
;; (defconst url-hexify-table argument is nil, the list `url-unreserved-chars' determines the
;; (let ((map (make-vector 256 nil))) allowed characters. Otherwise, ALLOWED-CHARS should be a vector
;; (dotimes (byte 256) (aset map byte whose Nth element is non-nil if character N is allowed."
;; (if (memq byte url-unreserved-chars) (unless allowed-chars
;; (char-to-string byte) (setq allowed-chars (url--allowed-chars url-unreserved-chars)))
;; (format "%%%02x" byte))))
;; map))
;;
;; (mapconcat (curry 'aref url-hexify-table) ...)
(mapconcat (lambda (byte) (mapconcat (lambda (byte)
(if (memq byte url-unreserved-chars) (if (aref allowed-chars byte)
(char-to-string byte) (char-to-string byte)
(format "%%%02x" byte))) (aref url-encoding-table byte)))
(if (multibyte-string-p string) (if (multibyte-string-p string)
(encode-coding-string string 'utf-8) (encode-coding-string string 'utf-8)
string) string)
"")) ""))
(defconst url-host-allowed-chars
;; Allow % to avoid re-encoding %-encoded sequences.
(url--allowed-chars (append '(?% ?! ?$ ?& ?' ?\( ?\) ?* ?+ ?, ?\; ?=)
url-unreserved-chars))
"Allowed-character byte mask for the host segment of a URI.
These characters are specified in RFC 3986, Appendix A.")
(defconst url-path-allowed-chars
(let ((vec (copy-sequence url-host-allowed-chars)))
(aset vec ?/ t)
(aset vec ?: t)
(aset vec ?@ t)
vec)
"Allowed-character byte mask for the path segment of a URI.
These characters are specified in RFC 3986, Appendix A.")
(defconst url-query-allowed-chars
(let ((vec (copy-sequence url-path-allowed-chars)))
(aset vec ?? t)
vec)
"Allowed-character byte mask for the query segment of a URI.
These characters are specified in RFC 3986, Appendix A.")
;;;###autoload
(defun url-encode-url (url)
"Return a properly URI-encoded version of URL.
This function also performs URI normalization, e.g. converting
the scheme to lowercase if it is uppercase. Apart from
normalization, if URL is already URI-encoded, this function
should return it unchanged."
(if (multibyte-string-p url)
(setq url (encode-coding-string url 'utf-8)))
(let* ((obj (url-generic-parse-url url))
(user (url-user obj))
(pass (url-password obj))
(host (url-host obj))
(file (url-filename obj))
(frag (url-target obj))
path query)
(if user
(setf (url-user obj) (url-hexify-string user)))
(if pass
(setf (url-password obj) (url-hexify-string pass)))
(when host
;; No special encoding for IPv6 literals.
(unless (string-match "\\`\\[.*\\]\\'" host)
(setf (url-host obj)
(url-hexify-string host url-host-allowed-chars))))
;; Split FILENAME slot into its PATH and QUERY components, and
;; encode them separately. The PATH component can contain
;; unreserved characters, %-encodings, and /:@!$&'()*+,;=
(when file
(if (string-match "\\?" file)
(setq path (substring file 0 (match-beginning 0))
query (substring file (match-end 0)))
(setq path file))
(setq path (url-hexify-string path url-path-allowed-chars))
(if query
(setq query (url-hexify-string query url-query-allowed-chars)))
(setf (url-filename obj)
(if query (concat path "?" query) path)))
(if frag
(setf (url-target obj)
(url-hexify-string frag url-query-allowed-chars)))
(url-recreate-url obj)))
;;;###autoload ;;;###autoload
(defun url-file-extension (fname &optional x) (defun url-file-extension (fname &optional x)

View File

@ -304,8 +304,12 @@ undefined."
:type '(choice (const :tag "None" :value nil) string) :type '(choice (const :tag "None" :value nil) string)
:group 'url) :group 'url)
;; From RFC3986: Scheme names consist of a sequence of characters
;; beginning with a letter and followed by any combination of letters,
;; digits, plus ("+"), period ("."), or hyphen ("-").
(defvar url-nonrelative-link (defvar url-nonrelative-link
"\\`\\([-a-zA-Z0-9+.]+:\\)" "\\`\\([a-zA-Z][-a-zA-Z0-9+.]*:\\)"
"A regular expression that will match an absolute URL.") "A regular expression that will match an absolute URL.")
(defcustom url-max-redirections 30 (defcustom url-max-redirections 30

View File

@ -125,7 +125,9 @@ variable in the original buffer as a forwarding pointer.")
;;;###autoload ;;;###autoload
(defun url-retrieve (url callback &optional cbargs silent inhibit-cookies) (defun url-retrieve (url callback &optional cbargs silent inhibit-cookies)
"Retrieve URL asynchronously and call CALLBACK with CBARGS when finished. "Retrieve URL asynchronously and call CALLBACK with CBARGS when finished.
URL is either a string or a parsed URL. URL is either a string or a parsed URL. If it is a string
containing characters that are not valid in a URI, those
characters are percent-encoded; see `url-encode-url'.
CALLBACK is called when the object has been completely retrieved, with CALLBACK is called when the object has been completely retrieved, with
the current buffer containing the object, and any MIME headers associated the current buffer containing the object, and any MIME headers associated
@ -179,10 +181,8 @@ URL-encoded before it's used."
(url-do-setup) (url-do-setup)
(url-gc-dead-buffers) (url-gc-dead-buffers)
(if (stringp url) (if (stringp url)
(set-text-properties 0 (length url) nil url)) (set-text-properties 0 (length url) nil url))
(when (multibyte-string-p url) (setq url (url-encode-url url))
(let ((url-unreserved-chars (append '(?: ?/) url-unreserved-chars)))
(setq url (url-hexify-string url))))
(if (not (vectorp url)) (if (not (vectorp url))
(setq url (url-generic-parse-url url))) (setq url (url-generic-parse-url url)))
(if (not (functionp callback)) (if (not (functionp callback))