;;; mail-extr.el --- extract full name and address from RFC 822 mail header. ;; Copyright (C) 1992 Free Software Foundation, Inc. ;; Author: Joe Wells ;; Version: 1.0 ;; Adapted-By: ESR ;; Keywords: mail ;; This file is part of GNU Emacs. ;; GNU Emacs is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 1, or (at your option) ;; any later version. ;; GNU Emacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs; see the file COPYING. If not, write to ;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. ;;; Commentary: ;; Here is `mail-extr', a package for extracting full names and canonical ;; addresses from RFC 822 mail headers. It is intended to be hooked into ;; other Emacs Lisp packages that deal with RFC 822 format messages, such as ;; Gnews, GNUS, RMAIL, MH-E, BBDB, VM, Supercite, etc. Thus, this release is ;; mainly for Emacs Lisp developers. ;; There are two main benefits: ;; 1. Higher probability of getting the correct full name for a human than ;; any other package I know of. (On the other hand, it will cheerfully ;; mangle non-human names/comments.) ;; 2. Address part is put in a canonical form. ;; The interface is not yet carved in stone; please give me suggestions. ;; I have an extensive test-case collection of funny addresses if you want to ;; work with the code. Developing this code requires frequent testing to ;; make sure you're not breaking functionality. I'm not posting the ;; test-cases because they take over 100K. ;; If you find an address that mail-extr fails on, please send it to me along ;; with what you think the correct results should be. I do not consider it a ;; bug if mail-extr mangles a comment that does not correspond to a real ;; human full name, although I would prefer that mail-extr would return the ;; comment as-is. ;; Features: ;; * Full name handling: ;; * knows where full names can be found in an address. ;; * avoids using empty comments and quoted text. ;; * extracts full names from mailbox names. ;; * recognizes common formats for comments after a full name. ;; * puts a period and a space after each initial. ;; * understands & referring to the mailbox name capitalized. ;; * strips name prefixes like "Prof.", etc.. ;; * understands what characters can occur in names (not just letters). ;; * figures out middle initial from mailbox name. ;; * removes funny nicknames. ;; * keeps suffixes such as Jr., Sr., III, etc. ;; * reorders "Last, First" type names. ;; * Address handling: ;; * parses rfc822 quoted text, comments, and domain literals. ;; * parses rfc822 multi-line headers. ;; * does something reasonable with rfc822 GROUP addresses. ;; * handles many rfc822 noncompliant and garbage addresses. ;; * canonicalizes addresses (after stripping comments/phrases outside <>). ;; * converts ! addresses into .UUCP and %-style addresses. ;; * converts rfc822 ROUTE addresses to %-style addresses. ;; * truncates %-style addresses at leftmost fully qualified domain name. ;; * handles local relative precedence of ! vs. % and @ (untested). ;; It does almost no string creation. It primarily uses the built-in ;; parsing routines with the appropriate syntax tables. This should ;; result in greater speed. ;; TODO: ;; * handle all test cases. (This will take forever.) ;; * software to pick the correct header to use (eg., "Senders-Name:"). ;; * multiple addresses in the "From:" header (almost all of the necessary ;; code is there). ;; * flag to not treat `,' as an address separator. (This is useful when ;; there is a "From:" header but no "Sender:" header, because then there ;; is only allowed to be one address.) ;; * mailbox name does not necessarily contain full name. ;; * fixing capitalization when it's all upper or lowercase. (Hard!) ;; * some of the domain literal handling is missing. (But I've never even ;; seen one of these in a mail address, so maybe no big deal.) ;; * arrange to have syntax tables byte-compiled. ;; * speed hacks. ;; * delete unused variables. ;; * arrange for testing with different relative precedences of ! vs. @ ;; and %. ;; * put variant-method back into mail-extract-address-components. ;; * insert documentation strings! ;; * handle X.400-gatewayed addresses according to RFC 1148. ;;; Change Log: ;; ;; Mon Apr 6 23:59:09 1992 Joe Wells (jbw at bigbird.bu.edu) ;; ;; * Cleaned up some more. Release version 1.0 to world. ;; ;; Sun Apr 5 19:39:08 1992 Joe Wells (jbw at bigbird.bu.edu) ;; ;; * Cleaned up full name extraction extensively. ;; ;; Sun Feb 2 14:45:24 1992 Joe Wells (jbw at bigbird.bu.edu) ;; ;; * Total rewrite. Integrated mail-canonicalize-address into ;; mail-extract-address-components. Now handles GROUP addresses more ;; or less correctly. Better handling of lots of different cases. ;; ;; Fri Jun 14 19:39:50 1991 ;; * Created. ;;; Code: ;; Variable definitions. (defvar mail-@-binds-tighter-than-! nil) ;;---------------------------------------------------------------------- ;; what orderings are meaningful????? ;;(defvar mail-operator-precedence-list '(?! ?% ?@)) ;; Right operand of a % or a @ must be a domain name, period. No other ;; operators allowed. Left operand of a @ is an address relative to that ;; site. ;; Left operand of a ! must be a domain name. Right operand is an ;; arbitrary address. ;;---------------------------------------------------------------------- (defconst mail-space-char 32) (defconst mail-whitespace " \t\n") ;; Any character that can occur in a name in an RFC822 address. ;; Yes, there are weird people with digits in their names. (defconst mail-all-letters "A-Za-z---{|}'~0-9`.") ;; Any character that can occur in a name, not counting characters that ;; separate parts of a multipart name. (defconst mail-all-letters-but-separators "A-Za-z{|}'~0-9`") ;; Any character that can start a name (defconst mail-first-letters "A-Za-z") ;; Any character that can end a name. (defconst mail-last-letters "A-Za-z`'.") ;; Matches an initial not followed by both a period and a space. (defconst mail-bad-initials-pattern (format "\\(\\([^%s]\\|\\`\\)[%s]\\)\\(\\.\\([^ ]\\)\\| \\|\\([^%s .]\\)\\|\\'\\)" mail-all-letters mail-first-letters mail-all-letters)) (defconst mail-non-name-chars (concat "^" mail-all-letters ".")) (defconst mail-non-begin-name-chars (concat "^" mail-first-letters)) (defconst mail-non-end-name-chars (concat "^" mail-last-letters)) ;; Matches periods used instead of spaces. Must not match the period ;; following an initial. (defconst mail-bad-\.-pattern (format "\\([%s][%s]\\)\\.+\\([%s]\\)" mail-all-letters mail-last-letters mail-first-letters)) ;; Matches an embedded or leading nickname that should be removed. (defconst mail-nickname-pattern (format "\\([ .]\\|\\`\\)[\"'`\[\(]\\([ .%s]+\\)[\]\"'\)] " mail-all-letters)) ;; Matches a leading title that is not part of the name (does not ;; contribute to uniquely identifying the person). (defconst mail-full-name-prefixes '"\\` *\\(Prof\\|Dr\\|Mrs?\\|Rev\\|Rabbi\\|SysOp\\|LCDR\\)\\.? ") ;; Matches the occurrence of a generational name suffix, and the last ;; character of the preceding name. (defconst mail-full-name-suffix-pattern (format "\\(,? ?\\([JjSs]r\\.?\\|V?I+V?\\)\\)\\([^%s]\\([^%s]\\|\\'\\)\\|\\'\\)" mail-all-letters mail-all-letters)) (defconst mail-roman-numeral-pattern "V?I+V?\\b") ;; Matches a trailing uppercase (with other characters possible) acronym. ;; Must not match a trailing uppercase last name or trailing initial (defconst mail-weird-acronym-pattern "\\([A-Z]+[-_/]\\|[A-Z][A-Z][A-Z]?\\b\\)") ;; Matches a mixed-case or lowercase name (not an initial). (defconst mail-mixed-case-name-pattern (format "\\b\\([a-z][%s]*[%s]\\|[%s][%s]*[a-z][%s]*[%s]\\|[%s][%s]*[a-z]\\)" mail-all-letters mail-last-letters mail-first-letters mail-all-letters mail-all-letters mail-last-letters mail-first-letters mail-all-letters)) ;; Matches a trailing alternative address. (defconst mail-alternative-address-pattern "[a-zA-Z.]+[!@][a-zA-Z.]") ;; Matches a variety of trailing comments not including comma-delimited ;; comments. (defconst mail-trailing-comment-start-pattern " [-{]\\|--\\|[+@#> ".") (?@ ".") (?, ".") (?\; ".") (?: ".") (?\\ "\\") (?\" "\"") (?. ".") (?\[ ".") (?\] ".") ;; % and ! aren't RFC822 characters, but it is convenient to pretend (?% ".") (?! ".") ) (address-comment-syntax-table (0 255 "w") (?\( "\(\)") (?\) "\)\(") (?\\ "\\")) (address-domain-literal-syntax-table (0 255 "w") (?\[ "\(\]") ;?????? (?\] "\)\[") ;?????? (?\\ "\\")) (address-text-comment-syntax-table (0 255 "w") (?\( "\(\)") (?\) "\)\(") (?\[ "\(\]") (?\] "\)\[") (?\{ "\(\}") (?\} "\)\{") (?\\ "\\") (?\" "\"") ;; (?\' "\)\`") ;; (?\` "\(\'") ) (address-text-syntax-table (0 255 ".") (?A ?Z "w") (?a ?z "w") (?- "w") (?\} "w") (?\{ "w") (?| "w") (?\' "w") (?~ "w") (?0 ?9 "w")) )) ;; Utility functions and macros. (defmacro undo-backslash-quoting (beg end) (`(save-excursion (save-restriction (narrow-to-region (, beg) (, end)) (goto-char (point-min)) ;; undo \ quoting (while (re-search-forward "\\\\\\(.\\)" nil t) (replace-match "\\1") ;; CHECK: does this leave point after the replacement? ))))) (defmacro mail-nuke-char-at (pos) (` (save-excursion (goto-char (, pos)) (delete-char 1) (insert mail-space-char)))) (defmacro mail-nuke-elements-outside-range (list-symbol beg-symbol end-symbol &optional no-replace) (` (progn (setq temp (, list-symbol)) (while temp (cond ((or (> (car temp) (, end-symbol)) (< (car temp) (, beg-symbol))) (, (or no-replace (` (mail-nuke-char-at (car temp))))) (setcar temp nil))) (setq temp (cdr temp))) (setq (, list-symbol) (delq nil (, list-symbol)))))) (defun mail-demarkerize (marker) (and marker (if (markerp marker) (let ((temp (marker-position marker))) (set-marker marker nil) temp) marker))) (defun mail-markerize (pos) (and pos (if (markerp pos) pos (copy-marker pos)))) (defmacro mail-last-element (list) "Return last element of LIST." (` (let ((list (, list))) (while (not (null (cdr list))) (setq list (cdr list))) (car list)))) (defmacro safe-move-sexp (arg) "Safely skip over one balanced sexp, if there is one. Return t if success." (` (condition-case error (progn (goto-char (scan-sexps (point) (, arg))) t) (error (if (string-equal (nth 1 error) "Unbalanced parentheses") nil (while t (signal (car error) (cdr error)))))))) ;; The main function to grind addresses (defun mail-extract-address-components (address) "Given an rfc 822 ADDRESS, extract full name and canonical address. Returns a list of the form (FULL-NAME CANONICAL-ADDRESS)." (let ((canonicalization-buffer (get-buffer-create "*canonical address*")) (extraction-buffer (get-buffer-create "*extract address components*")) (foo 'bar) char multiple-addresses <-pos >-pos @-pos :-pos ,-pos !-pos %-pos \;-pos group-:-pos group-\;-pos route-addr-:-pos record-pos-symbol first-real-pos last-real-pos phrase-beg phrase-end comment-beg comment-end quote-beg quote-end atom-beg atom-end mbox-beg mbox-end \.-ends-name temp name-suffix saved-point fi mi li saved-%-pos saved-!-pos saved-@-pos domain-pos \.-pos insert-point) (save-excursion (set-buffer extraction-buffer) (buffer-disable-undo extraction-buffer) (set-syntax-table address-syntax-table) (widen) (erase-buffer) (setq case-fold-search nil) ;; Insert extra space at beginning to allow later replacement with < ;; without having to move markers. (insert mail-space-char address) ;; stolen from rfc822.el ;; Unfold multiple lines. (goto-char (point-min)) (while (re-search-forward "\\([^\\]\\(\\\\\\\\\\)*\\)\n[ \t]" nil t) (replace-match "\\1 " t)) ;; first pass grabs useful information about address (goto-char (point-min)) (while (progn (skip-chars-forward mail-whitespace) (not (eobp))) (setq char (char-after (point))) (or first-real-pos (if (not (eq char ?\()) (setq first-real-pos (point)))) (cond ;; comment ((eq char ?\() (set-syntax-table address-comment-syntax-table) ;; only record the first non-empty comment's position (if (and (not comment-beg) (save-excursion (forward-char 1) (skip-chars-forward mail-whitespace) (not (eq ?\) (char-after (point)))))) (setq comment-beg (point))) ;; TODO: don't record if unbalanced (or (safe-move-sexp 1) (forward-char 1)) (set-syntax-table address-syntax-table) (if (and comment-beg (not comment-end)) (setq comment-end (point)))) ;; quoted text ((eq char ?\") ;; only record the first non-empty quote's position (if (and (not quote-beg) (save-excursion (forward-char 1) (skip-chars-forward mail-whitespace) (not (eq ?\" (char-after (point)))))) (setq quote-beg (point))) ;; TODO: don't record if unbalanced (or (safe-move-sexp 1) (forward-char 1)) (if (and quote-beg (not quote-end)) (setq quote-end (point)))) ;; domain literals ((eq char ?\[) (set-syntax-table address-domain-literal-syntax-table) (or (safe-move-sexp 1) (forward-char 1)) (set-syntax-table address-syntax-table)) ;; commas delimit addresses when outside < > pairs. ((and (eq char ?,) (or (null <-pos) (and >-pos ;; handle weird munged addresses (> (mail-last-element <-pos) (car >-pos))))) (setq multiple-addresses t) (delete-char 1) (narrow-to-region (point-min) (point))) ;; record the position of various interesting chars, determine ;; legality later. ((setq record-pos-symbol (cdr (assq char '((?< . <-pos) (?> . >-pos) (?@ . @-pos) (?: . :-pos) (?, . ,-pos) (?! . !-pos) (?% . %-pos) (?\; . \;-pos))))) (set record-pos-symbol (cons (point) (symbol-value record-pos-symbol))) (forward-char 1)) ((eq char ?.) (forward-char 1)) ((memq char '( ;; comment terminator illegal ?\) ;; domain literal terminator illegal ?\] ;; \ allowed only within quoted strings, ;; domain literals, and comments ?\\ )) (mail-nuke-char-at (point)) (forward-char 1)) (t (forward-word 1))) (or (eq char ?\() (setq last-real-pos (point)))) ;; Use only the leftmost <, if any. Replace all others with spaces. (while (cdr <-pos) (mail-nuke-char-at (car <-pos)) (setq <-pos (cdr <-pos))) ;; Use only the rightmost >, if any. Replace all others with spaces. (while (cdr >-pos) (mail-nuke-char-at (nth 1 >-pos)) (setcdr >-pos (nthcdr 2 >-pos))) ;; If multiple @s and a :, but no < and >, insert around buffer. ;; This commonly happens on the UUCP "From " line. Ugh. (cond ((and (> (length @-pos) 1) :-pos ;TODO: check if between @s (not <-pos)) (goto-char (point-min)) (delete-char 1) (setq <-pos (list (point))) (insert ?<))) ;; If < but no >, insert > in rightmost possible position (cond ((and <-pos (null >-pos)) (goto-char (point-max)) (setq >-pos (list (point))) (insert ?>))) ;; If > but no <, replace > with space. (cond ((and >-pos (null <-pos)) (mail-nuke-char-at (car >-pos)) (setq >-pos nil))) ;; Turn >-pos and <-pos into non-lists (setq >-pos (car >-pos) <-pos (car <-pos)) ;; Trim other punctuation lists of items outside < > pair to handle ;; stupid MTAs. (cond (<-pos ; don't need to check >-pos also ;; handle bozo software that violates RFC 822 by sticking ;; punctuation marks outside of a < > pair (mail-nuke-elements-outside-range @-pos <-pos >-pos t) ;; RFC 822 says nothing about these two outside < >, but ;; remove those positions from the lists to make things ;; easier. (mail-nuke-elements-outside-range !-pos <-pos >-pos t) (mail-nuke-elements-outside-range %-pos <-pos >-pos t))) ;; Check for : that indicates GROUP list and for : part of ;; ROUTE-ADDR spec. ;; Can't possibly be more than two :. Nuke any extra. (while :-pos (setq temp (car :-pos) :-pos (cdr :-pos)) (cond ((and <-pos >-pos (> temp <-pos) (< temp >-pos)) (if (or route-addr-:-pos (< (length @-pos) 2) (> temp (car @-pos)) (< temp (nth 1 @-pos))) (mail-nuke-char-at temp) (setq route-addr-:-pos temp))) ((or (not <-pos) (and <-pos (< temp <-pos))) (setq group-:-pos temp)))) ;; Nuke any ; that is in or to the left of a < > pair or to the left ;; of a GROUP starting :. Also, there may only be one ;. (while \;-pos (setq temp (car \;-pos) \;-pos (cdr \;-pos)) (cond ((and <-pos >-pos (> temp <-pos) (< temp >-pos)) (mail-nuke-char-at temp)) ((and (or (not group-:-pos) (> temp group-:-pos)) (not group-\;-pos)) (setq group-\;-pos temp)))) ;; Handle junk like ";@host.company.dom" that sendmail adds. ;; **** should I remember comment positions? (and group-\;-pos ;; this is fine for now (mail-nuke-elements-outside-range !-pos group-:-pos group-\;-pos t) (mail-nuke-elements-outside-range @-pos group-:-pos group-\;-pos t) (mail-nuke-elements-outside-range %-pos group-:-pos group-\;-pos t) (mail-nuke-elements-outside-range ,-pos group-:-pos group-\;-pos t) (and last-real-pos (> last-real-pos (1+ group-\;-pos)) (setq last-real-pos (1+ group-\;-pos))) (and comment-end (> comment-end group-\;-pos) (setq comment-end nil comment-beg nil)) (and quote-end (> quote-end group-\;-pos) (setq quote-end nil quote-beg nil)) (narrow-to-region (point-min) group-\;-pos)) ;; Any commas must be between < and : of ROUTE-ADDR. Nuke any ;; others. ;; Hell, go ahead an nuke all of the commas. ;; **** This will cause problems when we start handling commas in ;; the PHRASE part .... no it won't ... yes it will ... ????? (mail-nuke-elements-outside-range ,-pos 1 1) ;; can only have multiple @s inside < >. The fact that some MTAs ;; put de-bracketed ROUTE-ADDRs in the UUCP-style "From " line is ;; handled above. ;; Locate PHRASE part of ROUTE-ADDR. (cond (<-pos (goto-char <-pos) (skip-chars-backward mail-whitespace) (setq phrase-end (point)) (goto-char (or ;;group-:-pos (point-min))) (skip-chars-forward mail-whitespace) (if (< (point) phrase-end) (setq phrase-beg (point)) (setq phrase-end nil)))) ;; handle ROUTE-ADDRS with real ROUTEs. ;; If there are multiple @s, then we assume ROUTE-ADDR syntax, and ;; any % or ! must be semantically meaningless. ;; TODO: do this processing into canonicalization buffer (cond (route-addr-:-pos (setq !-pos nil %-pos nil >-pos (copy-marker >-pos) route-addr-:-pos (copy-marker route-addr-:-pos)) (goto-char >-pos) (insert-before-markers ?X) (goto-char (car @-pos)) (while (setq @-pos (cdr @-pos)) (delete-char 1) (setq %-pos (cons (point-marker) %-pos)) (insert "%") (goto-char (1- >-pos)) (save-excursion (insert-buffer-substring extraction-buffer (car @-pos) route-addr-:-pos) (delete-region (car @-pos) route-addr-:-pos)) (or (cdr @-pos) (setq saved-@-pos (list (point))))) (setq @-pos saved-@-pos) (goto-char >-pos) (delete-char -1) (mail-nuke-char-at route-addr-:-pos) (mail-demarkerize route-addr-:-pos) (setq route-addr-:-pos nil >-pos (mail-demarkerize >-pos) %-pos (mapcar 'mail-demarkerize %-pos)))) ;; de-listify @-pos (setq @-pos (car @-pos)) ;; TODO: remove comments in the middle of an address (set-buffer canonicalization-buffer) (buffer-disable-undo canonicalization-buffer) (set-syntax-table address-syntax-table) (setq case-fold-search nil) (widen) (erase-buffer) (insert-buffer-substring extraction-buffer) (if <-pos (narrow-to-region (progn (goto-char (1+ <-pos)) (skip-chars-forward mail-whitespace) (point)) >-pos) ;; ****** Oh no! What if the address is completely empty! (narrow-to-region first-real-pos last-real-pos)) (and @-pos %-pos (mail-nuke-elements-outside-range %-pos (point-min) @-pos)) (and %-pos !-pos (mail-nuke-elements-outside-range !-pos (point-min) (car %-pos))) (and @-pos !-pos (not %-pos) (mail-nuke-elements-outside-range !-pos (point-min) @-pos)) ;; Error condition:?? (and %-pos (not @-pos)) (cond (!-pos ;; **** I don't understand this save-restriction and the ;; narrow-to-region inside it. Why did I do that? (save-restriction (cond ((and @-pos mail-@-binds-tighter-than-!) (goto-char @-pos) (setq %-pos (cons (point) %-pos) @-pos nil) (delete-char 1) (insert "%") (setq insert-point (point-max))) (mail-@-binds-tighter-than-! (setq insert-point (point-max))) (%-pos (setq insert-point (mail-last-element %-pos) saved-%-pos (mapcar 'mail-markerize %-pos) %-pos nil @-pos (mail-markerize @-pos))) (@-pos (setq insert-point @-pos) (setq @-pos (mail-markerize @-pos))) (t (setq insert-point (point-max)))) (narrow-to-region (point-min) insert-point) (setq saved-!-pos (car !-pos)) (while !-pos (goto-char (point-max)) (cond ((and (not @-pos) (not (cdr !-pos))) (setq @-pos (point)) (insert-before-markers "@ ")) (t (setq %-pos (cons (point) %-pos)) (insert-before-markers "% "))) (backward-char 1) (insert-buffer-substring (current-buffer) (if (nth 1 !-pos) (1+ (nth 1 !-pos)) (point-min)) (car !-pos)) (delete-char 1) (or (save-excursion (safe-move-sexp -1) (skip-chars-backward mail-whitespace) (eq ?. (preceding-char))) (insert-before-markers (if (save-excursion (skip-chars-backward mail-whitespace) (eq ?. (preceding-char))) "" ".") "uucp")) (setq !-pos (cdr !-pos)))) (and saved-%-pos (setq %-pos (append (mapcar 'mail-demarkerize saved-%-pos) %-pos))) (setq @-pos (mail-demarkerize @-pos)) (narrow-to-region (1+ saved-!-pos) (point-max)))) (cond ((and %-pos (not @-pos)) (goto-char (car %-pos)) (delete-char 1) (setq @-pos (point)) (insert "@") (setq %-pos (cdr %-pos)))) (setq %-pos (nreverse %-pos)) ;; RFC 1034 doesn't approve of this, oh well: (downcase-region (or (car %-pos) @-pos (point-max)) (point-max)) (cond (%-pos ; implies @-pos valid (setq temp %-pos) (catch 'truncated (while temp (goto-char (or (nth 1 temp) @-pos)) (skip-chars-backward mail-whitespace) (save-excursion (safe-move-sexp -1) (setq domain-pos (point)) (skip-chars-backward mail-whitespace) (setq \.-pos (eq ?. (preceding-char)))) (cond ((and \.-pos (get (intern (buffer-substring domain-pos (point))) 'domain-name)) (narrow-to-region (point-min) (point)) (goto-char (car temp)) (delete-char 1) (setq @-pos (point)) (setcdr temp nil) (setq %-pos (delq @-pos %-pos)) (insert "@") (throw 'truncated t))) (setq temp (cdr temp)))))) (setq mbox-beg (point-min) mbox-end (if %-pos (car %-pos) (or @-pos (point-max)))) ;; Done canonicalizing address. (set-buffer extraction-buffer) ;; Find the full name (cond ((and phrase-beg (eq quote-beg phrase-beg) (<= quote-end phrase-end)) (narrow-to-region (1+ quote-beg) (1- quote-end)) (undo-backslash-quoting (point-min) (point-max))) (phrase-beg (narrow-to-region phrase-beg phrase-end)) (comment-beg (narrow-to-region (1+ comment-beg) (1- comment-end)) (undo-backslash-quoting (point-min) (point-max))) (t ;; *** Work in canon buffer instead? No, can't. Hmm. (delete-region (point-min) (point-max)) (insert-buffer-substring canonicalization-buffer mbox-beg mbox-end) (goto-char (point-min)) (setq \.-ends-name (search-forward "_" nil t)) (goto-char (point-min)) (while (progn (skip-chars-forward mail-whitespace) (not (eobp))) (setq char (char-after (point))) (cond ((eq char ?\") (setq quote-beg (point)) (or (safe-move-sexp 1) ;; TODO: handle this error condition!!!!! (forward-char 1)) ;; take into account deletions (setq quote-end (- (point) 2)) (save-excursion (backward-char 1) (delete-char 1) (goto-char quote-beg) (delete-char 1)) (undo-backslash-quoting quote-beg quote-end) (or (eq mail-space-char (char-after (point))) (insert " ")) (setq \.-ends-name t)) ((eq char ?.) (if (eq (char-after (1+ (point))) ?_) (progn (forward-char 1) (delete-char 1) (insert mail-space-char)) (if \.-ends-name (narrow-to-region (point-min) (point)) (delete-char 1) (insert " ")))) ((memq (char-syntax char) '(?. ?\\)) (delete-char 1) (insert " ")) (t (setq atom-beg (point)) (forward-word 1) (setq atom-end (point)) (save-restriction (narrow-to-region atom-beg atom-end) (goto-char (point-min)) (while (re-search-forward "\\([^_]+\\)_" nil t) (replace-match "\\1 ")) (goto-char (point-max)))))))) (set-syntax-table address-text-syntax-table) (setq xxx (variant-method (buffer-string))) (delete-region (point-min) (point-max)) (insert xxx) (goto-char (point-min)) ;; ;; Compress whitespace ;; (goto-char (point-min)) ;; (while (re-search-forward "[ \t\n]+" nil t) ;; (replace-match " ")) ;; ;; ;; Fix . used as space ;; (goto-char (point-min)) ;; (while (re-search-forward mail-bad-\.-pattern nil t) ;; (replace-match "\\1 \\2")) ;; ;; ;; Delete trailing parenthesized comment ;; (goto-char (point-max)) ;; (skip-chars-backward mail-whitespace) ;; (cond ((memq (char-after (1- (point))) '(?\) ?\} ?\])) ;; (setq comment-end (point)) ;; (set-syntax-table address-text-comment-syntax-table) ;; (or (safe-move-sexp -1) ;; (backward-char 1)) ;; (set-syntax-table address-text-syntax-table) ;; (setq comment-beg (point)) ;; (skip-chars-backward mail-whitespace) ;; (if (bobp) ;; (narrow-to-region (1+ comment-beg) (1- comment-end)) ;; (narrow-to-region (point-min) (point))))) ;; ;; ;; Find, save, and delete any name suffix ;; ;; *** Broken! ;; (goto-char (point-min)) ;; (cond ((re-search-forward mail-full-name-suffix-pattern nil t) ;; (setq name-suffix (buffer-substring (match-beginning 3) ;; (match-end 3))) ;; (replace-match "\\1 \\4"))) ;; ;; ;; Delete ALL CAPS words and after, if preceded by mixed-case or ;; ;; lowercase words. Eg. XT-DEM. ;; (goto-char (point-min)) ;; ;; ## This will lose on something like "SMITH MAX". ;; ;; ## maybe it should be ;; ;; ## " \\([A-Z]+[-_/][A-Z]+\\|[A-Z][A-Z][A-Z]\\)\\b.*[^A-Z \t]" ;; ;; ## that is, three-letter-upper-case-word with non-upper-case ;; ;; ## characters following it. ;; (if (re-search-forward mail-mixed-case-name-pattern nil t) ;; (if (re-search-forward mail-weird-acronym-pattern nil t) ;; (narrow-to-region (point-min) (match-beginning 0)))) ;; ;; ;; Delete trailing alternative address ;; (goto-char (point-min)) ;; (if (re-search-forward mail-alternative-address-pattern nil t) ;; (narrow-to-region (point-min) (match-beginning 0))) ;; ;; ;; Delete trailing comment ;; (goto-char (point-min)) ;; (if (re-search-forward mail-trailing-comment-start-pattern nil t) ;; (or (progn ;; (goto-char (match-beginning 0)) ;; (skip-chars-backward mail-whitespace) ;; (bobp)) ;; (narrow-to-region (point-min) (match-beginning 0)))) ;; ;; ;; Delete trailing comma-separated comment ;; (goto-char (point-min)) ;; ;; ## doesn't this break "Smith, John"? Yes. ;; (re-search-forward mail-last-name-first-pattern nil t) ;; (while (search-forward "," nil t) ;; (or (save-excursion ;; (backward-char 2) ;; (looking-at mail-full-name-suffix-pattern)) ;; (narrow-to-region (point-min) (1- (point))))) ;; ;; ;; Delete telephone numbers and ham radio call signs ;; (goto-char (point-min)) ;; (if (re-search-forward mail-telephone-extension-pattern nil t) ;; (narrow-to-region (point-min) (match-beginning 0))) ;; (goto-char (point-min)) ;; (if (re-search-forward mail-ham-call-sign-pattern nil t) ;; (if (eq (match-beginning 0) (point-min)) ;; (narrow-to-region (match-end 0) (point-max)) ;; (narrow-to-region (point-min) (match-beginning 0)))) ;; ;; ;; Delete trailing word followed immediately by . ;; (goto-char (point-min)) ;; ;; ## what's this for? doesn't it mess up "Public, Harry Q."? No. ;; (if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t) ;; (narrow-to-region (point-min) (match-beginning 0))) ;; ;; ;; Handle & substitution ;; ;; TODO: remember to disable middle initial guessing ;; (goto-char (point-min)) ;; (cond ((re-search-forward "\\( \\|\\`\\)&\\( \\|\\'\\)" nil t) ;; (goto-char (match-end 1)) ;; (delete-char 1) ;; (capitalize-region ;; (point) ;; (progn ;; (insert-buffer-substring canonicalization-buffer ;; mbox-beg mbox-end) ;; (point))))) ;; ;; ;; Delete nickname ;; (goto-char (point-min)) ;; (if (re-search-forward mail-nickname-pattern nil t) ;; (replace-match (if (eq (match-beginning 2) (1- (match-end 2))) ;; " \\2 " ;; " "))) ;; ;; ;; Fixup initials ;; (while (progn ;; (goto-char (point-min)) ;; (re-search-forward mail-bad-initials-pattern nil t)) ;; (replace-match ;; (if (match-beginning 4) ;; "\\1. \\4" ;; (if (match-beginning 5) ;; "\\1. \\5" ;; "\\1. ")))) ;; ;; ;; Delete title ;; (goto-char (point-min)) ;; (if (re-search-forward mail-full-name-prefixes nil t) ;; (narrow-to-region (point) (point-max))) ;; ;; ;; Delete trailing and preceding non-name characters ;; (goto-char (point-min)) ;; (skip-chars-forward mail-non-begin-name-chars) ;; (narrow-to-region (point) (point-max)) ;; (goto-char (point-max)) ;; (skip-chars-backward mail-non-end-name-chars) ;; (narrow-to-region (point-min) (point)) ;; If name is "First Last" and userid is "F?L", then assume ;; the middle initial is the second letter in the userid. ;; initially by Jamie Zawinski (cond ((and (eq 3 (- mbox-end mbox-beg)) (progn (goto-char (point-min)) (looking-at mail-two-name-pattern))) (setq fi (char-after (match-beginning 0)) li (char-after (match-beginning 3))) (save-excursion (set-buffer canonicalization-buffer) ;; char-equal is ignoring case here, so no need to upcase ;; or downcase. (let ((case-fold-search t)) (and (char-equal fi (char-after mbox-beg)) (char-equal li (char-after (1- mbox-end))) (setq mi (char-after (1+ mbox-beg)))))) (cond ((and mi ;; TODO: use better table than syntax table (eq ?w (char-syntax mi))) (goto-char (match-beginning 3)) (insert (upcase mi) ". "))))) ;; ;; Restore suffix ;; (cond (name-suffix ;; (goto-char (point-max)) ;; (insert ", " name-suffix) ;; (backward-word 1) ;; (cond ((memq (following-char) '(?j ?J ?s ?S)) ;; (capitalize-word 1) ;; (or (eq (following-char) ?.) ;; (insert ?.))) ;; (t ;; (upcase-word 1))))) ;; Result (list (buffer-string) (progn (set-buffer canonicalization-buffer) (buffer-string))) ))) ;; TODO: put this back in the above function now that it's proven: (defun variant-method (string) (let ((variant-buffer (get-buffer-create "*variant method buffer*")) (word-count 0) mixed-case-flag lower-case-flag upper-case-flag suffix-flag last-name-comma-flag comment-beg comment-end initial beg end ) (save-excursion (set-buffer variant-buffer) (buffer-disable-undo variant-buffer) (set-syntax-table address-text-syntax-table) (widen) (erase-buffer) (setq case-fold-search nil) (insert string) ;; Fix . used as space (goto-char (point-min)) (while (re-search-forward mail-bad-\.-pattern nil t) (replace-match "\\1 \\2")) ;; Skip any initial garbage. (goto-char (point-min)) (skip-chars-forward mail-non-begin-name-chars) (skip-chars-backward "& \"") (narrow-to-region (point) (point-max)) (catch 'stop (while t (skip-chars-forward mail-whitespace) (cond ;; Delete title ((and (eq word-count 0) (looking-at mail-full-name-prefixes)) (goto-char (match-end 0)) (narrow-to-region (point) (point-max))) ;; Stop after name suffix ((and (>= word-count 2) (looking-at mail-full-name-suffix-pattern)) (skip-chars-backward mail-whitespace) (setq suffix-flag (point)) (if (eq ?, (following-char)) (forward-char 1) (insert ?,)) ;; Enforce at least one space after comma (or (eq mail-space-char (following-char)) (insert mail-space-char)) (skip-chars-forward mail-whitespace) (cond ((memq (following-char) '(?j ?J ?s ?S)) (capitalize-word 1) (if (eq (following-char) ?.) (forward-char 1) (insert ?.))) (t (upcase-word 1))) (setq word-count (1+ word-count)) (throw 'stop t)) ;; Handle SCA names ((looking-at "MKA \\(.+\\)") ; "Mundanely Known As" (setq word-count 0) (goto-char (match-beginning 1)) (narrow-to-region (point) (point-max))) ;; Various stopping points ((or ;; Stop before ALL CAPS acronyms, if preceded by mixed-case or ;; lowercase words. Eg. XT-DEM. (and (>= word-count 2) (or mixed-case-flag lower-case-flag) (looking-at mail-weird-acronym-pattern) (not (looking-at mail-roman-numeral-pattern))) ;; Stop before 4-or-more letter lowercase words preceded by ;; mixed case or uppercase words. (and (>= word-count 2) (or upper-case-flag mixed-case-flag) (looking-at "[a-z][a-z][a-z][a-z]+\\b")) ;; Stop before trailing alternative address (looking-at mail-alternative-address-pattern) ;; Stop before trailing comment not introduced by comma (looking-at mail-trailing-comment-start-pattern) ;; Stop before telephone numbers (looking-at mail-telephone-extension-pattern)) (throw 'stop t)) ;; Check for initial last name followed by comma ((and (eq ?, (following-char)) (eq word-count 1)) (forward-char 1) (setq last-name-comma-flag t) (or (eq mail-space-char (following-char)) (insert mail-space-char))) ;; Stop before trailing comma-separated comment ((eq ?, (following-char)) (throw 'stop t)) ;; Delete parenthesized/quoted comment/nickname ((memq (following-char) '(?\( ?\{ ?\[ ?\" ?\' ?\`)) (setq comment-beg (point)) (set-syntax-table address-text-comment-syntax-table) (cond ((memq (following-char) '(?\' ?\`)) (if (eq ?\' (following-char)) (forward-char 1)) (or (search-forward "'" nil t) (delete-char 1))) (t (or (safe-move-sexp 1) (goto-char (point-max))))) (set-syntax-table address-text-syntax-table) (setq comment-end (point)) (cond ;; Handle case of entire name being quoted ((and (eq word-count 0) (looking-at " *\\'") (>= (- comment-end comment-beg) 2)) (narrow-to-region (1+ comment-beg) (1- comment-end)) (goto-char (point-min))) (t ;; Handle case of quoted initial (if (and (or (= 3 (- comment-end comment-beg)) (and (= 4 (- comment-end comment-beg)) (eq ?. (char-after (+ 2 comment-beg))))) (not (looking-at " *\\'"))) (setq initial (char-after (1+ comment-beg))) (setq initial nil)) (delete-region comment-beg comment-end) (if initial (insert initial ". "))))) ;; Delete ham radio call signs ((looking-at mail-ham-call-sign-pattern) (delete-region (match-beginning 0) (match-end 0))) ;; Handle & substitution ;; TODO: remember to disable middle initial guessing ((and (or (bobp) (eq mail-space-char (preceding-char))) (looking-at "&\\( \\|\\'\\)")) (delete-char 1) (capitalize-region (point) (progn (insert-buffer-substring canonicalization-buffer mbox-beg mbox-end) (point)))) ;; Fixup initials ((looking-at mail-initial-pattern) (or (eq (following-char) (upcase (following-char))) (setq lower-case-flag t)) (forward-char 1) (if (eq ?. (following-char)) (forward-char 1) (insert ?.)) (or (eq mail-space-char (following-char)) (insert mail-space-char)) (setq word-count (1+ word-count))) ;; Regular name words ((looking-at mail-name-pattern) (setq beg (point)) (setq end (match-end 0)) (set (if (re-search-forward "[a-z]" end t) (if (progn (goto-char beg) (re-search-forward "[A-Z]" end t)) 'mixed-case-flag 'lower-case-flag) 'upper-case-flag) t) (goto-char end) (setq word-count (1+ word-count))) (t (throw 'stop t))))) (narrow-to-region (point-min) (point)) ;; Delete trailing word followed immediately by . (cond ((not suffix-flag) (goto-char (point-min)) (if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t) (narrow-to-region (point-min) (match-beginning 0))))) ;; If last name first put it at end (but before suffix) (cond (last-name-comma-flag (goto-char (point-min)) (search-forward ",") (setq end (1- (point))) (goto-char (or suffix-flag (point-max))) (or (eq mail-space-char (preceding-char)) (insert mail-space-char)) (insert-buffer-substring (current-buffer) (point-min) end) (narrow-to-region (1+ end) (point-max)))) (goto-char (point-max)) (skip-chars-backward mail-non-end-name-chars) (if (eq ?. (following-char)) (forward-char 1)) (narrow-to-region (point) (progn (goto-char (point-min)) (skip-chars-forward mail-non-begin-name-chars) (point))) ;; Compress whitespace (goto-char (point-min)) (while (re-search-forward "[ \t\n]+" nil t) (replace-match " ")) (buffer-substring (point-min) (point-max)) ))) ;; The country names are just in there for show right now, and because ;; Jamie thought it would be neat. They aren't used yet. ;; Keep in mind that the country abbreviations follow ISO-3166. There is ;; a U.S. FIPS that specifies a different set of two-letter country ;; abbreviations. ;; TODO: put this in its own obarray, instead of cluttering up the main ;; symbol table with junk. (mapcar (function (lambda (x) (if (symbolp x) (put x 'domain-name t) (put (car x) 'domain-name (nth 1 x))))) '((ag "Antigua") (ar "Argentina") ; Argentine Republic arpa ; Advanced Projects Research Agency (at "Austria") ; The Republic of _ (au "Australia") (bb "Barbados") (be "Belgium") ; The Kingdom of _ (bg "Bulgaria") bitnet ; Because It's Time NET (bo "Bolivia") ; Republic of _ (br "Brazil") ; The Federative Republic of _ (bs "Bahamas") (bz "Belize") (ca "Canada") (ch "Switzerland") ; The Swiss Confederation (cl "Chile") ; The Republic of _ (cn "China") ; The People's Republic of _ (co "Columbia") com ; Commercial (cr "Costa Rica") ; The Republic of _ (cs "Czechoslovakia") (de "Germany") (dk "Denmark") (dm "Dominica") (do "Dominican Republic") ; The _ (ec "Ecuador") ; The Republic of _ edu ; Educational (eg "Egypt") ; The Arab Republic of _ (es "Spain") ; The Kingdom of _ (fi "Finland") ; The Republic of _ (fj "Fiji") (fr "France") gov ; Government (U.S.A.) (gr "Greece") ; The Hellenic Republic (hk "Hong Kong") (hu "Hungary") ; The Hungarian People's Republic (???) (ie "Ireland") (il "Israel") ; The State of _ (in "India") ; The Republic of _ int ; something British, don't know what (is "Iceland") ; The Republic of _ (it "Italy") ; The Italian Republic (jm "Jamaica") (jp "Japan") (kn "St. Kitts and Nevis") (kr "South Korea") (lc "St. Lucia") (lk "Sri Lanka") ; The Democratic Socialist Republic of _ mil ; Military (U.S.A.) (mx "Mexico") ; The United Mexican States (my "Malaysia") ; changed to Myanmar???? (na "Namibia") nato ; North Atlantic Treaty Organization net ; Network (ni "Nicaragua") ; The Republic of _ (nl "Netherlands") ; The Kingdom of the _ (no "Norway") ; The Kingdom of _ (nz "New Zealand") org ; Organization (pe "Peru") (pg "Papua New Guinea") (ph "Philippines") ; The Republic of the _ (pl "Poland") (pr "Puerto Rico") (pt "Portugal") ; The Portugese Republic (py "Paraguay") (se "Sweden") ; The Kingdom of _ (sg "Singapore") ; The Republic of _ (sr "Suriname") (su "Soviet Union") (th "Thailand") ; The Kingdom of _ (tn "Tunisia") (tr "Turkey") ; The Republic of _ (tt "Trinidad and Tobago") (tw "Taiwan") (uk "United Kingdom") ; The _ of Great Britain unter-dom ; something German (us "U.S.A.") ; The United States of America uucp ; Unix to Unix CoPy (uy "Uruguay") ; The Eastern Republic of _ (vc "St. Vincent and the Grenadines") (ve "Venezuela") ; The Republic of _ (yu "Yugoslavia") ; The Socialist Federal Republic of _ ;; Also said to be Zambia ... (za "South Africa") ; The Republic of _ (why not Zaire???) (zw "Zimbabwe") ; Republic of _ )) ;; fipnet ;; Code for testing. (defun time-extract () (let (times list) (setq times (cons (current-time-string) times) list problem-address-alist) (while list (mail-extract-address-components (car (car list))) (setq list (cdr list))) (setq times (cons (current-time-string) times)) (nreverse times))) (defun test-extract (&optional starting-point) (interactive) (set-buffer (get-buffer-create "*Testing*")) (erase-buffer) (sit-for 0) (mapcar 'test-extract-internal (if starting-point (memq starting-point problem-address-alist) problem-address-alist))) (defvar failed-item) (defun test-extract-internal (item) (setq failed-item item) (let* ((address (car item)) (correct-name (nth 1 item)) (correct-canon (nth 2 item)) (result (mail-extract-address-components address)) (name (car result)) (canon (nth 1 result)) (name-correct (or (null correct-name) (string-equal (downcase correct-name) (downcase name)))) (canon-correct (or (null correct-canon) (string-equal correct-canon canon)))) (cond ((not (and name-correct canon-correct)) (pop-to-buffer "*Testing*") (select-window (get-buffer-window (current-buffer))) (goto-char (point-max)) (insert "Address: " address "\n") (if (not name-correct) (insert " Correct Name: [" correct-name "]\; Result: [" name "]\n")) (if (not canon-correct) (insert " Correct Canon: [" correct-canon "]\; Result: [" canon "]\n")) (insert "\n") (sit-for 0)))) (setq failed-item nil)) (defun test-continue-extract () (interactive) (test-extract failed-item)) ;; Assorted junk. ;; warsaw@nlm.nih.gov (A Bad Dude -- Barry Warsaw) ;;'(from ;; reply-to ;; return-path ;; x-uucp-from ;; sender ;; resent-from ;; resent-sender ;; resent-reply-to) ;;; mail-extr.el ends here