mirror of
https://git.savannah.gnu.org/git/emacs.git
synced 2024-12-14 09:39:42 +00:00
1469 lines
47 KiB
EmacsLisp
1469 lines
47 KiB
EmacsLisp
;;; mail-extr.el --- extract full name and address from RFC 822 mail header.
|
||
|
||
;; Copyright (C) 1992 Free Software Foundation, Inc.
|
||
|
||
;; Author: Joe Wells <jbw@cs.bu.edu>
|
||
;; Version: 1.0
|
||
;; Adapted-By: ESR
|
||
;; Keywords: mail
|
||
|
||
;; This file is part of GNU Emacs.
|
||
|
||
;; GNU Emacs is free software; you can redistribute it and/or modify
|
||
;; it under the terms of the GNU General Public License as published by
|
||
;; the Free Software Foundation; either version 1, or (at your option)
|
||
;; any later version.
|
||
|
||
;; GNU Emacs is distributed in the hope that it will be useful,
|
||
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
;; GNU General Public License for more details.
|
||
|
||
;; You should have received a copy of the GNU General Public License
|
||
;; along with GNU Emacs; see the file COPYING. If not, write to
|
||
;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||
|
||
;;; Commentary:
|
||
|
||
;; Here is `mail-extr', a package for extracting full names and canonical
|
||
;; addresses from RFC 822 mail headers. It is intended to be hooked into
|
||
;; other Emacs Lisp packages that deal with RFC 822 format messages, such as
|
||
;; Gnews, GNUS, RMAIL, MH-E, BBDB, VM, Supercite, etc. Thus, this release is
|
||
;; mainly for Emacs Lisp developers.
|
||
|
||
;; There are two main benefits:
|
||
|
||
;; 1. Higher probability of getting the correct full name for a human than
|
||
;; any other package I know of. (On the other hand, it will cheerfully
|
||
;; mangle non-human names/comments.)
|
||
;; 2. Address part is put in a canonical form.
|
||
|
||
;; The interface is not yet carved in stone; please give me suggestions.
|
||
|
||
;; I have an extensive test-case collection of funny addresses if you want to
|
||
;; work with the code. Developing this code requires frequent testing to
|
||
;; make sure you're not breaking functionality. I'm not posting the
|
||
;; test-cases because they take over 100K.
|
||
|
||
;; If you find an address that mail-extr fails on, please send it to me along
|
||
;; with what you think the correct results should be. I do not consider it a
|
||
;; bug if mail-extr mangles a comment that does not correspond to a real
|
||
;; human full name, although I would prefer that mail-extr would return the
|
||
;; comment as-is.
|
||
|
||
;; Features:
|
||
|
||
;; * Full name handling:
|
||
|
||
;; * knows where full names can be found in an address.
|
||
;; * avoids using empty comments and quoted text.
|
||
;; * extracts full names from mailbox names.
|
||
;; * recognizes common formats for comments after a full name.
|
||
;; * puts a period and a space after each initial.
|
||
;; * understands & referring to the mailbox name capitalized.
|
||
;; * strips name prefixes like "Prof.", etc..
|
||
;; * understands what characters can occur in names (not just letters).
|
||
;; * figures out middle initial from mailbox name.
|
||
;; * removes funny nicknames.
|
||
;; * keeps suffixes such as Jr., Sr., III, etc.
|
||
;; * reorders "Last, First" type names.
|
||
|
||
;; * Address handling:
|
||
|
||
;; * parses rfc822 quoted text, comments, and domain literals.
|
||
;; * parses rfc822 multi-line headers.
|
||
;; * does something reasonable with rfc822 GROUP addresses.
|
||
;; * handles many rfc822 noncompliant and garbage addresses.
|
||
;; * canonicalizes addresses (after stripping comments/phrases outside <>).
|
||
;; * converts ! addresses into .UUCP and %-style addresses.
|
||
;; * converts rfc822 ROUTE addresses to %-style addresses.
|
||
;; * truncates %-style addresses at leftmost fully qualified domain name.
|
||
;; * handles local relative precedence of ! vs. % and @ (untested).
|
||
|
||
;; It does almost no string creation. It primarily uses the built-in
|
||
;; parsing routines with the appropriate syntax tables. This should
|
||
;; result in greater speed.
|
||
|
||
;; TODO:
|
||
|
||
;; * handle all test cases. (This will take forever.)
|
||
;; * software to pick the correct header to use (eg., "Senders-Name:").
|
||
;; * multiple addresses in the "From:" header (almost all of the necessary
|
||
;; code is there).
|
||
;; * flag to not treat `,' as an address separator. (This is useful when
|
||
;; there is a "From:" header but no "Sender:" header, because then there
|
||
;; is only allowed to be one address.)
|
||
;; * mailbox name does not necessarily contain full name.
|
||
;; * fixing capitalization when it's all upper or lowercase. (Hard!)
|
||
;; * some of the domain literal handling is missing. (But I've never even
|
||
;; seen one of these in a mail address, so maybe no big deal.)
|
||
;; * arrange to have syntax tables byte-compiled.
|
||
;; * speed hacks.
|
||
;; * delete unused variables.
|
||
;; * arrange for testing with different relative precedences of ! vs. @
|
||
;; and %.
|
||
;; * put variant-method back into mail-extract-address-components.
|
||
;; * insert documentation strings!
|
||
;; * handle X.400-gatewayed addresses according to RFC 1148.
|
||
|
||
;;; Change Log:
|
||
;;
|
||
;; Mon Apr 6 23:59:09 1992 Joe Wells (jbw at bigbird.bu.edu)
|
||
;;
|
||
;; * Cleaned up some more. Release version 1.0 to world.
|
||
;;
|
||
;; Sun Apr 5 19:39:08 1992 Joe Wells (jbw at bigbird.bu.edu)
|
||
;;
|
||
;; * Cleaned up full name extraction extensively.
|
||
;;
|
||
;; Sun Feb 2 14:45:24 1992 Joe Wells (jbw at bigbird.bu.edu)
|
||
;;
|
||
;; * Total rewrite. Integrated mail-canonicalize-address into
|
||
;; mail-extract-address-components. Now handles GROUP addresses more
|
||
;; or less correctly. Better handling of lots of different cases.
|
||
;;
|
||
;; Fri Jun 14 19:39:50 1991
|
||
;; * Created.
|
||
|
||
;;; Code:
|
||
|
||
;; Variable definitions.
|
||
|
||
(defvar mail-@-binds-tighter-than-! nil)
|
||
|
||
;;----------------------------------------------------------------------
|
||
;; what orderings are meaningful?????
|
||
;;(defvar mail-operator-precedence-list '(?! ?% ?@))
|
||
;; Right operand of a % or a @ must be a domain name, period. No other
|
||
;; operators allowed. Left operand of a @ is an address relative to that
|
||
;; site.
|
||
|
||
;; Left operand of a ! must be a domain name. Right operand is an
|
||
;; arbitrary address.
|
||
;;----------------------------------------------------------------------
|
||
|
||
(defconst mail-space-char 32)
|
||
|
||
(defconst mail-whitespace " \t\n")
|
||
|
||
;; Any character that can occur in a name in an RFC822 address.
|
||
;; Yes, there are weird people with digits in their names.
|
||
(defconst mail-all-letters "A-Za-z---{|}'~0-9`.")
|
||
|
||
;; Any character that can occur in a name, not counting characters that
|
||
;; separate parts of a multipart name.
|
||
(defconst mail-all-letters-but-separators "A-Za-z{|}'~0-9`")
|
||
|
||
;; Any character that can start a name
|
||
(defconst mail-first-letters "A-Za-z")
|
||
|
||
;; Any character that can end a name.
|
||
(defconst mail-last-letters "A-Za-z`'.")
|
||
|
||
;; Matches an initial not followed by both a period and a space.
|
||
(defconst mail-bad-initials-pattern
|
||
(format "\\(\\([^%s]\\|\\`\\)[%s]\\)\\(\\.\\([^ ]\\)\\| \\|\\([^%s .]\\)\\|\\'\\)"
|
||
mail-all-letters mail-first-letters mail-all-letters))
|
||
|
||
(defconst mail-non-name-chars (concat "^" mail-all-letters "."))
|
||
|
||
(defconst mail-non-begin-name-chars (concat "^" mail-first-letters))
|
||
|
||
(defconst mail-non-end-name-chars (concat "^" mail-last-letters))
|
||
|
||
;; Matches periods used instead of spaces. Must not match the period
|
||
;; following an initial.
|
||
(defconst mail-bad-\.-pattern
|
||
(format "\\([%s][%s]\\)\\.+\\([%s]\\)"
|
||
mail-all-letters mail-last-letters mail-first-letters))
|
||
|
||
;; Matches an embedded or leading nickname that should be removed.
|
||
(defconst mail-nickname-pattern
|
||
(format "\\([ .]\\|\\`\\)[\"'`\[\(]\\([ .%s]+\\)[\]\"'\)] "
|
||
mail-all-letters))
|
||
|
||
;; Matches a leading title that is not part of the name (does not
|
||
;; contribute to uniquely identifying the person).
|
||
(defconst mail-full-name-prefixes
|
||
'"\\` *\\(Prof\\|Dr\\|Mrs?\\|Rev\\|Rabbi\\|SysOp\\|LCDR\\)\\.? ")
|
||
|
||
;; Matches the occurrence of a generational name suffix, and the last
|
||
;; character of the preceding name.
|
||
(defconst mail-full-name-suffix-pattern
|
||
(format
|
||
"\\(,? ?\\([JjSs]r\\.?\\|V?I+V?\\)\\)\\([^%s]\\([^%s]\\|\\'\\)\\|\\'\\)"
|
||
mail-all-letters mail-all-letters))
|
||
|
||
(defconst mail-roman-numeral-pattern
|
||
"V?I+V?\\b")
|
||
|
||
;; Matches a trailing uppercase (with other characters possible) acronym.
|
||
;; Must not match a trailing uppercase last name or trailing initial
|
||
(defconst mail-weird-acronym-pattern "\\([A-Z]+[-_/]\\|[A-Z][A-Z][A-Z]?\\b\\)")
|
||
|
||
;; Matches a mixed-case or lowercase name (not an initial).
|
||
(defconst mail-mixed-case-name-pattern
|
||
(format
|
||
"\\b\\([a-z][%s]*[%s]\\|[%s][%s]*[a-z][%s]*[%s]\\|[%s][%s]*[a-z]\\)"
|
||
mail-all-letters mail-last-letters
|
||
mail-first-letters mail-all-letters mail-all-letters mail-last-letters
|
||
mail-first-letters mail-all-letters))
|
||
|
||
;; Matches a trailing alternative address.
|
||
(defconst mail-alternative-address-pattern "[a-zA-Z.]+[!@][a-zA-Z.]")
|
||
|
||
;; Matches a variety of trailing comments not including comma-delimited
|
||
;; comments.
|
||
(defconst mail-trailing-comment-start-pattern " [-{]\\|--\\|[+@#></\;]")
|
||
|
||
;; Matches a name (not an initial).
|
||
;; This doesn't force a word boundary at the end because sometimes a
|
||
;; comment is separated by a `-' with no preceding space.
|
||
(defconst mail-name-pattern
|
||
(format
|
||
"\\b[%s][%s]*[%s]"
|
||
mail-first-letters mail-all-letters mail-last-letters))
|
||
|
||
(defconst mail-initial-pattern
|
||
(format "\\b[%s]\\([. ]\\|\\b\\)" mail-first-letters))
|
||
|
||
;; Matches a single name before a comma.
|
||
(defconst mail-last-name-first-pattern
|
||
(concat "\\`" mail-name-pattern ","))
|
||
|
||
;; Matches telephone extensions.
|
||
(defconst mail-telephone-extension-pattern
|
||
"\\(\\([Ee]xt\\|[Tt]el\\|[Xx]\\).?\\)? *\\+?[0-9][- 0-9]+")
|
||
|
||
;; Matches ham radio call signs.
|
||
(defconst mail-ham-call-sign-pattern
|
||
"\\b[A-Z]+[0-9][A-Z0-9]*")
|
||
|
||
;; Matches normal single-part name
|
||
(defconst mail-normal-name-pattern
|
||
(format
|
||
"\\b[%s][%s]+[%s]"
|
||
mail-first-letters mail-all-letters-but-separators mail-last-letters))
|
||
|
||
;; Matches normal two names with missing middle initial
|
||
(defconst mail-two-name-pattern
|
||
(concat "\\`\\(" mail-normal-name-pattern
|
||
"\\|" mail-initial-pattern
|
||
"\\) +\\(" mail-normal-name-pattern "\\)\\(,\\|\\'\\)"))
|
||
|
||
(defvar address-syntax-table (make-syntax-table))
|
||
(defvar address-comment-syntax-table (make-syntax-table))
|
||
(defvar address-domain-literal-syntax-table (make-syntax-table))
|
||
(defvar address-text-comment-syntax-table (make-syntax-table))
|
||
(defvar address-text-syntax-table (make-syntax-table))
|
||
(mapcar
|
||
(function
|
||
(lambda (pair)
|
||
(let ((syntax-table (symbol-value (car pair))))
|
||
(mapcar
|
||
(function
|
||
(lambda (item)
|
||
(if (eq 2 (length item))
|
||
(modify-syntax-entry (car item) (car (cdr item)) syntax-table)
|
||
(let ((char (car item))
|
||
(bound (car (cdr item)))
|
||
(syntax (car (cdr (cdr item)))))
|
||
(while (<= char bound)
|
||
(modify-syntax-entry char syntax syntax-table)
|
||
(setq char (1+ char)))))))
|
||
(cdr pair)))))
|
||
'((address-syntax-table
|
||
(0 31 "w") ;control characters
|
||
(32 " ") ;SPC
|
||
(?! ?~ "w") ;printable characters
|
||
(127 "w") ;DEL
|
||
(128 255 "w") ;high-bit-on characters
|
||
(?\t " ")
|
||
(?\r " ")
|
||
(?\n " ")
|
||
(?\( ".")
|
||
(?\) ".")
|
||
(?< ".")
|
||
(?> ".")
|
||
(?@ ".")
|
||
(?, ".")
|
||
(?\; ".")
|
||
(?: ".")
|
||
(?\\ "\\")
|
||
(?\" "\"")
|
||
(?. ".")
|
||
(?\[ ".")
|
||
(?\] ".")
|
||
;; % and ! aren't RFC822 characters, but it is convenient to pretend
|
||
(?% ".")
|
||
(?! ".")
|
||
)
|
||
(address-comment-syntax-table
|
||
(0 255 "w")
|
||
(?\( "\(\)")
|
||
(?\) "\)\(")
|
||
(?\\ "\\"))
|
||
(address-domain-literal-syntax-table
|
||
(0 255 "w")
|
||
(?\[ "\(\]") ;??????
|
||
(?\] "\)\[") ;??????
|
||
(?\\ "\\"))
|
||
(address-text-comment-syntax-table
|
||
(0 255 "w")
|
||
(?\( "\(\)")
|
||
(?\) "\)\(")
|
||
(?\[ "\(\]")
|
||
(?\] "\)\[")
|
||
(?\{ "\(\}")
|
||
(?\} "\)\{")
|
||
(?\\ "\\")
|
||
(?\" "\"")
|
||
;; (?\' "\)\`")
|
||
;; (?\` "\(\'")
|
||
)
|
||
(address-text-syntax-table
|
||
(0 255 ".")
|
||
(?A ?Z "w")
|
||
(?a ?z "w")
|
||
(?- "w")
|
||
(?\} "w")
|
||
(?\{ "w")
|
||
(?| "w")
|
||
(?\' "w")
|
||
(?~ "w")
|
||
(?0 ?9 "w"))
|
||
))
|
||
|
||
|
||
;; Utility functions and macros.
|
||
|
||
(defmacro undo-backslash-quoting (beg end)
|
||
(`(save-excursion
|
||
(save-restriction
|
||
(narrow-to-region (, beg) (, end))
|
||
(goto-char (point-min))
|
||
;; undo \ quoting
|
||
(while (re-search-forward "\\\\\\(.\\)" nil t)
|
||
(replace-match "\\1")
|
||
;; CHECK: does this leave point after the replacement?
|
||
)))))
|
||
|
||
(defmacro mail-nuke-char-at (pos)
|
||
(` (save-excursion
|
||
(goto-char (, pos))
|
||
(delete-char 1)
|
||
(insert mail-space-char))))
|
||
|
||
(defmacro mail-nuke-elements-outside-range (list-symbol beg-symbol end-symbol
|
||
&optional no-replace)
|
||
(` (progn
|
||
(setq temp (, list-symbol))
|
||
(while temp
|
||
(cond ((or (> (car temp) (, end-symbol))
|
||
(< (car temp) (, beg-symbol)))
|
||
(, (or no-replace
|
||
(` (mail-nuke-char-at (car temp)))))
|
||
(setcar temp nil)))
|
||
(setq temp (cdr temp)))
|
||
(setq (, list-symbol) (delq nil (, list-symbol))))))
|
||
|
||
(defun mail-demarkerize (marker)
|
||
(and marker
|
||
(if (markerp marker)
|
||
(let ((temp (marker-position marker)))
|
||
(set-marker marker nil)
|
||
temp)
|
||
marker)))
|
||
|
||
(defun mail-markerize (pos)
|
||
(and pos
|
||
(if (markerp pos)
|
||
pos
|
||
(copy-marker pos))))
|
||
|
||
(defmacro mail-last-element (list)
|
||
"Return last element of LIST."
|
||
(` (let ((list (, list)))
|
||
(while (not (null (cdr list)))
|
||
(setq list (cdr list)))
|
||
(car list))))
|
||
|
||
(defmacro safe-move-sexp (arg)
|
||
"Safely skip over one balanced sexp, if there is one. Return t if success."
|
||
(` (condition-case error
|
||
(progn
|
||
(goto-char (scan-sexps (point) (, arg)))
|
||
t)
|
||
(error
|
||
(if (string-equal (nth 1 error) "Unbalanced parentheses")
|
||
nil
|
||
(while t
|
||
(signal (car error) (cdr error))))))))
|
||
|
||
|
||
;; The main function to grind addresses
|
||
|
||
(defun mail-extract-address-components (address)
|
||
"Given an rfc 822 ADDRESS, extract full name and canonical address.
|
||
Returns a list of the form (FULL-NAME CANONICAL-ADDRESS)."
|
||
(let ((canonicalization-buffer (get-buffer-create "*canonical address*"))
|
||
(extraction-buffer (get-buffer-create "*extract address components*"))
|
||
(foo 'bar)
|
||
char
|
||
multiple-addresses
|
||
<-pos >-pos @-pos :-pos ,-pos !-pos %-pos \;-pos
|
||
group-:-pos group-\;-pos route-addr-:-pos
|
||
record-pos-symbol
|
||
first-real-pos last-real-pos
|
||
phrase-beg phrase-end
|
||
comment-beg comment-end
|
||
quote-beg quote-end
|
||
atom-beg atom-end
|
||
mbox-beg mbox-end
|
||
\.-ends-name
|
||
temp
|
||
name-suffix
|
||
saved-point
|
||
fi mi li
|
||
saved-%-pos saved-!-pos saved-@-pos
|
||
domain-pos \.-pos insert-point)
|
||
|
||
(save-excursion
|
||
(set-buffer extraction-buffer)
|
||
(buffer-disable-undo extraction-buffer)
|
||
(set-syntax-table address-syntax-table)
|
||
(widen)
|
||
(erase-buffer)
|
||
(setq case-fold-search nil)
|
||
|
||
;; Insert extra space at beginning to allow later replacement with <
|
||
;; without having to move markers.
|
||
(insert mail-space-char address)
|
||
|
||
;; stolen from rfc822.el
|
||
;; Unfold multiple lines.
|
||
(goto-char (point-min))
|
||
(while (re-search-forward "\\([^\\]\\(\\\\\\\\\\)*\\)\n[ \t]" nil t)
|
||
(replace-match "\\1 " t))
|
||
|
||
;; first pass grabs useful information about address
|
||
(goto-char (point-min))
|
||
(while (progn
|
||
(skip-chars-forward mail-whitespace)
|
||
(not (eobp)))
|
||
(setq char (char-after (point)))
|
||
(or first-real-pos
|
||
(if (not (eq char ?\())
|
||
(setq first-real-pos (point))))
|
||
(cond
|
||
;; comment
|
||
((eq char ?\()
|
||
(set-syntax-table address-comment-syntax-table)
|
||
;; only record the first non-empty comment's position
|
||
(if (and (not comment-beg)
|
||
(save-excursion
|
||
(forward-char 1)
|
||
(skip-chars-forward mail-whitespace)
|
||
(not (eq ?\) (char-after (point))))))
|
||
(setq comment-beg (point)))
|
||
;; TODO: don't record if unbalanced
|
||
(or (safe-move-sexp 1)
|
||
(forward-char 1))
|
||
(set-syntax-table address-syntax-table)
|
||
(if (and comment-beg
|
||
(not comment-end))
|
||
(setq comment-end (point))))
|
||
;; quoted text
|
||
((eq char ?\")
|
||
;; only record the first non-empty quote's position
|
||
(if (and (not quote-beg)
|
||
(save-excursion
|
||
(forward-char 1)
|
||
(skip-chars-forward mail-whitespace)
|
||
(not (eq ?\" (char-after (point))))))
|
||
(setq quote-beg (point)))
|
||
;; TODO: don't record if unbalanced
|
||
(or (safe-move-sexp 1)
|
||
(forward-char 1))
|
||
(if (and quote-beg
|
||
(not quote-end))
|
||
(setq quote-end (point))))
|
||
;; domain literals
|
||
((eq char ?\[)
|
||
(set-syntax-table address-domain-literal-syntax-table)
|
||
(or (safe-move-sexp 1)
|
||
(forward-char 1))
|
||
(set-syntax-table address-syntax-table))
|
||
;; commas delimit addresses when outside < > pairs.
|
||
((and (eq char ?,)
|
||
(or (null <-pos)
|
||
(and >-pos
|
||
;; handle weird munged addresses
|
||
(> (mail-last-element <-pos) (car >-pos)))))
|
||
(setq multiple-addresses t)
|
||
(delete-char 1)
|
||
(narrow-to-region (point-min) (point)))
|
||
;; record the position of various interesting chars, determine
|
||
;; legality later.
|
||
((setq record-pos-symbol
|
||
(cdr (assq char
|
||
'((?< . <-pos) (?> . >-pos) (?@ . @-pos)
|
||
(?: . :-pos) (?, . ,-pos) (?! . !-pos)
|
||
(?% . %-pos) (?\; . \;-pos)))))
|
||
(set record-pos-symbol
|
||
(cons (point) (symbol-value record-pos-symbol)))
|
||
(forward-char 1))
|
||
((eq char ?.)
|
||
(forward-char 1))
|
||
((memq char '(
|
||
;; comment terminator illegal
|
||
?\)
|
||
;; domain literal terminator illegal
|
||
?\]
|
||
;; \ allowed only within quoted strings,
|
||
;; domain literals, and comments
|
||
?\\
|
||
))
|
||
(mail-nuke-char-at (point))
|
||
(forward-char 1))
|
||
(t
|
||
(forward-word 1)))
|
||
(or (eq char ?\()
|
||
(setq last-real-pos (point))))
|
||
|
||
;; Use only the leftmost <, if any. Replace all others with spaces.
|
||
(while (cdr <-pos)
|
||
(mail-nuke-char-at (car <-pos))
|
||
(setq <-pos (cdr <-pos)))
|
||
|
||
;; Use only the rightmost >, if any. Replace all others with spaces.
|
||
(while (cdr >-pos)
|
||
(mail-nuke-char-at (nth 1 >-pos))
|
||
(setcdr >-pos (nthcdr 2 >-pos)))
|
||
|
||
;; If multiple @s and a :, but no < and >, insert around buffer.
|
||
;; This commonly happens on the UUCP "From " line. Ugh.
|
||
(cond ((and (> (length @-pos) 1)
|
||
:-pos ;TODO: check if between @s
|
||
(not <-pos))
|
||
(goto-char (point-min))
|
||
(delete-char 1)
|
||
(setq <-pos (list (point)))
|
||
(insert ?<)))
|
||
|
||
;; If < but no >, insert > in rightmost possible position
|
||
(cond ((and <-pos
|
||
(null >-pos))
|
||
(goto-char (point-max))
|
||
(setq >-pos (list (point)))
|
||
(insert ?>)))
|
||
|
||
;; If > but no <, replace > with space.
|
||
(cond ((and >-pos
|
||
(null <-pos))
|
||
(mail-nuke-char-at (car >-pos))
|
||
(setq >-pos nil)))
|
||
|
||
;; Turn >-pos and <-pos into non-lists
|
||
(setq >-pos (car >-pos)
|
||
<-pos (car <-pos))
|
||
|
||
;; Trim other punctuation lists of items outside < > pair to handle
|
||
;; stupid MTAs.
|
||
(cond (<-pos ; don't need to check >-pos also
|
||
;; handle bozo software that violates RFC 822 by sticking
|
||
;; punctuation marks outside of a < > pair
|
||
(mail-nuke-elements-outside-range @-pos <-pos >-pos t)
|
||
;; RFC 822 says nothing about these two outside < >, but
|
||
;; remove those positions from the lists to make things
|
||
;; easier.
|
||
(mail-nuke-elements-outside-range !-pos <-pos >-pos t)
|
||
(mail-nuke-elements-outside-range %-pos <-pos >-pos t)))
|
||
|
||
;; Check for : that indicates GROUP list and for : part of
|
||
;; ROUTE-ADDR spec.
|
||
;; Can't possibly be more than two :. Nuke any extra.
|
||
(while :-pos
|
||
(setq temp (car :-pos)
|
||
:-pos (cdr :-pos))
|
||
(cond ((and <-pos >-pos
|
||
(> temp <-pos)
|
||
(< temp >-pos))
|
||
(if (or route-addr-:-pos
|
||
(< (length @-pos) 2)
|
||
(> temp (car @-pos))
|
||
(< temp (nth 1 @-pos)))
|
||
(mail-nuke-char-at temp)
|
||
(setq route-addr-:-pos temp)))
|
||
((or (not <-pos)
|
||
(and <-pos
|
||
(< temp <-pos)))
|
||
(setq group-:-pos temp))))
|
||
|
||
;; Nuke any ; that is in or to the left of a < > pair or to the left
|
||
;; of a GROUP starting :. Also, there may only be one ;.
|
||
(while \;-pos
|
||
(setq temp (car \;-pos)
|
||
\;-pos (cdr \;-pos))
|
||
(cond ((and <-pos >-pos
|
||
(> temp <-pos)
|
||
(< temp >-pos))
|
||
(mail-nuke-char-at temp))
|
||
((and (or (not group-:-pos)
|
||
(> temp group-:-pos))
|
||
(not group-\;-pos))
|
||
(setq group-\;-pos temp))))
|
||
|
||
;; Handle junk like ";@host.company.dom" that sendmail adds.
|
||
;; **** should I remember comment positions?
|
||
(and group-\;-pos
|
||
;; this is fine for now
|
||
(mail-nuke-elements-outside-range !-pos group-:-pos group-\;-pos t)
|
||
(mail-nuke-elements-outside-range @-pos group-:-pos group-\;-pos t)
|
||
(mail-nuke-elements-outside-range %-pos group-:-pos group-\;-pos t)
|
||
(mail-nuke-elements-outside-range ,-pos group-:-pos group-\;-pos t)
|
||
(and last-real-pos
|
||
(> last-real-pos (1+ group-\;-pos))
|
||
(setq last-real-pos (1+ group-\;-pos)))
|
||
(and comment-end
|
||
(> comment-end group-\;-pos)
|
||
(setq comment-end nil
|
||
comment-beg nil))
|
||
(and quote-end
|
||
(> quote-end group-\;-pos)
|
||
(setq quote-end nil
|
||
quote-beg nil))
|
||
(narrow-to-region (point-min) group-\;-pos))
|
||
|
||
;; Any commas must be between < and : of ROUTE-ADDR. Nuke any
|
||
;; others.
|
||
;; Hell, go ahead an nuke all of the commas.
|
||
;; **** This will cause problems when we start handling commas in
|
||
;; the PHRASE part .... no it won't ... yes it will ... ?????
|
||
(mail-nuke-elements-outside-range ,-pos 1 1)
|
||
|
||
;; can only have multiple @s inside < >. The fact that some MTAs
|
||
;; put de-bracketed ROUTE-ADDRs in the UUCP-style "From " line is
|
||
;; handled above.
|
||
|
||
;; Locate PHRASE part of ROUTE-ADDR.
|
||
(cond (<-pos
|
||
(goto-char <-pos)
|
||
(skip-chars-backward mail-whitespace)
|
||
(setq phrase-end (point))
|
||
(goto-char (or ;;group-:-pos
|
||
(point-min)))
|
||
(skip-chars-forward mail-whitespace)
|
||
(if (< (point) phrase-end)
|
||
(setq phrase-beg (point))
|
||
(setq phrase-end nil))))
|
||
|
||
;; handle ROUTE-ADDRS with real ROUTEs.
|
||
;; If there are multiple @s, then we assume ROUTE-ADDR syntax, and
|
||
;; any % or ! must be semantically meaningless.
|
||
;; TODO: do this processing into canonicalization buffer
|
||
(cond (route-addr-:-pos
|
||
(setq !-pos nil
|
||
%-pos nil
|
||
>-pos (copy-marker >-pos)
|
||
route-addr-:-pos (copy-marker route-addr-:-pos))
|
||
(goto-char >-pos)
|
||
(insert-before-markers ?X)
|
||
(goto-char (car @-pos))
|
||
(while (setq @-pos (cdr @-pos))
|
||
(delete-char 1)
|
||
(setq %-pos (cons (point-marker) %-pos))
|
||
(insert "%")
|
||
(goto-char (1- >-pos))
|
||
(save-excursion
|
||
(insert-buffer-substring extraction-buffer
|
||
(car @-pos) route-addr-:-pos)
|
||
(delete-region (car @-pos) route-addr-:-pos))
|
||
(or (cdr @-pos)
|
||
(setq saved-@-pos (list (point)))))
|
||
(setq @-pos saved-@-pos)
|
||
(goto-char >-pos)
|
||
(delete-char -1)
|
||
(mail-nuke-char-at route-addr-:-pos)
|
||
(mail-demarkerize route-addr-:-pos)
|
||
(setq route-addr-:-pos nil
|
||
>-pos (mail-demarkerize >-pos)
|
||
%-pos (mapcar 'mail-demarkerize %-pos))))
|
||
|
||
;; de-listify @-pos
|
||
(setq @-pos (car @-pos))
|
||
|
||
;; TODO: remove comments in the middle of an address
|
||
|
||
(set-buffer canonicalization-buffer)
|
||
|
||
(buffer-disable-undo canonicalization-buffer)
|
||
(set-syntax-table address-syntax-table)
|
||
(setq case-fold-search nil)
|
||
|
||
(widen)
|
||
(erase-buffer)
|
||
(insert-buffer-substring extraction-buffer)
|
||
|
||
(if <-pos
|
||
(narrow-to-region (progn
|
||
(goto-char (1+ <-pos))
|
||
(skip-chars-forward mail-whitespace)
|
||
(point))
|
||
>-pos)
|
||
;; ****** Oh no! What if the address is completely empty!
|
||
(narrow-to-region first-real-pos last-real-pos))
|
||
|
||
(and @-pos %-pos
|
||
(mail-nuke-elements-outside-range %-pos (point-min) @-pos))
|
||
(and %-pos !-pos
|
||
(mail-nuke-elements-outside-range !-pos (point-min) (car %-pos)))
|
||
(and @-pos !-pos (not %-pos)
|
||
(mail-nuke-elements-outside-range !-pos (point-min) @-pos))
|
||
|
||
;; Error condition:?? (and %-pos (not @-pos))
|
||
|
||
(cond (!-pos
|
||
;; **** I don't understand this save-restriction and the
|
||
;; narrow-to-region inside it. Why did I do that?
|
||
(save-restriction
|
||
(cond ((and @-pos
|
||
mail-@-binds-tighter-than-!)
|
||
(goto-char @-pos)
|
||
(setq %-pos (cons (point) %-pos)
|
||
@-pos nil)
|
||
(delete-char 1)
|
||
(insert "%")
|
||
(setq insert-point (point-max)))
|
||
(mail-@-binds-tighter-than-!
|
||
(setq insert-point (point-max)))
|
||
(%-pos
|
||
(setq insert-point (mail-last-element %-pos)
|
||
saved-%-pos (mapcar 'mail-markerize %-pos)
|
||
%-pos nil
|
||
@-pos (mail-markerize @-pos)))
|
||
(@-pos
|
||
(setq insert-point @-pos)
|
||
(setq @-pos (mail-markerize @-pos)))
|
||
(t
|
||
(setq insert-point (point-max))))
|
||
(narrow-to-region (point-min) insert-point)
|
||
(setq saved-!-pos (car !-pos))
|
||
(while !-pos
|
||
(goto-char (point-max))
|
||
(cond ((and (not @-pos)
|
||
(not (cdr !-pos)))
|
||
(setq @-pos (point))
|
||
(insert-before-markers "@ "))
|
||
(t
|
||
(setq %-pos (cons (point) %-pos))
|
||
(insert-before-markers "% ")))
|
||
(backward-char 1)
|
||
(insert-buffer-substring
|
||
(current-buffer)
|
||
(if (nth 1 !-pos)
|
||
(1+ (nth 1 !-pos))
|
||
(point-min))
|
||
(car !-pos))
|
||
(delete-char 1)
|
||
(or (save-excursion
|
||
(safe-move-sexp -1)
|
||
(skip-chars-backward mail-whitespace)
|
||
(eq ?. (preceding-char)))
|
||
(insert-before-markers
|
||
(if (save-excursion
|
||
(skip-chars-backward mail-whitespace)
|
||
(eq ?. (preceding-char)))
|
||
""
|
||
".")
|
||
"uucp"))
|
||
(setq !-pos (cdr !-pos))))
|
||
(and saved-%-pos
|
||
(setq %-pos (append (mapcar 'mail-demarkerize saved-%-pos)
|
||
%-pos)))
|
||
(setq @-pos (mail-demarkerize @-pos))
|
||
(narrow-to-region (1+ saved-!-pos) (point-max))))
|
||
(cond ((and %-pos
|
||
(not @-pos))
|
||
(goto-char (car %-pos))
|
||
(delete-char 1)
|
||
(setq @-pos (point))
|
||
(insert "@")
|
||
(setq %-pos (cdr %-pos))))
|
||
(setq %-pos (nreverse %-pos))
|
||
;; RFC 1034 doesn't approve of this, oh well:
|
||
(downcase-region (or (car %-pos) @-pos (point-max)) (point-max))
|
||
(cond (%-pos ; implies @-pos valid
|
||
(setq temp %-pos)
|
||
(catch 'truncated
|
||
(while temp
|
||
(goto-char (or (nth 1 temp)
|
||
@-pos))
|
||
(skip-chars-backward mail-whitespace)
|
||
(save-excursion
|
||
(safe-move-sexp -1)
|
||
(setq domain-pos (point))
|
||
(skip-chars-backward mail-whitespace)
|
||
(setq \.-pos (eq ?. (preceding-char))))
|
||
(cond ((and \.-pos
|
||
(get
|
||
(intern
|
||
(buffer-substring domain-pos (point)))
|
||
'domain-name))
|
||
(narrow-to-region (point-min) (point))
|
||
(goto-char (car temp))
|
||
(delete-char 1)
|
||
(setq @-pos (point))
|
||
(setcdr temp nil)
|
||
(setq %-pos (delq @-pos %-pos))
|
||
(insert "@")
|
||
(throw 'truncated t)))
|
||
(setq temp (cdr temp))))))
|
||
(setq mbox-beg (point-min)
|
||
mbox-end (if %-pos (car %-pos)
|
||
(or @-pos
|
||
(point-max))))
|
||
|
||
;; Done canonicalizing address.
|
||
|
||
(set-buffer extraction-buffer)
|
||
|
||
;; Find the full name
|
||
|
||
(cond ((and phrase-beg
|
||
(eq quote-beg phrase-beg)
|
||
(<= quote-end phrase-end))
|
||
(narrow-to-region (1+ quote-beg) (1- quote-end))
|
||
(undo-backslash-quoting (point-min) (point-max)))
|
||
(phrase-beg
|
||
(narrow-to-region phrase-beg phrase-end))
|
||
(comment-beg
|
||
(narrow-to-region (1+ comment-beg) (1- comment-end))
|
||
(undo-backslash-quoting (point-min) (point-max)))
|
||
(t
|
||
;; *** Work in canon buffer instead? No, can't. Hmm.
|
||
(delete-region (point-min) (point-max))
|
||
(insert-buffer-substring canonicalization-buffer
|
||
mbox-beg mbox-end)
|
||
(goto-char (point-min))
|
||
(setq \.-ends-name (search-forward "_" nil t))
|
||
(goto-char (point-min))
|
||
(while (progn
|
||
(skip-chars-forward mail-whitespace)
|
||
(not (eobp)))
|
||
(setq char (char-after (point)))
|
||
(cond
|
||
((eq char ?\")
|
||
(setq quote-beg (point))
|
||
(or (safe-move-sexp 1)
|
||
;; TODO: handle this error condition!!!!!
|
||
(forward-char 1))
|
||
;; take into account deletions
|
||
(setq quote-end (- (point) 2))
|
||
(save-excursion
|
||
(backward-char 1)
|
||
(delete-char 1)
|
||
(goto-char quote-beg)
|
||
(delete-char 1))
|
||
(undo-backslash-quoting quote-beg quote-end)
|
||
(or (eq mail-space-char (char-after (point)))
|
||
(insert " "))
|
||
(setq \.-ends-name t))
|
||
((eq char ?.)
|
||
(if (eq (char-after (1+ (point))) ?_)
|
||
(progn
|
||
(forward-char 1)
|
||
(delete-char 1)
|
||
(insert mail-space-char))
|
||
(if \.-ends-name
|
||
(narrow-to-region (point-min) (point))
|
||
(delete-char 1)
|
||
(insert " "))))
|
||
((memq (char-syntax char) '(?. ?\\))
|
||
(delete-char 1)
|
||
(insert " "))
|
||
(t
|
||
(setq atom-beg (point))
|
||
(forward-word 1)
|
||
(setq atom-end (point))
|
||
(save-restriction
|
||
(narrow-to-region atom-beg atom-end)
|
||
(goto-char (point-min))
|
||
(while (re-search-forward "\\([^_]+\\)_" nil t)
|
||
(replace-match "\\1 "))
|
||
(goto-char (point-max))))))))
|
||
|
||
(set-syntax-table address-text-syntax-table)
|
||
|
||
(setq xxx (variant-method (buffer-string)))
|
||
(delete-region (point-min) (point-max))
|
||
(insert xxx)
|
||
(goto-char (point-min))
|
||
|
||
;; ;; Compress whitespace
|
||
;; (goto-char (point-min))
|
||
;; (while (re-search-forward "[ \t\n]+" nil t)
|
||
;; (replace-match " "))
|
||
;;
|
||
;; ;; Fix . used as space
|
||
;; (goto-char (point-min))
|
||
;; (while (re-search-forward mail-bad-\.-pattern nil t)
|
||
;; (replace-match "\\1 \\2"))
|
||
;;
|
||
;; ;; Delete trailing parenthesized comment
|
||
;; (goto-char (point-max))
|
||
;; (skip-chars-backward mail-whitespace)
|
||
;; (cond ((memq (char-after (1- (point))) '(?\) ?\} ?\]))
|
||
;; (setq comment-end (point))
|
||
;; (set-syntax-table address-text-comment-syntax-table)
|
||
;; (or (safe-move-sexp -1)
|
||
;; (backward-char 1))
|
||
;; (set-syntax-table address-text-syntax-table)
|
||
;; (setq comment-beg (point))
|
||
;; (skip-chars-backward mail-whitespace)
|
||
;; (if (bobp)
|
||
;; (narrow-to-region (1+ comment-beg) (1- comment-end))
|
||
;; (narrow-to-region (point-min) (point)))))
|
||
;;
|
||
;; ;; Find, save, and delete any name suffix
|
||
;; ;; *** Broken!
|
||
;; (goto-char (point-min))
|
||
;; (cond ((re-search-forward mail-full-name-suffix-pattern nil t)
|
||
;; (setq name-suffix (buffer-substring (match-beginning 3)
|
||
;; (match-end 3)))
|
||
;; (replace-match "\\1 \\4")))
|
||
;;
|
||
;; ;; Delete ALL CAPS words and after, if preceded by mixed-case or
|
||
;; ;; lowercase words. Eg. XT-DEM.
|
||
;; (goto-char (point-min))
|
||
;; ;; ## This will lose on something like "SMITH MAX".
|
||
;; ;; ## maybe it should be
|
||
;; ;; ## " \\([A-Z]+[-_/][A-Z]+\\|[A-Z][A-Z][A-Z]\\)\\b.*[^A-Z \t]"
|
||
;; ;; ## that is, three-letter-upper-case-word with non-upper-case
|
||
;; ;; ## characters following it.
|
||
;; (if (re-search-forward mail-mixed-case-name-pattern nil t)
|
||
;; (if (re-search-forward mail-weird-acronym-pattern nil t)
|
||
;; (narrow-to-region (point-min) (match-beginning 0))))
|
||
;;
|
||
;; ;; Delete trailing alternative address
|
||
;; (goto-char (point-min))
|
||
;; (if (re-search-forward mail-alternative-address-pattern nil t)
|
||
;; (narrow-to-region (point-min) (match-beginning 0)))
|
||
;;
|
||
;; ;; Delete trailing comment
|
||
;; (goto-char (point-min))
|
||
;; (if (re-search-forward mail-trailing-comment-start-pattern nil t)
|
||
;; (or (progn
|
||
;; (goto-char (match-beginning 0))
|
||
;; (skip-chars-backward mail-whitespace)
|
||
;; (bobp))
|
||
;; (narrow-to-region (point-min) (match-beginning 0))))
|
||
;;
|
||
;; ;; Delete trailing comma-separated comment
|
||
;; (goto-char (point-min))
|
||
;; ;; ## doesn't this break "Smith, John"? Yes.
|
||
;; (re-search-forward mail-last-name-first-pattern nil t)
|
||
;; (while (search-forward "," nil t)
|
||
;; (or (save-excursion
|
||
;; (backward-char 2)
|
||
;; (looking-at mail-full-name-suffix-pattern))
|
||
;; (narrow-to-region (point-min) (1- (point)))))
|
||
;;
|
||
;; ;; Delete telephone numbers and ham radio call signs
|
||
;; (goto-char (point-min))
|
||
;; (if (re-search-forward mail-telephone-extension-pattern nil t)
|
||
;; (narrow-to-region (point-min) (match-beginning 0)))
|
||
;; (goto-char (point-min))
|
||
;; (if (re-search-forward mail-ham-call-sign-pattern nil t)
|
||
;; (if (eq (match-beginning 0) (point-min))
|
||
;; (narrow-to-region (match-end 0) (point-max))
|
||
;; (narrow-to-region (point-min) (match-beginning 0))))
|
||
;;
|
||
;; ;; Delete trailing word followed immediately by .
|
||
;; (goto-char (point-min))
|
||
;; ;; ## what's this for? doesn't it mess up "Public, Harry Q."? No.
|
||
;; (if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t)
|
||
;; (narrow-to-region (point-min) (match-beginning 0)))
|
||
;;
|
||
;; ;; Handle & substitution
|
||
;; ;; TODO: remember to disable middle initial guessing
|
||
;; (goto-char (point-min))
|
||
;; (cond ((re-search-forward "\\( \\|\\`\\)&\\( \\|\\'\\)" nil t)
|
||
;; (goto-char (match-end 1))
|
||
;; (delete-char 1)
|
||
;; (capitalize-region
|
||
;; (point)
|
||
;; (progn
|
||
;; (insert-buffer-substring canonicalization-buffer
|
||
;; mbox-beg mbox-end)
|
||
;; (point)))))
|
||
;;
|
||
;; ;; Delete nickname
|
||
;; (goto-char (point-min))
|
||
;; (if (re-search-forward mail-nickname-pattern nil t)
|
||
;; (replace-match (if (eq (match-beginning 2) (1- (match-end 2)))
|
||
;; " \\2 "
|
||
;; " ")))
|
||
;;
|
||
;; ;; Fixup initials
|
||
;; (while (progn
|
||
;; (goto-char (point-min))
|
||
;; (re-search-forward mail-bad-initials-pattern nil t))
|
||
;; (replace-match
|
||
;; (if (match-beginning 4)
|
||
;; "\\1. \\4"
|
||
;; (if (match-beginning 5)
|
||
;; "\\1. \\5"
|
||
;; "\\1. "))))
|
||
;;
|
||
;; ;; Delete title
|
||
;; (goto-char (point-min))
|
||
;; (if (re-search-forward mail-full-name-prefixes nil t)
|
||
;; (narrow-to-region (point) (point-max)))
|
||
;;
|
||
;; ;; Delete trailing and preceding non-name characters
|
||
;; (goto-char (point-min))
|
||
;; (skip-chars-forward mail-non-begin-name-chars)
|
||
;; (narrow-to-region (point) (point-max))
|
||
;; (goto-char (point-max))
|
||
;; (skip-chars-backward mail-non-end-name-chars)
|
||
;; (narrow-to-region (point-min) (point))
|
||
|
||
;; If name is "First Last" and userid is "F?L", then assume
|
||
;; the middle initial is the second letter in the userid.
|
||
;; initially by Jamie Zawinski <jwz@lucid.com>
|
||
(cond ((and (eq 3 (- mbox-end mbox-beg))
|
||
(progn
|
||
(goto-char (point-min))
|
||
(looking-at mail-two-name-pattern)))
|
||
(setq fi (char-after (match-beginning 0))
|
||
li (char-after (match-beginning 3)))
|
||
(save-excursion
|
||
(set-buffer canonicalization-buffer)
|
||
;; char-equal is ignoring case here, so no need to upcase
|
||
;; or downcase.
|
||
(let ((case-fold-search t))
|
||
(and (char-equal fi (char-after mbox-beg))
|
||
(char-equal li (char-after (1- mbox-end)))
|
||
(setq mi (char-after (1+ mbox-beg))))))
|
||
(cond ((and mi
|
||
;; TODO: use better table than syntax table
|
||
(eq ?w (char-syntax mi)))
|
||
(goto-char (match-beginning 3))
|
||
(insert (upcase mi) ". ")))))
|
||
|
||
;; ;; Restore suffix
|
||
;; (cond (name-suffix
|
||
;; (goto-char (point-max))
|
||
;; (insert ", " name-suffix)
|
||
;; (backward-word 1)
|
||
;; (cond ((memq (following-char) '(?j ?J ?s ?S))
|
||
;; (capitalize-word 1)
|
||
;; (or (eq (following-char) ?.)
|
||
;; (insert ?.)))
|
||
;; (t
|
||
;; (upcase-word 1)))))
|
||
|
||
;; Result
|
||
(list (buffer-string)
|
||
(progn
|
||
(set-buffer canonicalization-buffer)
|
||
(buffer-string)))
|
||
)))
|
||
|
||
;; TODO: put this back in the above function now that it's proven:
|
||
(defun variant-method (string)
|
||
(let ((variant-buffer (get-buffer-create "*variant method buffer*"))
|
||
(word-count 0)
|
||
mixed-case-flag lower-case-flag upper-case-flag
|
||
suffix-flag last-name-comma-flag
|
||
comment-beg comment-end initial beg end
|
||
)
|
||
(save-excursion
|
||
(set-buffer variant-buffer)
|
||
(buffer-disable-undo variant-buffer)
|
||
(set-syntax-table address-text-syntax-table)
|
||
(widen)
|
||
(erase-buffer)
|
||
(setq case-fold-search nil)
|
||
|
||
(insert string)
|
||
|
||
;; Fix . used as space
|
||
(goto-char (point-min))
|
||
(while (re-search-forward mail-bad-\.-pattern nil t)
|
||
(replace-match "\\1 \\2"))
|
||
|
||
;; Skip any initial garbage.
|
||
(goto-char (point-min))
|
||
(skip-chars-forward mail-non-begin-name-chars)
|
||
(skip-chars-backward "& \"")
|
||
(narrow-to-region (point) (point-max))
|
||
|
||
(catch 'stop
|
||
(while t
|
||
(skip-chars-forward mail-whitespace)
|
||
|
||
(cond
|
||
|
||
;; Delete title
|
||
((and (eq word-count 0)
|
||
(looking-at mail-full-name-prefixes))
|
||
(goto-char (match-end 0))
|
||
(narrow-to-region (point) (point-max)))
|
||
|
||
;; Stop after name suffix
|
||
((and (>= word-count 2)
|
||
(looking-at mail-full-name-suffix-pattern))
|
||
(skip-chars-backward mail-whitespace)
|
||
(setq suffix-flag (point))
|
||
(if (eq ?, (following-char))
|
||
(forward-char 1)
|
||
(insert ?,))
|
||
;; Enforce at least one space after comma
|
||
(or (eq mail-space-char (following-char))
|
||
(insert mail-space-char))
|
||
(skip-chars-forward mail-whitespace)
|
||
(cond ((memq (following-char) '(?j ?J ?s ?S))
|
||
(capitalize-word 1)
|
||
(if (eq (following-char) ?.)
|
||
(forward-char 1)
|
||
(insert ?.)))
|
||
(t
|
||
(upcase-word 1)))
|
||
(setq word-count (1+ word-count))
|
||
(throw 'stop t))
|
||
|
||
;; Handle SCA names
|
||
((looking-at "MKA \\(.+\\)") ; "Mundanely Known As"
|
||
(setq word-count 0)
|
||
(goto-char (match-beginning 1))
|
||
(narrow-to-region (point) (point-max)))
|
||
|
||
;; Various stopping points
|
||
((or
|
||
;; Stop before ALL CAPS acronyms, if preceded by mixed-case or
|
||
;; lowercase words. Eg. XT-DEM.
|
||
(and (>= word-count 2)
|
||
(or mixed-case-flag lower-case-flag)
|
||
(looking-at mail-weird-acronym-pattern)
|
||
(not (looking-at mail-roman-numeral-pattern)))
|
||
;; Stop before 4-or-more letter lowercase words preceded by
|
||
;; mixed case or uppercase words.
|
||
(and (>= word-count 2)
|
||
(or upper-case-flag mixed-case-flag)
|
||
(looking-at "[a-z][a-z][a-z][a-z]+\\b"))
|
||
;; Stop before trailing alternative address
|
||
(looking-at mail-alternative-address-pattern)
|
||
;; Stop before trailing comment not introduced by comma
|
||
(looking-at mail-trailing-comment-start-pattern)
|
||
;; Stop before telephone numbers
|
||
(looking-at mail-telephone-extension-pattern))
|
||
(throw 'stop t))
|
||
|
||
;; Check for initial last name followed by comma
|
||
((and (eq ?, (following-char))
|
||
(eq word-count 1))
|
||
(forward-char 1)
|
||
(setq last-name-comma-flag t)
|
||
(or (eq mail-space-char (following-char))
|
||
(insert mail-space-char)))
|
||
|
||
;; Stop before trailing comma-separated comment
|
||
((eq ?, (following-char))
|
||
(throw 'stop t))
|
||
|
||
;; Delete parenthesized/quoted comment/nickname
|
||
((memq (following-char) '(?\( ?\{ ?\[ ?\" ?\' ?\`))
|
||
(setq comment-beg (point))
|
||
(set-syntax-table address-text-comment-syntax-table)
|
||
(cond ((memq (following-char) '(?\' ?\`))
|
||
(if (eq ?\' (following-char))
|
||
(forward-char 1))
|
||
(or (search-forward "'" nil t)
|
||
(delete-char 1)))
|
||
(t
|
||
(or (safe-move-sexp 1)
|
||
(goto-char (point-max)))))
|
||
(set-syntax-table address-text-syntax-table)
|
||
(setq comment-end (point))
|
||
(cond
|
||
;; Handle case of entire name being quoted
|
||
((and (eq word-count 0)
|
||
(looking-at " *\\'")
|
||
(>= (- comment-end comment-beg) 2))
|
||
(narrow-to-region (1+ comment-beg) (1- comment-end))
|
||
(goto-char (point-min)))
|
||
(t
|
||
;; Handle case of quoted initial
|
||
(if (and (or (= 3 (- comment-end comment-beg))
|
||
(and (= 4 (- comment-end comment-beg))
|
||
(eq ?. (char-after (+ 2 comment-beg)))))
|
||
(not (looking-at " *\\'")))
|
||
(setq initial (char-after (1+ comment-beg)))
|
||
(setq initial nil))
|
||
(delete-region comment-beg comment-end)
|
||
(if initial
|
||
(insert initial ". ")))))
|
||
|
||
;; Delete ham radio call signs
|
||
((looking-at mail-ham-call-sign-pattern)
|
||
(delete-region (match-beginning 0) (match-end 0)))
|
||
|
||
;; Handle & substitution
|
||
;; TODO: remember to disable middle initial guessing
|
||
((and (or (bobp)
|
||
(eq mail-space-char (preceding-char)))
|
||
(looking-at "&\\( \\|\\'\\)"))
|
||
(delete-char 1)
|
||
(capitalize-region
|
||
(point)
|
||
(progn
|
||
(insert-buffer-substring canonicalization-buffer
|
||
mbox-beg mbox-end)
|
||
(point))))
|
||
|
||
;; Fixup initials
|
||
((looking-at mail-initial-pattern)
|
||
(or (eq (following-char) (upcase (following-char)))
|
||
(setq lower-case-flag t))
|
||
(forward-char 1)
|
||
(if (eq ?. (following-char))
|
||
(forward-char 1)
|
||
(insert ?.))
|
||
(or (eq mail-space-char (following-char))
|
||
(insert mail-space-char))
|
||
(setq word-count (1+ word-count)))
|
||
|
||
;; Regular name words
|
||
((looking-at mail-name-pattern)
|
||
(setq beg (point))
|
||
(setq end (match-end 0))
|
||
(set (if (re-search-forward "[a-z]" end t)
|
||
(if (progn
|
||
(goto-char beg)
|
||
(re-search-forward "[A-Z]" end t))
|
||
'mixed-case-flag
|
||
'lower-case-flag)
|
||
'upper-case-flag) t)
|
||
(goto-char end)
|
||
(setq word-count (1+ word-count)))
|
||
|
||
(t
|
||
(throw 'stop t)))))
|
||
|
||
(narrow-to-region (point-min) (point))
|
||
|
||
;; Delete trailing word followed immediately by .
|
||
(cond ((not suffix-flag)
|
||
(goto-char (point-min))
|
||
(if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t)
|
||
(narrow-to-region (point-min) (match-beginning 0)))))
|
||
|
||
;; If last name first put it at end (but before suffix)
|
||
(cond (last-name-comma-flag
|
||
(goto-char (point-min))
|
||
(search-forward ",")
|
||
(setq end (1- (point)))
|
||
(goto-char (or suffix-flag (point-max)))
|
||
(or (eq mail-space-char (preceding-char))
|
||
(insert mail-space-char))
|
||
(insert-buffer-substring (current-buffer) (point-min) end)
|
||
(narrow-to-region (1+ end) (point-max))))
|
||
|
||
(goto-char (point-max))
|
||
(skip-chars-backward mail-non-end-name-chars)
|
||
(if (eq ?. (following-char))
|
||
(forward-char 1))
|
||
(narrow-to-region (point)
|
||
(progn
|
||
(goto-char (point-min))
|
||
(skip-chars-forward mail-non-begin-name-chars)
|
||
(point)))
|
||
|
||
;; Compress whitespace
|
||
(goto-char (point-min))
|
||
(while (re-search-forward "[ \t\n]+" nil t)
|
||
(replace-match " "))
|
||
|
||
(buffer-substring (point-min) (point-max))
|
||
|
||
)))
|
||
|
||
;; The country names are just in there for show right now, and because
|
||
;; Jamie thought it would be neat. They aren't used yet.
|
||
|
||
;; Keep in mind that the country abbreviations follow ISO-3166. There is
|
||
;; a U.S. FIPS that specifies a different set of two-letter country
|
||
;; abbreviations.
|
||
|
||
;; TODO: put this in its own obarray, instead of cluttering up the main
|
||
;; symbol table with junk.
|
||
|
||
(mapcar
|
||
(function
|
||
(lambda (x)
|
||
(if (symbolp x)
|
||
(put x 'domain-name t)
|
||
(put (car x) 'domain-name (nth 1 x)))))
|
||
'((ag "Antigua")
|
||
(ar "Argentina") ; Argentine Republic
|
||
arpa ; Advanced Projects Research Agency
|
||
(at "Austria") ; The Republic of _
|
||
(au "Australia")
|
||
(bb "Barbados")
|
||
(be "Belgium") ; The Kingdom of _
|
||
(bg "Bulgaria")
|
||
bitnet ; Because It's Time NET
|
||
(bo "Bolivia") ; Republic of _
|
||
(br "Brazil") ; The Federative Republic of _
|
||
(bs "Bahamas")
|
||
(bz "Belize")
|
||
(ca "Canada")
|
||
(ch "Switzerland") ; The Swiss Confederation
|
||
(cl "Chile") ; The Republic of _
|
||
(cn "China") ; The People's Republic of _
|
||
(co "Columbia")
|
||
com ; Commercial
|
||
(cr "Costa Rica") ; The Republic of _
|
||
(cs "Czechoslovakia")
|
||
(de "Germany")
|
||
(dk "Denmark")
|
||
(dm "Dominica")
|
||
(do "Dominican Republic") ; The _
|
||
(ec "Ecuador") ; The Republic of _
|
||
edu ; Educational
|
||
(eg "Egypt") ; The Arab Republic of _
|
||
(es "Spain") ; The Kingdom of _
|
||
(fi "Finland") ; The Republic of _
|
||
(fj "Fiji")
|
||
(fr "France")
|
||
gov ; Government (U.S.A.)
|
||
(gr "Greece") ; The Hellenic Republic
|
||
(hk "Hong Kong")
|
||
(hu "Hungary") ; The Hungarian People's Republic (???)
|
||
(ie "Ireland")
|
||
(il "Israel") ; The State of _
|
||
(in "India") ; The Republic of _
|
||
int ; something British, don't know what
|
||
(is "Iceland") ; The Republic of _
|
||
(it "Italy") ; The Italian Republic
|
||
(jm "Jamaica")
|
||
(jp "Japan")
|
||
(kn "St. Kitts and Nevis")
|
||
(kr "South Korea")
|
||
(lc "St. Lucia")
|
||
(lk "Sri Lanka") ; The Democratic Socialist Republic of _
|
||
mil ; Military (U.S.A.)
|
||
(mx "Mexico") ; The United Mexican States
|
||
(my "Malaysia") ; changed to Myanmar????
|
||
(na "Namibia")
|
||
nato ; North Atlantic Treaty Organization
|
||
net ; Network
|
||
(ni "Nicaragua") ; The Republic of _
|
||
(nl "Netherlands") ; The Kingdom of the _
|
||
(no "Norway") ; The Kingdom of _
|
||
(nz "New Zealand")
|
||
org ; Organization
|
||
(pe "Peru")
|
||
(pg "Papua New Guinea")
|
||
(ph "Philippines") ; The Republic of the _
|
||
(pl "Poland")
|
||
(pr "Puerto Rico")
|
||
(pt "Portugal") ; The Portugese Republic
|
||
(py "Paraguay")
|
||
(se "Sweden") ; The Kingdom of _
|
||
(sg "Singapore") ; The Republic of _
|
||
(sr "Suriname")
|
||
(su "Soviet Union")
|
||
(th "Thailand") ; The Kingdom of _
|
||
(tn "Tunisia")
|
||
(tr "Turkey") ; The Republic of _
|
||
(tt "Trinidad and Tobago")
|
||
(tw "Taiwan")
|
||
(uk "United Kingdom") ; The _ of Great Britain
|
||
unter-dom ; something German
|
||
(us "U.S.A.") ; The United States of America
|
||
uucp ; Unix to Unix CoPy
|
||
(uy "Uruguay") ; The Eastern Republic of _
|
||
(vc "St. Vincent and the Grenadines")
|
||
(ve "Venezuela") ; The Republic of _
|
||
(yu "Yugoslavia") ; The Socialist Federal Republic of _
|
||
;; Also said to be Zambia ...
|
||
(za "South Africa") ; The Republic of _ (why not Zaire???)
|
||
(zw "Zimbabwe") ; Republic of _
|
||
))
|
||
;; fipnet
|
||
|
||
|
||
;; Code for testing.
|
||
|
||
(defun time-extract ()
|
||
(let (times list)
|
||
(setq times (cons (current-time-string) times)
|
||
list problem-address-alist)
|
||
(while list
|
||
(mail-extract-address-components (car (car list)))
|
||
(setq list (cdr list)))
|
||
(setq times (cons (current-time-string) times))
|
||
(nreverse times)))
|
||
|
||
(defun test-extract (&optional starting-point)
|
||
(interactive)
|
||
(set-buffer (get-buffer-create "*Testing*"))
|
||
(erase-buffer)
|
||
(sit-for 0)
|
||
(mapcar 'test-extract-internal
|
||
(if starting-point
|
||
(memq starting-point problem-address-alist)
|
||
problem-address-alist)))
|
||
|
||
(defvar failed-item)
|
||
(defun test-extract-internal (item)
|
||
(setq failed-item item)
|
||
(let* ((address (car item))
|
||
(correct-name (nth 1 item))
|
||
(correct-canon (nth 2 item))
|
||
(result (mail-extract-address-components address))
|
||
(name (car result))
|
||
(canon (nth 1 result))
|
||
(name-correct (or (null correct-name)
|
||
(string-equal (downcase correct-name)
|
||
(downcase name))))
|
||
(canon-correct (or (null correct-canon)
|
||
(string-equal correct-canon canon))))
|
||
(cond ((not (and name-correct canon-correct))
|
||
(pop-to-buffer "*Testing*")
|
||
(select-window (get-buffer-window (current-buffer)))
|
||
(goto-char (point-max))
|
||
(insert "Address: " address "\n")
|
||
(if (not name-correct)
|
||
(insert " Correct Name: [" correct-name
|
||
"]\; Result: [" name "]\n"))
|
||
(if (not canon-correct)
|
||
(insert " Correct Canon: [" correct-canon
|
||
"]\; Result: [" canon "]\n"))
|
||
(insert "\n")
|
||
(sit-for 0))))
|
||
(setq failed-item nil))
|
||
|
||
(defun test-continue-extract ()
|
||
(interactive)
|
||
(test-extract failed-item))
|
||
|
||
|
||
;; Assorted junk.
|
||
|
||
;; warsaw@nlm.nih.gov (A Bad Dude -- Barry Warsaw)
|
||
|
||
;;'(from
|
||
;; reply-to
|
||
;; return-path
|
||
;; x-uucp-from
|
||
;; sender
|
||
;; resent-from
|
||
;; resent-sender
|
||
;; resent-reply-to)
|
||
|
||
;;; mail-extr.el ends here
|