809
|
1 ;;; mail-extr.el --- extract full name and address from RFC 822 mail header.
|
|
2
|
|
3 ;; Author: Joe Wells <jbw@cs.bu.edu>
|
|
4 ;; Last-Modified: 7 Apr 1992
|
|
5 ;; Version: 1.0
|
|
6 ;; Adapted-By: ESR
|
|
7 ;; Keywords: mail
|
|
8
|
|
9 ;; Copyright (C) 1992 Free Software Foundation, Inc.
|
|
10
|
|
11 ;; This file is part of GNU Emacs.
|
|
12
|
|
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
|
|
14 ;; it under the terms of the GNU General Public License as published by
|
|
15 ;; the Free Software Foundation; either version 1, or (at your option)
|
|
16 ;; any later version.
|
|
17
|
|
18 ;; GNU Emacs is distributed in the hope that it will be useful,
|
|
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
21 ;; GNU General Public License for more details.
|
|
22
|
|
23 ;; You should have received a copy of the GNU General Public License
|
|
24 ;; along with GNU Emacs; see the file COPYING. If not, write to
|
|
25 ;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
26
|
|
27 ;;; Commentary:
|
|
28
|
|
29 ;; Here is `mail-extr', a package for extracting full names and canonical
|
|
30 ;; addresses from RFC 822 mail headers. It is intended to be hooked into
|
|
31 ;; other Emacs Lisp packages that deal with RFC 822 format messages, such as
|
|
32 ;; Gnews, GNUS, RMAIL, MH-E, BBDB, VM, Supercite, etc. Thus, this release is
|
|
33 ;; mainly for Emacs Lisp developers.
|
|
34
|
|
35 ;; There are two main benefits:
|
|
36
|
|
37 ;; 1. Higher probability of getting the correct full name for a human than
|
|
38 ;; any other package I know of. (On the other hand, it will cheerfully
|
|
39 ;; mangle non-human names/comments.)
|
|
40 ;; 2. Address part is put in a canonical form.
|
|
41
|
|
42 ;; The interface is not yet carved in stone; please give me suggestions.
|
|
43
|
|
44 ;; I have an extensive test-case collection of funny addresses if you want to
|
|
45 ;; work with the code. Developing this code requires frequent testing to
|
|
46 ;; make sure you're not breaking functionality. I'm not posting the
|
|
47 ;; test-cases because they take over 100K.
|
|
48
|
|
49 ;; If you find an address that mail-extr fails on, please send it to me along
|
|
50 ;; with what you think the correct results should be. I do not consider it a
|
|
51 ;; bug if mail-extr mangles a comment that does not correspond to a real
|
|
52 ;; human full name, although I would prefer that mail-extr would return the
|
|
53 ;; comment as-is.
|
|
54
|
|
55 ;; Features:
|
|
56
|
|
57 ;; * Full name handling:
|
|
58
|
|
59 ;; * knows where full names can be found in an address.
|
|
60 ;; * avoids using empty comments and quoted text.
|
|
61 ;; * extracts full names from mailbox names.
|
|
62 ;; * recognizes common formats for comments after a full name.
|
|
63 ;; * puts a period and a space after each initial.
|
|
64 ;; * understands & referring to the mailbox name capitalized.
|
|
65 ;; * strips name prefixes like "Prof.", etc..
|
|
66 ;; * understands what characters can occur in names (not just letters).
|
|
67 ;; * figures out middle initial from mailbox name.
|
|
68 ;; * removes funny nicknames.
|
|
69 ;; * keeps suffixes such as Jr., Sr., III, etc.
|
|
70 ;; * reorders "Last, First" type names.
|
|
71
|
|
72 ;; * Address handling:
|
|
73
|
|
74 ;; * parses rfc822 quoted text, comments, and domain literals.
|
|
75 ;; * parses rfc822 multi-line headers.
|
|
76 ;; * does something reasonable with rfc822 GROUP addresses.
|
|
77 ;; * handles many rfc822 noncompliant and garbage addresses.
|
|
78 ;; * canonicalizes addresses (after stripping comments/phrases outside <>).
|
|
79 ;; * converts ! addresses into .UUCP and %-style addresses.
|
|
80 ;; * converts rfc822 ROUTE addresses to %-style addresses.
|
|
81 ;; * truncates %-style addresses at leftmost fully qualified domain name.
|
|
82 ;; * handles local relative precedence of ! vs. % and @ (untested).
|
|
83
|
|
84 ;; It does almost no string creation. It primarily uses the built-in
|
|
85 ;; parsing routines with the appropriate syntax tables. This should
|
|
86 ;; result in greater speed.
|
|
87
|
|
88 ;; TODO:
|
|
89
|
|
90 ;; * handle all test cases. (This will take forever.)
|
|
91 ;; * software to pick the correct header to use (eg., "Senders-Name:").
|
|
92 ;; * multiple addresses in the "From:" header (almost all of the necessary
|
|
93 ;; code is there).
|
|
94 ;; * flag to not treat `,' as an address separator. (This is useful when
|
|
95 ;; there is a "From:" header but no "Sender:" header, because then there
|
|
96 ;; is only allowed to be one address.)
|
|
97 ;; * mailbox name does not necessarily contain full name.
|
|
98 ;; * fixing capitalization when it's all upper or lowercase. (Hard!)
|
|
99 ;; * some of the domain literal handling is missing. (But I've never even
|
|
100 ;; seen one of these in a mail address, so maybe no big deal.)
|
|
101 ;; * arrange to have syntax tables byte-compiled.
|
|
102 ;; * speed hacks.
|
|
103 ;; * delete unused variables.
|
|
104 ;; * arrange for testing with different relative precedences of ! vs. @
|
|
105 ;; and %.
|
|
106 ;; * put variant-method back into mail-extract-address-components.
|
|
107 ;; * insert documentation strings!
|
|
108 ;; * handle X.400-gatewayed addresses according to RFC 1148.
|
|
109
|
|
110 ;;; Change Log:
|
|
111 ;;
|
|
112 ;; Mon Apr 6 23:59:09 1992 Joe Wells (jbw at bigbird.bu.edu)
|
|
113 ;;
|
|
114 ;; * Cleaned up some more. Release version 1.0 to world.
|
|
115 ;;
|
|
116 ;; Sun Apr 5 19:39:08 1992 Joe Wells (jbw at bigbird.bu.edu)
|
|
117 ;;
|
|
118 ;; * Cleaned up full name extraction extensively.
|
|
119 ;;
|
|
120 ;; Sun Feb 2 14:45:24 1992 Joe Wells (jbw at bigbird.bu.edu)
|
|
121 ;;
|
|
122 ;; * Total rewrite. Integrated mail-canonicalize-address into
|
|
123 ;; mail-extract-address-components. Now handles GROUP addresses more
|
|
124 ;; or less correctly. Better handling of lots of different cases.
|
|
125 ;;
|
|
126 ;; Fri Jun 14 19:39:50 1991
|
|
127 ;; * Created.
|
|
128
|
|
129 ;;; Code:
|
|
130
|
|
131 ;; Variable definitions.
|
|
132
|
|
133 (defvar mail-@-binds-tighter-than-! nil)
|
|
134
|
|
135 ;;----------------------------------------------------------------------
|
|
136 ;; what orderings are meaningful?????
|
|
137 ;;(defvar mail-operator-precedence-list '(?! ?% ?@))
|
|
138 ;; Right operand of a % or a @ must be a domain name, period. No other
|
|
139 ;; operators allowed. Left operand of a @ is an address relative to that
|
|
140 ;; site.
|
|
141
|
|
142 ;; Left operand of a ! must be a domain name. Right operand is an
|
|
143 ;; arbitrary address.
|
|
144 ;;----------------------------------------------------------------------
|
|
145
|
|
146 (defconst mail-space-char 32)
|
|
147
|
|
148 (defconst mail-whitespace " \t\n")
|
|
149
|
|
150 ;; Any character that can occur in a name in an RFC822 address.
|
|
151 ;; Yes, there are weird people with digits in their names.
|
|
152 (defconst mail-all-letters "A-Za-z---{|}'~0-9`.")
|
|
153
|
|
154 ;; Any character that can occur in a name, not counting characters that
|
|
155 ;; separate parts of a multipart name.
|
|
156 (defconst mail-all-letters-but-separators "A-Za-z{|}'~0-9`")
|
|
157
|
|
158 ;; Any character that can start a name
|
|
159 (defconst mail-first-letters "A-Za-z")
|
|
160
|
|
161 ;; Any character that can end a name.
|
|
162 (defconst mail-last-letters "A-Za-z`'.")
|
|
163
|
|
164 ;; Matches an initial not followed by both a period and a space.
|
|
165 (defconst mail-bad-initials-pattern
|
|
166 (format "\\(\\([^%s]\\|\\`\\)[%s]\\)\\(\\.\\([^ ]\\)\\| \\|\\([^%s .]\\)\\|\\'\\)"
|
|
167 mail-all-letters mail-first-letters mail-all-letters))
|
|
168
|
|
169 (defconst mail-non-name-chars (concat "^" mail-all-letters "."))
|
|
170
|
|
171 (defconst mail-non-begin-name-chars (concat "^" mail-first-letters))
|
|
172
|
|
173 (defconst mail-non-end-name-chars (concat "^" mail-last-letters))
|
|
174
|
|
175 ;; Matches periods used instead of spaces. Must not match the period
|
|
176 ;; following an initial.
|
|
177 (defconst mail-bad-\.-pattern
|
|
178 (format "\\([%s][%s]\\)\\.+\\([%s]\\)"
|
|
179 mail-all-letters mail-last-letters mail-first-letters))
|
|
180
|
|
181 ;; Matches an embedded or leading nickname that should be removed.
|
|
182 (defconst mail-nickname-pattern
|
|
183 (format "\\([ .]\\|\\`\\)[\"'`\[\(]\\([ .%s]+\\)[\]\"'\)] "
|
|
184 mail-all-letters))
|
|
185
|
|
186 ;; Matches a leading title that is not part of the name (does not
|
|
187 ;; contribute to uniquely identifying the person).
|
|
188 (defconst mail-full-name-prefixes
|
|
189 '"\\` *\\(Prof\\|Dr\\|Mrs?\\|Rev\\|Rabbi\\|SysOp\\|LCDR\\)\\.? ")
|
|
190
|
|
191 ;; Matches the occurrence of a generational name suffix, and the last
|
|
192 ;; character of the preceding name.
|
|
193 (defconst mail-full-name-suffix-pattern
|
|
194 (format
|
|
195 "\\(,? ?\\([JjSs]r\\.?\\|V?I+V?\\)\\)\\([^%s]\\([^%s]\\|\\'\\)\\|\\'\\)"
|
|
196 mail-all-letters mail-all-letters))
|
|
197
|
|
198 (defconst mail-roman-numeral-pattern
|
|
199 "V?I+V?\\b")
|
|
200
|
|
201 ;; Matches a trailing uppercase (with other characters possible) acronym.
|
|
202 ;; Must not match a trailing uppercase last name or trailing initial
|
|
203 (defconst mail-weird-acronym-pattern "\\([A-Z]+[-_/]\\|[A-Z][A-Z][A-Z]?\\b\\)")
|
|
204
|
|
205 ;; Matches a mixed-case or lowercase name (not an initial).
|
|
206 (defconst mail-mixed-case-name-pattern
|
|
207 (format
|
|
208 "\\b\\([a-z][%s]*[%s]\\|[%s][%s]*[a-z][%s]*[%s]\\|[%s][%s]*[a-z]\\)"
|
|
209 mail-all-letters mail-last-letters
|
|
210 mail-first-letters mail-all-letters mail-all-letters mail-last-letters
|
|
211 mail-first-letters mail-all-letters))
|
|
212
|
|
213 ;; Matches a trailing alternative address.
|
|
214 (defconst mail-alternative-address-pattern "[a-zA-Z.]+[!@][a-zA-Z.]")
|
|
215
|
|
216 ;; Matches a variety of trailing comments not including comma-delimited
|
|
217 ;; comments.
|
|
218 (defconst mail-trailing-comment-start-pattern " [-{]\\|--\\|[+@#></\;]")
|
|
219
|
|
220 ;; Matches a name (not an initial).
|
|
221 ;; This doesn't force a word boundary at the end because sometimes a
|
|
222 ;; comment is separated by a `-' with no preceding space.
|
|
223 (defconst mail-name-pattern
|
|
224 (format
|
|
225 "\\b[%s][%s]*[%s]"
|
|
226 mail-first-letters mail-all-letters mail-last-letters))
|
|
227
|
|
228 (defconst mail-initial-pattern
|
|
229 (format "\\b[%s]\\([. ]\\|\\b\\)" mail-first-letters))
|
|
230
|
|
231 ;; Matches a single name before a comma.
|
|
232 (defconst mail-last-name-first-pattern
|
|
233 (concat "\\`" mail-name-pattern ","))
|
|
234
|
|
235 ;; Matches telephone extensions.
|
|
236 (defconst mail-telephone-extension-pattern
|
|
237 "\\(\\([Ee]xt\\|[Tt]el\\|[Xx]\\).?\\)? *\\+?[0-9][- 0-9]+")
|
|
238
|
|
239 ;; Matches ham radio call signs.
|
|
240 (defconst mail-ham-call-sign-pattern
|
|
241 "\\b[A-Z]+[0-9][A-Z0-9]*")
|
|
242
|
|
243 ;; Matches normal single-part name
|
|
244 (defconst mail-normal-name-pattern
|
|
245 (format
|
|
246 "\\b[%s][%s]+[%s]"
|
|
247 mail-first-letters mail-all-letters-but-separators mail-last-letters))
|
|
248
|
|
249 ;; Matches normal two names with missing middle initial
|
|
250 (defconst mail-two-name-pattern
|
|
251 (concat "\\`\\(" mail-normal-name-pattern
|
|
252 "\\|" mail-initial-pattern
|
|
253 "\\) +\\(" mail-normal-name-pattern "\\)\\(,\\|\\'\\)"))
|
|
254
|
|
255 (defvar address-syntax-table (make-syntax-table))
|
|
256 (defvar address-comment-syntax-table (make-syntax-table))
|
|
257 (defvar address-domain-literal-syntax-table (make-syntax-table))
|
|
258 (defvar address-text-comment-syntax-table (make-syntax-table))
|
|
259 (defvar address-text-syntax-table (make-syntax-table))
|
|
260 (mapcar
|
|
261 (function
|
|
262 (lambda (pair)
|
|
263 (let ((syntax-table (symbol-value (car pair))))
|
|
264 (mapcar
|
|
265 (function
|
|
266 (lambda (item)
|
|
267 (if (eq 2 (length item))
|
|
268 (modify-syntax-entry (car item) (car (cdr item)) syntax-table)
|
|
269 (let ((char (car item))
|
|
270 (bound (car (cdr item)))
|
|
271 (syntax (car (cdr (cdr item)))))
|
|
272 (while (<= char bound)
|
|
273 (modify-syntax-entry char syntax syntax-table)
|
|
274 (setq char (1+ char)))))))
|
|
275 (cdr pair)))))
|
|
276 '((address-syntax-table
|
|
277 (0 31 "w") ;control characters
|
|
278 (32 " ") ;SPC
|
|
279 (?! ?~ "w") ;printable characters
|
|
280 (127 "w") ;DEL
|
|
281 (128 255 "w") ;high-bit-on characters
|
|
282 (?\t " ")
|
|
283 (?\r " ")
|
|
284 (?\n " ")
|
|
285 (?\( ".")
|
|
286 (?\) ".")
|
|
287 (?< ".")
|
|
288 (?> ".")
|
|
289 (?@ ".")
|
|
290 (?, ".")
|
|
291 (?\; ".")
|
|
292 (?: ".")
|
|
293 (?\\ "\\")
|
|
294 (?\" "\"")
|
|
295 (?. ".")
|
|
296 (?\[ ".")
|
|
297 (?\] ".")
|
|
298 ;; % and ! aren't RFC822 characters, but it is convenient to pretend
|
|
299 (?% ".")
|
|
300 (?! ".")
|
|
301 )
|
|
302 (address-comment-syntax-table
|
|
303 (0 255 "w")
|
|
304 (?\( "\(\)")
|
|
305 (?\) "\)\(")
|
|
306 (?\\ "\\"))
|
|
307 (address-domain-literal-syntax-table
|
|
308 (0 255 "w")
|
|
309 (?\[ "\(\]") ;??????
|
|
310 (?\] "\)\[") ;??????
|
|
311 (?\\ "\\"))
|
|
312 (address-text-comment-syntax-table
|
|
313 (0 255 "w")
|
|
314 (?\( "\(\)")
|
|
315 (?\) "\)\(")
|
|
316 (?\[ "\(\]")
|
|
317 (?\] "\)\[")
|
|
318 (?\{ "\(\}")
|
|
319 (?\} "\)\{")
|
|
320 (?\\ "\\")
|
|
321 (?\" "\"")
|
|
322 ;; (?\' "\)\`")
|
|
323 ;; (?\` "\(\'")
|
|
324 )
|
|
325 (address-text-syntax-table
|
|
326 (0 255 ".")
|
|
327 (?A ?Z "w")
|
|
328 (?a ?z "w")
|
|
329 (?- "w")
|
|
330 (?\} "w")
|
|
331 (?\{ "w")
|
|
332 (?| "w")
|
|
333 (?\' "w")
|
|
334 (?~ "w")
|
|
335 (?0 ?9 "w"))
|
|
336 ))
|
|
337
|
|
338
|
|
339 ;; Utility functions and macros.
|
|
340
|
|
341 (defmacro undo-backslash-quoting (beg end)
|
|
342 (`(save-excursion
|
|
343 (save-restriction
|
|
344 (narrow-to-region (, beg) (, end))
|
|
345 (goto-char (point-min))
|
|
346 ;; undo \ quoting
|
|
347 (while (re-search-forward "\\\\\\(.\\)" nil t)
|
|
348 (replace-match "\\1")
|
|
349 ;; CHECK: does this leave point after the replacement?
|
|
350 )))))
|
|
351
|
|
352 (defmacro mail-nuke-char-at (pos)
|
|
353 (` (save-excursion
|
|
354 (goto-char (, pos))
|
|
355 (delete-char 1)
|
|
356 (insert mail-space-char))))
|
|
357
|
|
358 (defmacro mail-nuke-elements-outside-range (list-symbol beg-symbol end-symbol
|
|
359 &optional no-replace)
|
|
360 (` (progn
|
|
361 (setq temp (, list-symbol))
|
|
362 (while temp
|
|
363 (cond ((or (> (car temp) (, end-symbol))
|
|
364 (< (car temp) (, beg-symbol)))
|
|
365 (, (or no-replace
|
|
366 (` (mail-nuke-char-at (car temp)))))
|
|
367 (setcar temp nil)))
|
|
368 (setq temp (cdr temp)))
|
|
369 (setq (, list-symbol) (delq nil (, list-symbol))))))
|
|
370
|
|
371 (defun mail-demarkerize (marker)
|
|
372 (and marker
|
|
373 (if (markerp marker)
|
|
374 (let ((temp (marker-position marker)))
|
|
375 (set-marker marker nil)
|
|
376 temp)
|
|
377 marker)))
|
|
378
|
|
379 (defun mail-markerize (pos)
|
|
380 (and pos
|
|
381 (if (markerp pos)
|
|
382 pos
|
|
383 (copy-marker pos))))
|
|
384
|
|
385 (defmacro mail-last-element (list)
|
|
386 "Return last element of LIST."
|
|
387 (` (let ((list (, list)))
|
|
388 (while (not (null (cdr list)))
|
|
389 (setq list (cdr list)))
|
|
390 (car list))))
|
|
391
|
|
392 (defmacro safe-move-sexp (arg)
|
|
393 "Safely skip over one balanced sexp, if there is one. Return t if success."
|
|
394 (` (condition-case error
|
|
395 (progn
|
|
396 (goto-char (scan-sexps (point) (, arg)))
|
|
397 t)
|
|
398 (error
|
|
399 (if (string-equal (nth 1 error) "Unbalanced parentheses")
|
|
400 nil
|
|
401 (while t
|
|
402 (signal (car error) (cdr error))))))))
|
|
403
|
|
404
|
|
405 ;; The main function to grind addresses
|
|
406
|
|
407 (defun mail-extract-address-components (address)
|
|
408 "Given an rfc 822 ADDRESS, extract full name and canonical address.
|
|
409 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS)."
|
|
410 (let ((canonicalization-buffer (get-buffer-create "*canonical address*"))
|
|
411 (extraction-buffer (get-buffer-create "*extract address components*"))
|
|
412 (foo 'bar)
|
|
413 char
|
|
414 multiple-addresses
|
|
415 <-pos >-pos @-pos :-pos ,-pos !-pos %-pos \;-pos
|
|
416 group-:-pos group-\;-pos route-addr-:-pos
|
|
417 record-pos-symbol
|
|
418 first-real-pos last-real-pos
|
|
419 phrase-beg phrase-end
|
|
420 comment-beg comment-end
|
|
421 quote-beg quote-end
|
|
422 atom-beg atom-end
|
|
423 mbox-beg mbox-end
|
|
424 \.-ends-name
|
|
425 temp
|
|
426 name-suffix
|
|
427 saved-point
|
|
428 fi mi li
|
|
429 saved-%-pos saved-!-pos saved-@-pos
|
|
430 domain-pos \.-pos insert-point)
|
|
431
|
|
432 (save-excursion
|
|
433 (set-buffer extraction-buffer)
|
|
434 (buffer-flush-undo extraction-buffer)
|
|
435 (set-syntax-table address-syntax-table)
|
|
436 (widen)
|
|
437 (erase-buffer)
|
|
438 (setq case-fold-search nil)
|
|
439
|
|
440 ;; Insert extra space at beginning to allow later replacement with <
|
|
441 ;; without having to move markers.
|
|
442 (insert mail-space-char address)
|
|
443
|
|
444 ;; stolen from rfc822.el
|
|
445 ;; Unfold multiple lines.
|
|
446 (goto-char (point-min))
|
|
447 (while (re-search-forward "\\([^\\]\\(\\\\\\\\\\)*\\)\n[ \t]" nil t)
|
|
448 (replace-match "\\1 " t))
|
|
449
|
|
450 ;; first pass grabs useful information about address
|
|
451 (goto-char (point-min))
|
|
452 (while (progn
|
|
453 (skip-chars-forward mail-whitespace)
|
|
454 (not (eobp)))
|
|
455 (setq char (char-after (point)))
|
|
456 (or first-real-pos
|
|
457 (if (not (eq char ?\())
|
|
458 (setq first-real-pos (point))))
|
|
459 (cond
|
|
460 ;; comment
|
|
461 ((eq char ?\()
|
|
462 (set-syntax-table address-comment-syntax-table)
|
|
463 ;; only record the first non-empty comment's position
|
|
464 (if (and (not comment-beg)
|
|
465 (save-excursion
|
|
466 (forward-char 1)
|
|
467 (skip-chars-forward mail-whitespace)
|
|
468 (not (eq ?\) (char-after (point))))))
|
|
469 (setq comment-beg (point)))
|
|
470 ;; TODO: don't record if unbalanced
|
|
471 (or (safe-move-sexp 1)
|
|
472 (forward-char 1))
|
|
473 (set-syntax-table address-syntax-table)
|
|
474 (if (and comment-beg
|
|
475 (not comment-end))
|
|
476 (setq comment-end (point))))
|
|
477 ;; quoted text
|
|
478 ((eq char ?\")
|
|
479 ;; only record the first non-empty quote's position
|
|
480 (if (and (not quote-beg)
|
|
481 (save-excursion
|
|
482 (forward-char 1)
|
|
483 (skip-chars-forward mail-whitespace)
|
|
484 (not (eq ?\" (char-after (point))))))
|
|
485 (setq quote-beg (point)))
|
|
486 ;; TODO: don't record if unbalanced
|
|
487 (or (safe-move-sexp 1)
|
|
488 (forward-char 1))
|
|
489 (if (and quote-beg
|
|
490 (not quote-end))
|
|
491 (setq quote-end (point))))
|
|
492 ;; domain literals
|
|
493 ((eq char ?\[)
|
|
494 (set-syntax-table address-domain-literal-syntax-table)
|
|
495 (or (safe-move-sexp 1)
|
|
496 (forward-char 1))
|
|
497 (set-syntax-table address-syntax-table))
|
|
498 ;; commas delimit addresses when outside < > pairs.
|
|
499 ((and (eq char ?,)
|
|
500 (or (null <-pos)
|
|
501 (and >-pos
|
|
502 ;; handle weird munged addresses
|
|
503 (> (mail-last-element <-pos) (car >-pos)))))
|
|
504 (setq multiple-addresses t)
|
|
505 (delete-char 1)
|
|
506 (narrow-to-region (point-min) (point)))
|
|
507 ;; record the position of various interesting chars, determine
|
|
508 ;; legality later.
|
|
509 ((setq record-pos-symbol
|
|
510 (cdr (assq char
|
|
511 '((?< . <-pos) (?> . >-pos) (?@ . @-pos)
|
|
512 (?: . :-pos) (?, . ,-pos) (?! . !-pos)
|
|
513 (?% . %-pos) (?\; . \;-pos)))))
|
|
514 (set record-pos-symbol
|
|
515 (cons (point) (symbol-value record-pos-symbol)))
|
|
516 (forward-char 1))
|
|
517 ((eq char ?.)
|
|
518 (forward-char 1))
|
|
519 ((memq char '(
|
|
520 ;; comment terminator illegal
|
|
521 ?\)
|
|
522 ;; domain literal terminator illegal
|
|
523 ?\]
|
|
524 ;; \ allowed only within quoted strings,
|
|
525 ;; domain literals, and comments
|
|
526 ?\\
|
|
527 ))
|
|
528 (mail-nuke-char-at (point))
|
|
529 (forward-char 1))
|
|
530 (t
|
|
531 (forward-word 1)))
|
|
532 (or (eq char ?\()
|
|
533 (setq last-real-pos (point))))
|
|
534
|
|
535 ;; Use only the leftmost <, if any. Replace all others with spaces.
|
|
536 (while (cdr <-pos)
|
|
537 (mail-nuke-char-at (car <-pos))
|
|
538 (setq <-pos (cdr <-pos)))
|
|
539
|
|
540 ;; Use only the rightmost >, if any. Replace all others with spaces.
|
|
541 (while (cdr >-pos)
|
|
542 (mail-nuke-char-at (nth 1 >-pos))
|
|
543 (setcdr >-pos (nthcdr 2 >-pos)))
|
|
544
|
|
545 ;; If multiple @s and a :, but no < and >, insert around buffer.
|
|
546 ;; This commonly happens on the UUCP "From " line. Ugh.
|
|
547 (cond ((and (> (length @-pos) 1)
|
|
548 :-pos ;TODO: check if between @s
|
|
549 (not <-pos))
|
|
550 (goto-char (point-min))
|
|
551 (delete-char 1)
|
|
552 (setq <-pos (list (point)))
|
|
553 (insert ?<)))
|
|
554
|
|
555 ;; If < but no >, insert > in rightmost possible position
|
|
556 (cond ((and <-pos
|
|
557 (null >-pos))
|
|
558 (goto-char (point-max))
|
|
559 (setq >-pos (list (point)))
|
|
560 (insert ?>)))
|
|
561
|
|
562 ;; If > but no <, replace > with space.
|
|
563 (cond ((and >-pos
|
|
564 (null <-pos))
|
|
565 (mail-nuke-char-at (car >-pos))
|
|
566 (setq >-pos nil)))
|
|
567
|
|
568 ;; Turn >-pos and <-pos into non-lists
|
|
569 (setq >-pos (car >-pos)
|
|
570 <-pos (car <-pos))
|
|
571
|
|
572 ;; Trim other punctuation lists of items outside < > pair to handle
|
|
573 ;; stupid MTAs.
|
|
574 (cond (<-pos ; don't need to check >-pos also
|
|
575 ;; handle bozo software that violates RFC 822 by sticking
|
|
576 ;; punctuation marks outside of a < > pair
|
|
577 (mail-nuke-elements-outside-range @-pos <-pos >-pos t)
|
|
578 ;; RFC 822 says nothing about these two outside < >, but
|
|
579 ;; remove those positions from the lists to make things
|
|
580 ;; easier.
|
|
581 (mail-nuke-elements-outside-range !-pos <-pos >-pos t)
|
|
582 (mail-nuke-elements-outside-range %-pos <-pos >-pos t)))
|
|
583
|
|
584 ;; Check for : that indicates GROUP list and for : part of
|
|
585 ;; ROUTE-ADDR spec.
|
|
586 ;; Can't possibly be more than two :. Nuke any extra.
|
|
587 (while :-pos
|
|
588 (setq temp (car :-pos)
|
|
589 :-pos (cdr :-pos))
|
|
590 (cond ((and <-pos >-pos
|
|
591 (> temp <-pos)
|
|
592 (< temp >-pos))
|
|
593 (if (or route-addr-:-pos
|
|
594 (< (length @-pos) 2)
|
|
595 (> temp (car @-pos))
|
|
596 (< temp (nth 1 @-pos)))
|
|
597 (mail-nuke-char-at temp)
|
|
598 (setq route-addr-:-pos temp)))
|
|
599 ((or (not <-pos)
|
|
600 (and <-pos
|
|
601 (< temp <-pos)))
|
|
602 (setq group-:-pos temp))))
|
|
603
|
|
604 ;; Nuke any ; that is in or to the left of a < > pair or to the left
|
|
605 ;; of a GROUP starting :. Also, there may only be one ;.
|
|
606 (while \;-pos
|
|
607 (setq temp (car \;-pos)
|
|
608 \;-pos (cdr \;-pos))
|
|
609 (cond ((and <-pos >-pos
|
|
610 (> temp <-pos)
|
|
611 (< temp >-pos))
|
|
612 (mail-nuke-char-at temp))
|
|
613 ((and (or (not group-:-pos)
|
|
614 (> temp group-:-pos))
|
|
615 (not group-\;-pos))
|
|
616 (setq group-\;-pos temp))))
|
|
617
|
|
618 ;; Handle junk like ";@host.company.dom" that sendmail adds.
|
|
619 ;; **** should I remember comment positions?
|
|
620 (and group-\;-pos
|
|
621 ;; this is fine for now
|
|
622 (mail-nuke-elements-outside-range !-pos group-:-pos group-\;-pos t)
|
|
623 (mail-nuke-elements-outside-range @-pos group-:-pos group-\;-pos t)
|
|
624 (mail-nuke-elements-outside-range %-pos group-:-pos group-\;-pos t)
|
|
625 (mail-nuke-elements-outside-range ,-pos group-:-pos group-\;-pos t)
|
|
626 (and last-real-pos
|
|
627 (> last-real-pos (1+ group-\;-pos))
|
|
628 (setq last-real-pos (1+ group-\;-pos)))
|
|
629 (and comment-end
|
|
630 (> comment-end group-\;-pos)
|
|
631 (setq comment-end nil
|
|
632 comment-beg nil))
|
|
633 (and quote-end
|
|
634 (> quote-end group-\;-pos)
|
|
635 (setq quote-end nil
|
|
636 quote-beg nil))
|
|
637 (narrow-to-region (point-min) group-\;-pos))
|
|
638
|
|
639 ;; Any commas must be between < and : of ROUTE-ADDR. Nuke any
|
|
640 ;; others.
|
|
641 ;; Hell, go ahead an nuke all of the commas.
|
|
642 ;; **** This will cause problems when we start handling commas in
|
|
643 ;; the PHRASE part .... no it won't ... yes it will ... ?????
|
|
644 (mail-nuke-elements-outside-range ,-pos 1 1)
|
|
645
|
|
646 ;; can only have multiple @s inside < >. The fact that some MTAs
|
|
647 ;; put de-bracketed ROUTE-ADDRs in the UUCP-style "From " line is
|
|
648 ;; handled above.
|
|
649
|
|
650 ;; Locate PHRASE part of ROUTE-ADDR.
|
|
651 (cond (<-pos
|
|
652 (goto-char <-pos)
|
|
653 (skip-chars-backward mail-whitespace)
|
|
654 (setq phrase-end (point))
|
|
655 (goto-char (or ;;group-:-pos
|
|
656 (point-min)))
|
|
657 (skip-chars-forward mail-whitespace)
|
|
658 (if (< (point) phrase-end)
|
|
659 (setq phrase-beg (point))
|
|
660 (setq phrase-end nil))))
|
|
661
|
|
662 ;; handle ROUTE-ADDRS with real ROUTEs.
|
|
663 ;; If there are multiple @s, then we assume ROUTE-ADDR syntax, and
|
|
664 ;; any % or ! must be semantically meaningless.
|
|
665 ;; TODO: do this processing into canonicalization buffer
|
|
666 (cond (route-addr-:-pos
|
|
667 (setq !-pos nil
|
|
668 %-pos nil
|
|
669 >-pos (copy-marker >-pos)
|
|
670 route-addr-:-pos (copy-marker route-addr-:-pos))
|
|
671 (goto-char >-pos)
|
|
672 (insert-before-markers ?X)
|
|
673 (goto-char (car @-pos))
|
|
674 (while (setq @-pos (cdr @-pos))
|
|
675 (delete-char 1)
|
|
676 (setq %-pos (cons (point-marker) %-pos))
|
|
677 (insert "%")
|
|
678 (goto-char (1- >-pos))
|
|
679 (save-excursion
|
|
680 (insert-buffer-substring extraction-buffer
|
|
681 (car @-pos) route-addr-:-pos)
|
|
682 (delete-region (car @-pos) route-addr-:-pos))
|
|
683 (or (cdr @-pos)
|
|
684 (setq saved-@-pos (list (point)))))
|
|
685 (setq @-pos saved-@-pos)
|
|
686 (goto-char >-pos)
|
|
687 (delete-char -1)
|
|
688 (mail-nuke-char-at route-addr-:-pos)
|
|
689 (mail-demarkerize route-addr-:-pos)
|
|
690 (setq route-addr-:-pos nil
|
|
691 >-pos (mail-demarkerize >-pos)
|
|
692 %-pos (mapcar 'mail-demarkerize %-pos))))
|
|
693
|
|
694 ;; de-listify @-pos
|
|
695 (setq @-pos (car @-pos))
|
|
696
|
|
697 ;; TODO: remove comments in the middle of an address
|
|
698
|
|
699 (set-buffer canonicalization-buffer)
|
|
700
|
|
701 (buffer-flush-undo canonicalization-buffer)
|
|
702 (set-syntax-table address-syntax-table)
|
|
703 (setq case-fold-search nil)
|
|
704
|
|
705 (widen)
|
|
706 (erase-buffer)
|
|
707 (insert-buffer-substring extraction-buffer)
|
|
708
|
|
709 (if <-pos
|
|
710 (narrow-to-region (progn
|
|
711 (goto-char (1+ <-pos))
|
|
712 (skip-chars-forward mail-whitespace)
|
|
713 (point))
|
|
714 >-pos)
|
|
715 ;; ****** Oh no! What if the address is completely empty!
|
|
716 (narrow-to-region first-real-pos last-real-pos))
|
|
717
|
|
718 (and @-pos %-pos
|
|
719 (mail-nuke-elements-outside-range %-pos (point-min) @-pos))
|
|
720 (and %-pos !-pos
|
|
721 (mail-nuke-elements-outside-range !-pos (point-min) (car %-pos)))
|
|
722 (and @-pos !-pos (not %-pos)
|
|
723 (mail-nuke-elements-outside-range !-pos (point-min) @-pos))
|
|
724
|
|
725 ;; Error condition:?? (and %-pos (not @-pos))
|
|
726
|
|
727 (cond (!-pos
|
|
728 ;; **** I don't understand this save-restriction and the
|
|
729 ;; narrow-to-region inside it. Why did I do that?
|
|
730 (save-restriction
|
|
731 (cond ((and @-pos
|
|
732 mail-@-binds-tighter-than-!)
|
|
733 (goto-char @-pos)
|
|
734 (setq %-pos (cons (point) %-pos)
|
|
735 @-pos nil)
|
|
736 (delete-char 1)
|
|
737 (insert "%")
|
|
738 (setq insert-point (point-max)))
|
|
739 (mail-@-binds-tighter-than-!
|
|
740 (setq insert-point (point-max)))
|
|
741 (%-pos
|
|
742 (setq insert-point (mail-last-element %-pos)
|
|
743 saved-%-pos (mapcar 'mail-markerize %-pos)
|
|
744 %-pos nil
|
|
745 @-pos (mail-markerize @-pos)))
|
|
746 (@-pos
|
|
747 (setq insert-point @-pos)
|
|
748 (setq @-pos (mail-markerize @-pos)))
|
|
749 (t
|
|
750 (setq insert-point (point-max))))
|
|
751 (narrow-to-region (point-min) insert-point)
|
|
752 (setq saved-!-pos (car !-pos))
|
|
753 (while !-pos
|
|
754 (goto-char (point-max))
|
|
755 (cond ((and (not @-pos)
|
|
756 (not (cdr !-pos)))
|
|
757 (setq @-pos (point))
|
|
758 (insert-before-markers "@ "))
|
|
759 (t
|
|
760 (setq %-pos (cons (point) %-pos))
|
|
761 (insert-before-markers "% ")))
|
|
762 (backward-char 1)
|
|
763 (insert-buffer-substring
|
|
764 (current-buffer)
|
|
765 (if (nth 1 !-pos)
|
|
766 (1+ (nth 1 !-pos))
|
|
767 (point-min))
|
|
768 (car !-pos))
|
|
769 (delete-char 1)
|
|
770 (or (save-excursion
|
|
771 (safe-move-sexp -1)
|
|
772 (skip-chars-backward mail-whitespace)
|
|
773 (eq ?. (preceding-char)))
|
|
774 (insert-before-markers
|
|
775 (if (save-excursion
|
|
776 (skip-chars-backward mail-whitespace)
|
|
777 (eq ?. (preceding-char)))
|
|
778 ""
|
|
779 ".")
|
|
780 "uucp"))
|
|
781 (setq !-pos (cdr !-pos))))
|
|
782 (and saved-%-pos
|
|
783 (setq %-pos (append (mapcar 'mail-demarkerize saved-%-pos)
|
|
784 %-pos)))
|
|
785 (setq @-pos (mail-demarkerize @-pos))
|
|
786 (narrow-to-region (1+ saved-!-pos) (point-max))))
|
|
787 (cond ((and %-pos
|
|
788 (not @-pos))
|
|
789 (goto-char (car %-pos))
|
|
790 (delete-char 1)
|
|
791 (setq @-pos (point))
|
|
792 (insert "@")
|
|
793 (setq %-pos (cdr %-pos))))
|
|
794 (setq %-pos (nreverse %-pos))
|
|
795 ;; RFC 1034 doesn't approve of this, oh well:
|
|
796 (downcase-region (or (car %-pos) @-pos (point-max)) (point-max))
|
|
797 (cond (%-pos ; implies @-pos valid
|
|
798 (setq temp %-pos)
|
|
799 (catch 'truncated
|
|
800 (while temp
|
|
801 (goto-char (or (nth 1 temp)
|
|
802 @-pos))
|
|
803 (skip-chars-backward mail-whitespace)
|
|
804 (save-excursion
|
|
805 (safe-move-sexp -1)
|
|
806 (setq domain-pos (point))
|
|
807 (skip-chars-backward mail-whitespace)
|
|
808 (setq \.-pos (eq ?. (preceding-char))))
|
|
809 (cond ((and \.-pos
|
|
810 (get
|
|
811 (intern
|
|
812 (buffer-substring domain-pos (point)))
|
|
813 'domain-name))
|
|
814 (narrow-to-region (point-min) (point))
|
|
815 (goto-char (car temp))
|
|
816 (delete-char 1)
|
|
817 (setq @-pos (point))
|
|
818 (setcdr temp nil)
|
|
819 (setq %-pos (delq @-pos %-pos))
|
|
820 (insert "@")
|
|
821 (throw 'truncated t)))
|
|
822 (setq temp (cdr temp))))))
|
|
823 (setq mbox-beg (point-min)
|
|
824 mbox-end (if %-pos (car %-pos)
|
|
825 (or @-pos
|
|
826 (point-max))))
|
|
827
|
|
828 ;; Done canonicalizing address.
|
|
829
|
|
830 (set-buffer extraction-buffer)
|
|
831
|
|
832 ;; Find the full name
|
|
833
|
|
834 (cond ((and phrase-beg
|
|
835 (eq quote-beg phrase-beg)
|
|
836 (<= quote-end phrase-end))
|
|
837 (narrow-to-region (1+ quote-beg) (1- quote-end))
|
|
838 (undo-backslash-quoting (point-min) (point-max)))
|
|
839 (phrase-beg
|
|
840 (narrow-to-region phrase-beg phrase-end))
|
|
841 (comment-beg
|
|
842 (narrow-to-region (1+ comment-beg) (1- comment-end))
|
|
843 (undo-backslash-quoting (point-min) (point-max)))
|
|
844 (t
|
|
845 ;; *** Work in canon buffer instead? No, can't. Hmm.
|
|
846 (delete-region (point-min) (point-max))
|
|
847 (insert-buffer-substring canonicalization-buffer
|
|
848 mbox-beg mbox-end)
|
|
849 (goto-char (point-min))
|
|
850 (setq \.-ends-name (search-forward "_" nil t))
|
|
851 (goto-char (point-min))
|
|
852 (while (progn
|
|
853 (skip-chars-forward mail-whitespace)
|
|
854 (not (eobp)))
|
|
855 (setq char (char-after (point)))
|
|
856 (cond
|
|
857 ((eq char ?\")
|
|
858 (setq quote-beg (point))
|
|
859 (or (safe-move-sexp 1)
|
|
860 ;; TODO: handle this error condition!!!!!
|
|
861 (forward-char 1))
|
|
862 ;; take into account deletions
|
|
863 (setq quote-end (- (point) 2))
|
|
864 (save-excursion
|
|
865 (backward-char 1)
|
|
866 (delete-char 1)
|
|
867 (goto-char quote-beg)
|
|
868 (delete-char 1))
|
|
869 (undo-backslash-quoting quote-beg quote-end)
|
|
870 (or (eq mail-space-char (char-after (point)))
|
|
871 (insert " "))
|
|
872 (setq \.-ends-name t))
|
|
873 ((eq char ?.)
|
|
874 (if (eq (char-after (1+ (point))) ?_)
|
|
875 (progn
|
|
876 (forward-char 1)
|
|
877 (delete-char 1)
|
|
878 (insert mail-space-char))
|
|
879 (if \.-ends-name
|
|
880 (narrow-to-region (point-min) (point))
|
|
881 (delete-char 1)
|
|
882 (insert " "))))
|
|
883 ((memq (char-syntax char) '(?. ?\\))
|
|
884 (delete-char 1)
|
|
885 (insert " "))
|
|
886 (t
|
|
887 (setq atom-beg (point))
|
|
888 (forward-word 1)
|
|
889 (setq atom-end (point))
|
|
890 (save-restriction
|
|
891 (narrow-to-region atom-beg atom-end)
|
|
892 (goto-char (point-min))
|
|
893 (while (re-search-forward "\\([^_]+\\)_" nil t)
|
|
894 (replace-match "\\1 "))
|
|
895 (goto-char (point-max))))))))
|
|
896
|
|
897 (set-syntax-table address-text-syntax-table)
|
|
898
|
|
899 (setq xxx (variant-method (buffer-string)))
|
|
900 (delete-region (point-min) (point-max))
|
|
901 (insert xxx)
|
|
902 (goto-char (point-min))
|
|
903
|
|
904 ;; ;; Compress whitespace
|
|
905 ;; (goto-char (point-min))
|
|
906 ;; (while (re-search-forward "[ \t\n]+" nil t)
|
|
907 ;; (replace-match " "))
|
|
908 ;;
|
|
909 ;; ;; Fix . used as space
|
|
910 ;; (goto-char (point-min))
|
|
911 ;; (while (re-search-forward mail-bad-\.-pattern nil t)
|
|
912 ;; (replace-match "\\1 \\2"))
|
|
913 ;;
|
|
914 ;; ;; Delete trailing parenthesized comment
|
|
915 ;; (goto-char (point-max))
|
|
916 ;; (skip-chars-backward mail-whitespace)
|
|
917 ;; (cond ((memq (char-after (1- (point))) '(?\) ?\} ?\]))
|
|
918 ;; (setq comment-end (point))
|
|
919 ;; (set-syntax-table address-text-comment-syntax-table)
|
|
920 ;; (or (safe-move-sexp -1)
|
|
921 ;; (backward-char 1))
|
|
922 ;; (set-syntax-table address-text-syntax-table)
|
|
923 ;; (setq comment-beg (point))
|
|
924 ;; (skip-chars-backward mail-whitespace)
|
|
925 ;; (if (bobp)
|
|
926 ;; (narrow-to-region (1+ comment-beg) (1- comment-end))
|
|
927 ;; (narrow-to-region (point-min) (point)))))
|
|
928 ;;
|
|
929 ;; ;; Find, save, and delete any name suffix
|
|
930 ;; ;; *** Broken!
|
|
931 ;; (goto-char (point-min))
|
|
932 ;; (cond ((re-search-forward mail-full-name-suffix-pattern nil t)
|
|
933 ;; (setq name-suffix (buffer-substring (match-beginning 3)
|
|
934 ;; (match-end 3)))
|
|
935 ;; (replace-match "\\1 \\4")))
|
|
936 ;;
|
|
937 ;; ;; Delete ALL CAPS words and after, if preceded by mixed-case or
|
|
938 ;; ;; lowercase words. Eg. XT-DEM.
|
|
939 ;; (goto-char (point-min))
|
|
940 ;; ;; ## This will lose on something like "SMITH MAX".
|
|
941 ;; ;; ## maybe it should be
|
|
942 ;; ;; ## " \\([A-Z]+[-_/][A-Z]+\\|[A-Z][A-Z][A-Z]\\)\\b.*[^A-Z \t]"
|
|
943 ;; ;; ## that is, three-letter-upper-case-word with non-upper-case
|
|
944 ;; ;; ## characters following it.
|
|
945 ;; (if (re-search-forward mail-mixed-case-name-pattern nil t)
|
|
946 ;; (if (re-search-forward mail-weird-acronym-pattern nil t)
|
|
947 ;; (narrow-to-region (point-min) (match-beginning 0))))
|
|
948 ;;
|
|
949 ;; ;; Delete trailing alternative address
|
|
950 ;; (goto-char (point-min))
|
|
951 ;; (if (re-search-forward mail-alternative-address-pattern nil t)
|
|
952 ;; (narrow-to-region (point-min) (match-beginning 0)))
|
|
953 ;;
|
|
954 ;; ;; Delete trailing comment
|
|
955 ;; (goto-char (point-min))
|
|
956 ;; (if (re-search-forward mail-trailing-comment-start-pattern nil t)
|
|
957 ;; (or (progn
|
|
958 ;; (goto-char (match-beginning 0))
|
|
959 ;; (skip-chars-backward mail-whitespace)
|
|
960 ;; (bobp))
|
|
961 ;; (narrow-to-region (point-min) (match-beginning 0))))
|
|
962 ;;
|
|
963 ;; ;; Delete trailing comma-separated comment
|
|
964 ;; (goto-char (point-min))
|
|
965 ;; ;; ## doesn't this break "Smith, John"? Yes.
|
|
966 ;; (re-search-forward mail-last-name-first-pattern nil t)
|
|
967 ;; (while (search-forward "," nil t)
|
|
968 ;; (or (save-excursion
|
|
969 ;; (backward-char 2)
|
|
970 ;; (looking-at mail-full-name-suffix-pattern))
|
|
971 ;; (narrow-to-region (point-min) (1- (point)))))
|
|
972 ;;
|
|
973 ;; ;; Delete telephone numbers and ham radio call signs
|
|
974 ;; (goto-char (point-min))
|
|
975 ;; (if (re-search-forward mail-telephone-extension-pattern nil t)
|
|
976 ;; (narrow-to-region (point-min) (match-beginning 0)))
|
|
977 ;; (goto-char (point-min))
|
|
978 ;; (if (re-search-forward mail-ham-call-sign-pattern nil t)
|
|
979 ;; (if (eq (match-beginning 0) (point-min))
|
|
980 ;; (narrow-to-region (match-end 0) (point-max))
|
|
981 ;; (narrow-to-region (point-min) (match-beginning 0))))
|
|
982 ;;
|
|
983 ;; ;; Delete trailing word followed immediately by .
|
|
984 ;; (goto-char (point-min))
|
|
985 ;; ;; ## what's this for? doesn't it mess up "Public, Harry Q."? No.
|
|
986 ;; (if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t)
|
|
987 ;; (narrow-to-region (point-min) (match-beginning 0)))
|
|
988 ;;
|
|
989 ;; ;; Handle & substitution
|
|
990 ;; ;; TODO: remember to disable middle initial guessing
|
|
991 ;; (goto-char (point-min))
|
|
992 ;; (cond ((re-search-forward "\\( \\|\\`\\)&\\( \\|\\'\\)" nil t)
|
|
993 ;; (goto-char (match-end 1))
|
|
994 ;; (delete-char 1)
|
|
995 ;; (capitalize-region
|
|
996 ;; (point)
|
|
997 ;; (progn
|
|
998 ;; (insert-buffer-substring canonicalization-buffer
|
|
999 ;; mbox-beg mbox-end)
|
|
1000 ;; (point)))))
|
|
1001 ;;
|
|
1002 ;; ;; Delete nickname
|
|
1003 ;; (goto-char (point-min))
|
|
1004 ;; (if (re-search-forward mail-nickname-pattern nil t)
|
|
1005 ;; (replace-match (if (eq (match-beginning 2) (1- (match-end 2)))
|
|
1006 ;; " \\2 "
|
|
1007 ;; " ")))
|
|
1008 ;;
|
|
1009 ;; ;; Fixup initials
|
|
1010 ;; (while (progn
|
|
1011 ;; (goto-char (point-min))
|
|
1012 ;; (re-search-forward mail-bad-initials-pattern nil t))
|
|
1013 ;; (replace-match
|
|
1014 ;; (if (match-beginning 4)
|
|
1015 ;; "\\1. \\4"
|
|
1016 ;; (if (match-beginning 5)
|
|
1017 ;; "\\1. \\5"
|
|
1018 ;; "\\1. "))))
|
|
1019 ;;
|
|
1020 ;; ;; Delete title
|
|
1021 ;; (goto-char (point-min))
|
|
1022 ;; (if (re-search-forward mail-full-name-prefixes nil t)
|
|
1023 ;; (narrow-to-region (point) (point-max)))
|
|
1024 ;;
|
|
1025 ;; ;; Delete trailing and preceding non-name characters
|
|
1026 ;; (goto-char (point-min))
|
|
1027 ;; (skip-chars-forward mail-non-begin-name-chars)
|
|
1028 ;; (narrow-to-region (point) (point-max))
|
|
1029 ;; (goto-char (point-max))
|
|
1030 ;; (skip-chars-backward mail-non-end-name-chars)
|
|
1031 ;; (narrow-to-region (point-min) (point))
|
|
1032
|
|
1033 ;; If name is "First Last" and userid is "F?L", then assume
|
|
1034 ;; the middle initial is the second letter in the userid.
|
|
1035 ;; initially by Jamie Zawinski <jwz@lucid.com>
|
|
1036 (cond ((and (eq 3 (- mbox-end mbox-beg))
|
|
1037 (progn
|
|
1038 (goto-char (point-min))
|
|
1039 (looking-at mail-two-name-pattern)))
|
|
1040 (setq fi (char-after (match-beginning 0))
|
|
1041 li (char-after (match-beginning 3)))
|
|
1042 (save-excursion
|
|
1043 (set-buffer canonicalization-buffer)
|
|
1044 ;; char-equal is ignoring case here, so no need to upcase
|
|
1045 ;; or downcase.
|
|
1046 (let ((case-fold-search t))
|
|
1047 (and (char-equal fi (char-after mbox-beg))
|
|
1048 (char-equal li (char-after (1- mbox-end)))
|
|
1049 (setq mi (char-after (1+ mbox-beg))))))
|
|
1050 (cond ((and mi
|
|
1051 ;; TODO: use better table than syntax table
|
|
1052 (eq ?w (char-syntax mi)))
|
|
1053 (goto-char (match-beginning 3))
|
|
1054 (insert (upcase mi) ". ")))))
|
|
1055
|
|
1056 ;; ;; Restore suffix
|
|
1057 ;; (cond (name-suffix
|
|
1058 ;; (goto-char (point-max))
|
|
1059 ;; (insert ", " name-suffix)
|
|
1060 ;; (backward-word 1)
|
|
1061 ;; (cond ((memq (following-char) '(?j ?J ?s ?S))
|
|
1062 ;; (capitalize-word 1)
|
|
1063 ;; (or (eq (following-char) ?.)
|
|
1064 ;; (insert ?.)))
|
|
1065 ;; (t
|
|
1066 ;; (upcase-word 1)))))
|
|
1067
|
|
1068 ;; Result
|
|
1069 (list (buffer-string)
|
|
1070 (progn
|
|
1071 (set-buffer canonicalization-buffer)
|
|
1072 (buffer-string)))
|
|
1073 )))
|
|
1074
|
|
1075 ;; TODO: put this back in the above function now that it's proven:
|
|
1076 (defun variant-method (string)
|
|
1077 (let ((variant-buffer (get-buffer-create "*variant method buffer*"))
|
|
1078 (word-count 0)
|
|
1079 mixed-case-flag lower-case-flag upper-case-flag
|
|
1080 suffix-flag last-name-comma-flag
|
|
1081 comment-beg comment-end initial beg end
|
|
1082 )
|
|
1083 (save-excursion
|
|
1084 (set-buffer variant-buffer)
|
|
1085 (buffer-flush-undo variant-buffer)
|
|
1086 (set-syntax-table address-text-syntax-table)
|
|
1087 (widen)
|
|
1088 (erase-buffer)
|
|
1089 (setq case-fold-search nil)
|
|
1090
|
|
1091 (insert string)
|
|
1092
|
|
1093 ;; Fix . used as space
|
|
1094 (goto-char (point-min))
|
|
1095 (while (re-search-forward mail-bad-\.-pattern nil t)
|
|
1096 (replace-match "\\1 \\2"))
|
|
1097
|
|
1098 ;; Skip any initial garbage.
|
|
1099 (goto-char (point-min))
|
|
1100 (skip-chars-forward mail-non-begin-name-chars)
|
|
1101 (skip-chars-backward "& \"")
|
|
1102 (narrow-to-region (point) (point-max))
|
|
1103
|
|
1104 (catch 'stop
|
|
1105 (while t
|
|
1106 (skip-chars-forward mail-whitespace)
|
|
1107
|
|
1108 (cond
|
|
1109
|
|
1110 ;; Delete title
|
|
1111 ((and (eq word-count 0)
|
|
1112 (looking-at mail-full-name-prefixes))
|
|
1113 (goto-char (match-end 0))
|
|
1114 (narrow-to-region (point) (point-max)))
|
|
1115
|
|
1116 ;; Stop after name suffix
|
|
1117 ((and (>= word-count 2)
|
|
1118 (looking-at mail-full-name-suffix-pattern))
|
|
1119 (skip-chars-backward mail-whitespace)
|
|
1120 (setq suffix-flag (point))
|
|
1121 (if (eq ?, (following-char))
|
|
1122 (forward-char 1)
|
|
1123 (insert ?,))
|
|
1124 ;; Enforce at least one space after comma
|
|
1125 (or (eq mail-space-char (following-char))
|
|
1126 (insert mail-space-char))
|
|
1127 (skip-chars-forward mail-whitespace)
|
|
1128 (cond ((memq (following-char) '(?j ?J ?s ?S))
|
|
1129 (capitalize-word 1)
|
|
1130 (if (eq (following-char) ?.)
|
|
1131 (forward-char 1)
|
|
1132 (insert ?.)))
|
|
1133 (t
|
|
1134 (upcase-word 1)))
|
|
1135 (setq word-count (1+ word-count))
|
|
1136 (throw 'stop t))
|
|
1137
|
|
1138 ;; Handle SCA names
|
|
1139 ((looking-at "MKA \\(.+\\)") ; "Mundanely Known As"
|
|
1140 (setq word-count 0)
|
|
1141 (goto-char (match-beginning 1))
|
|
1142 (narrow-to-region (point) (point-max)))
|
|
1143
|
|
1144 ;; Various stopping points
|
|
1145 ((or
|
|
1146 ;; Stop before ALL CAPS acronyms, if preceded by mixed-case or
|
|
1147 ;; lowercase words. Eg. XT-DEM.
|
|
1148 (and (>= word-count 2)
|
|
1149 (or mixed-case-flag lower-case-flag)
|
|
1150 (looking-at mail-weird-acronym-pattern)
|
|
1151 (not (looking-at mail-roman-numeral-pattern)))
|
|
1152 ;; Stop before 4-or-more letter lowercase words preceded by
|
|
1153 ;; mixed case or uppercase words.
|
|
1154 (and (>= word-count 2)
|
|
1155 (or upper-case-flag mixed-case-flag)
|
|
1156 (looking-at "[a-z][a-z][a-z][a-z]+\\b"))
|
|
1157 ;; Stop before trailing alternative address
|
|
1158 (looking-at mail-alternative-address-pattern)
|
|
1159 ;; Stop before trailing comment not introduced by comma
|
|
1160 (looking-at mail-trailing-comment-start-pattern)
|
|
1161 ;; Stop before telephone numbers
|
|
1162 (looking-at mail-telephone-extension-pattern))
|
|
1163 (throw 'stop t))
|
|
1164
|
|
1165 ;; Check for initial last name followed by comma
|
|
1166 ((and (eq ?, (following-char))
|
|
1167 (eq word-count 1))
|
|
1168 (forward-char 1)
|
|
1169 (setq last-name-comma-flag t)
|
|
1170 (or (eq mail-space-char (following-char))
|
|
1171 (insert mail-space-char)))
|
|
1172
|
|
1173 ;; Stop before trailing comma-separated comment
|
|
1174 ((eq ?, (following-char))
|
|
1175 (throw 'stop t))
|
|
1176
|
|
1177 ;; Delete parenthesized/quoted comment/nickname
|
|
1178 ((memq (following-char) '(?\( ?\{ ?\[ ?\" ?\' ?\`))
|
|
1179 (setq comment-beg (point))
|
|
1180 (set-syntax-table address-text-comment-syntax-table)
|
|
1181 (cond ((memq (following-char) '(?\' ?\`))
|
|
1182 (if (eq ?\' (following-char))
|
|
1183 (forward-char 1))
|
|
1184 (or (search-forward "'" nil t)
|
|
1185 (delete-char 1)))
|
|
1186 (t
|
|
1187 (or (safe-move-sexp 1)
|
|
1188 (goto-char (point-max)))))
|
|
1189 (set-syntax-table address-text-syntax-table)
|
|
1190 (setq comment-end (point))
|
|
1191 (cond
|
|
1192 ;; Handle case of entire name being quoted
|
|
1193 ((and (eq word-count 0)
|
|
1194 (looking-at " *\\'")
|
|
1195 (>= (- comment-end comment-beg) 2))
|
|
1196 (narrow-to-region (1+ comment-beg) (1- comment-end))
|
|
1197 (goto-char (point-min)))
|
|
1198 (t
|
|
1199 ;; Handle case of quoted initial
|
|
1200 (if (and (or (= 3 (- comment-end comment-beg))
|
|
1201 (and (= 4 (- comment-end comment-beg))
|
|
1202 (eq ?. (char-after (+ 2 comment-beg)))))
|
|
1203 (not (looking-at " *\\'")))
|
|
1204 (setq initial (char-after (1+ comment-beg)))
|
|
1205 (setq initial nil))
|
|
1206 (delete-region comment-beg comment-end)
|
|
1207 (if initial
|
|
1208 (insert initial ". ")))))
|
|
1209
|
|
1210 ;; Delete ham radio call signs
|
|
1211 ((looking-at mail-ham-call-sign-pattern)
|
|
1212 (delete-region (match-beginning 0) (match-end 0)))
|
|
1213
|
|
1214 ;; Handle & substitution
|
|
1215 ;; TODO: remember to disable middle initial guessing
|
|
1216 ((and (or (bobp)
|
|
1217 (eq mail-space-char (preceding-char)))
|
|
1218 (looking-at "&\\( \\|\\'\\)"))
|
|
1219 (delete-char 1)
|
|
1220 (capitalize-region
|
|
1221 (point)
|
|
1222 (progn
|
|
1223 (insert-buffer-substring canonicalization-buffer
|
|
1224 mbox-beg mbox-end)
|
|
1225 (point))))
|
|
1226
|
|
1227 ;; Fixup initials
|
|
1228 ((looking-at mail-initial-pattern)
|
|
1229 (or (eq (following-char) (upcase (following-char)))
|
|
1230 (setq lower-case-flag t))
|
|
1231 (forward-char 1)
|
|
1232 (if (eq ?. (following-char))
|
|
1233 (forward-char 1)
|
|
1234 (insert ?.))
|
|
1235 (or (eq mail-space-char (following-char))
|
|
1236 (insert mail-space-char))
|
|
1237 (setq word-count (1+ word-count)))
|
|
1238
|
|
1239 ;; Regular name words
|
|
1240 ((looking-at mail-name-pattern)
|
|
1241 (setq beg (point))
|
|
1242 (setq end (match-end 0))
|
|
1243 (set (if (re-search-forward "[a-z]" end t)
|
|
1244 (if (progn
|
|
1245 (goto-char beg)
|
|
1246 (re-search-forward "[A-Z]" end t))
|
|
1247 'mixed-case-flag
|
|
1248 'lower-case-flag)
|
|
1249 'upper-case-flag) t)
|
|
1250 (goto-char end)
|
|
1251 (setq word-count (1+ word-count)))
|
|
1252
|
|
1253 (t
|
|
1254 (throw 'stop t)))))
|
|
1255
|
|
1256 (narrow-to-region (point-min) (point))
|
|
1257
|
|
1258 ;; Delete trailing word followed immediately by .
|
|
1259 (cond ((not suffix-flag)
|
|
1260 (goto-char (point-min))
|
|
1261 (if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t)
|
|
1262 (narrow-to-region (point-min) (match-beginning 0)))))
|
|
1263
|
|
1264 ;; If last name first put it at end (but before suffix)
|
|
1265 (cond (last-name-comma-flag
|
|
1266 (goto-char (point-min))
|
|
1267 (search-forward ",")
|
|
1268 (setq end (1- (point)))
|
|
1269 (goto-char (or suffix-flag (point-max)))
|
|
1270 (or (eq mail-space-char (preceding-char))
|
|
1271 (insert mail-space-char))
|
|
1272 (insert-buffer-substring (current-buffer) (point-min) end)
|
|
1273 (narrow-to-region (1+ end) (point-max))))
|
|
1274
|
|
1275 (goto-char (point-max))
|
|
1276 (skip-chars-backward mail-non-end-name-chars)
|
|
1277 (if (eq ?. (following-char))
|
|
1278 (forward-char 1))
|
|
1279 (narrow-to-region (point)
|
|
1280 (progn
|
|
1281 (goto-char (point-min))
|
|
1282 (skip-chars-forward mail-non-begin-name-chars)
|
|
1283 (point)))
|
|
1284
|
|
1285 ;; Compress whitespace
|
|
1286 (goto-char (point-min))
|
|
1287 (while (re-search-forward "[ \t\n]+" nil t)
|
|
1288 (replace-match " "))
|
|
1289
|
|
1290 (buffer-substring (point-min) (point-max))
|
|
1291
|
|
1292 )))
|
|
1293
|
|
1294 ;; The country names are just in there for show right now, and because
|
|
1295 ;; Jamie thought it would be neat. They aren't used yet.
|
|
1296
|
|
1297 ;; Keep in mind that the country abbreviations follow ISO-3166. There is
|
|
1298 ;; a U.S. FIPS that specifies a different set of two-letter country
|
|
1299 ;; abbreviations.
|
|
1300
|
|
1301 ;; TODO: put this in its own obarray, instead of cluttering up the main
|
|
1302 ;; symbol table with junk.
|
|
1303
|
|
1304 (mapcar
|
|
1305 (function
|
|
1306 (lambda (x)
|
|
1307 (if (symbolp x)
|
|
1308 (put x 'domain-name t)
|
|
1309 (put (car x) 'domain-name (nth 1 x)))))
|
|
1310 '((ag "Antigua")
|
|
1311 (ar "Argentina") ; Argentine Republic
|
|
1312 arpa ; Advanced Projects Research Agency
|
|
1313 (at "Austria") ; The Republic of _
|
|
1314 (au "Australia")
|
|
1315 (bb "Barbados")
|
|
1316 (be "Belgium") ; The Kingdom of _
|
|
1317 (bg "Bulgaria")
|
|
1318 bitnet ; Because It's Time NET
|
|
1319 (bo "Bolivia") ; Republic of _
|
|
1320 (br "Brazil") ; The Federative Republic of _
|
|
1321 (bs "Bahamas")
|
|
1322 (bz "Belize")
|
|
1323 (ca "Canada")
|
|
1324 (ch "Switzerland") ; The Swiss Confederation
|
|
1325 (cl "Chile") ; The Republic of _
|
|
1326 (cn "China") ; The People's Republic of _
|
|
1327 (co "Columbia")
|
|
1328 com ; Commercial
|
|
1329 (cr "Costa Rica") ; The Republic of _
|
|
1330 (cs "Czechoslovakia")
|
|
1331 (de "Germany")
|
|
1332 (dk "Denmark")
|
|
1333 (dm "Dominica")
|
|
1334 (do "Dominican Republic") ; The _
|
|
1335 (ec "Ecuador") ; The Republic of _
|
|
1336 edu ; Educational
|
|
1337 (eg "Egypt") ; The Arab Republic of _
|
|
1338 (es "Spain") ; The Kingdom of _
|
|
1339 (fi "Finland") ; The Republic of _
|
|
1340 (fj "Fiji")
|
|
1341 (fr "France")
|
|
1342 gov ; Government (U.S.A.)
|
|
1343 (gr "Greece") ; The Hellenic Republic
|
|
1344 (hk "Hong Kong")
|
|
1345 (hu "Hungary") ; The Hungarian People's Republic (???)
|
|
1346 (ie "Ireland")
|
|
1347 (il "Israel") ; The State of _
|
|
1348 (in "India") ; The Republic of _
|
|
1349 int ; something British, don't know what
|
|
1350 (is "Iceland") ; The Republic of _
|
|
1351 (it "Italy") ; The Italian Republic
|
|
1352 (jm "Jamaica")
|
|
1353 (jp "Japan")
|
|
1354 (kn "St. Kitts and Nevis")
|
|
1355 (kr "South Korea")
|
|
1356 (lc "St. Lucia")
|
|
1357 (lk "Sri Lanka") ; The Democratic Socialist Republic of _
|
|
1358 mil ; Military (U.S.A.)
|
|
1359 (mx "Mexico") ; The United Mexican States
|
|
1360 (my "Malaysia") ; changed to Myanmar????
|
|
1361 (na "Namibia")
|
|
1362 nato ; North Atlantic Treaty Organization
|
|
1363 net ; Network
|
|
1364 (ni "Nicaragua") ; The Republic of _
|
|
1365 (nl "Netherlands") ; The Kingdom of the _
|
|
1366 (no "Norway") ; The Kingdom of _
|
|
1367 (nz "New Zealand")
|
|
1368 org ; Organization
|
|
1369 (pe "Peru")
|
|
1370 (pg "Papua New Guinea")
|
|
1371 (ph "Philippines") ; The Republic of the _
|
|
1372 (pl "Poland")
|
|
1373 (pr "Puerto Rico")
|
|
1374 (pt "Portugal") ; The Portugese Republic
|
|
1375 (py "Paraguay")
|
|
1376 (se "Sweden") ; The Kingdom of _
|
|
1377 (sg "Singapore") ; The Republic of _
|
|
1378 (sr "Suriname")
|
|
1379 (su "Soviet Union")
|
|
1380 (th "Thailand") ; The Kingdom of _
|
|
1381 (tn "Tunisia")
|
|
1382 (tr "Turkey") ; The Republic of _
|
|
1383 (tt "Trinidad and Tobago")
|
|
1384 (tw "Taiwan")
|
|
1385 (uk "United Kingdom") ; The _ of Great Britain
|
|
1386 unter-dom ; something German
|
|
1387 (us "U.S.A.") ; The United States of America
|
|
1388 uucp ; Unix to Unix CoPy
|
|
1389 (uy "Uruguay") ; The Eastern Republic of _
|
|
1390 (vc "St. Vincent and the Grenadines")
|
|
1391 (ve "Venezuela") ; The Republic of _
|
|
1392 (yu "Yugoslavia") ; The Socialist Federal Republic of _
|
|
1393 ;; Also said to be Zambia ...
|
|
1394 (za "South Africa") ; The Republic of _ (why not Zaire???)
|
|
1395 (zw "Zimbabwe") ; Republic of _
|
|
1396 ))
|
|
1397 ;; fipnet
|
|
1398
|
|
1399
|
|
1400 ;; Code for testing.
|
|
1401
|
|
1402 (defun time-extract ()
|
|
1403 (let (times list)
|
|
1404 (setq times (cons (current-time-string) times)
|
|
1405 list problem-address-alist)
|
|
1406 (while list
|
|
1407 (mail-extract-address-components (car (car list)))
|
|
1408 (setq list (cdr list)))
|
|
1409 (setq times (cons (current-time-string) times))
|
|
1410 (nreverse times)))
|
|
1411
|
|
1412 (defun test-extract (&optional starting-point)
|
|
1413 (interactive)
|
|
1414 (set-buffer (get-buffer-create "*Testing*"))
|
|
1415 (erase-buffer)
|
|
1416 (sit-for 0)
|
|
1417 (mapcar 'test-extract-internal
|
|
1418 (if starting-point
|
|
1419 (memq starting-point problem-address-alist)
|
|
1420 problem-address-alist)))
|
|
1421
|
|
1422 (defvar failed-item)
|
|
1423 (defun test-extract-internal (item)
|
|
1424 (setq failed-item item)
|
|
1425 (let* ((address (car item))
|
|
1426 (correct-name (nth 1 item))
|
|
1427 (correct-canon (nth 2 item))
|
|
1428 (result (mail-extract-address-components address))
|
|
1429 (name (car result))
|
|
1430 (canon (nth 1 result))
|
|
1431 (name-correct (or (null correct-name)
|
|
1432 (string-equal (downcase correct-name)
|
|
1433 (downcase name))))
|
|
1434 (canon-correct (or (null correct-canon)
|
|
1435 (string-equal correct-canon canon))))
|
|
1436 (cond ((not (and name-correct canon-correct))
|
|
1437 (pop-to-buffer "*Testing*")
|
|
1438 (select-window (get-buffer-window (current-buffer)))
|
|
1439 (goto-char (point-max))
|
|
1440 (insert "Address: " address "\n")
|
|
1441 (if (not name-correct)
|
|
1442 (insert " Correct Name: [" correct-name
|
|
1443 "]\; Result: [" name "]\n"))
|
|
1444 (if (not canon-correct)
|
|
1445 (insert " Correct Canon: [" correct-canon
|
|
1446 "]\; Result: [" canon "]\n"))
|
|
1447 (insert "\n")
|
|
1448 (sit-for 0))))
|
|
1449 (setq failed-item nil))
|
|
1450
|
|
1451 (defun test-continue-extract ()
|
|
1452 (interactive)
|
|
1453 (test-extract failed-item))
|
|
1454
|
|
1455
|
|
1456 ;; Assorted junk.
|
|
1457
|
|
1458 ;; warsaw@nlm.nih.gov (A Bad Dude -- Barry Warsaw)
|
|
1459
|
|
1460 ;;'(from
|
|
1461 ;; reply-to
|
|
1462 ;; return-path
|
|
1463 ;; x-uucp-from
|
|
1464 ;; sender
|
|
1465 ;; resent-from
|
|
1466 ;; resent-sender
|
|
1467 ;; resent-reply-to)
|
|
1468
|
|
1469 ;;; mail-extr.el ends here
|