Mercurial > emacs
comparison lisp/mail/mail-extr.el @ 809:8a0066235d56
Initial revision
author | Eric S. Raymond <esr@snark.thyrsus.com> |
---|---|
date | Fri, 17 Jul 1992 06:48:03 +0000 |
parents | |
children | 20674ae6bf52 |
comparison
equal
deleted
inserted
replaced
808:707866b2a190 | 809:8a0066235d56 |
---|---|
1 ;;; mail-extr.el --- extract full name and address from RFC 822 mail header. | |
2 | |
3 ;; Author: Joe Wells <jbw@cs.bu.edu> | |
4 ;; Last-Modified: 7 Apr 1992 | |
5 ;; Version: 1.0 | |
6 ;; Adapted-By: ESR | |
7 ;; Keywords: mail | |
8 | |
9 ;; Copyright (C) 1992 Free Software Foundation, Inc. | |
10 | |
11 ;; This file is part of GNU Emacs. | |
12 | |
13 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
14 ;; it under the terms of the GNU General Public License as published by | |
15 ;; the Free Software Foundation; either version 1, or (at your option) | |
16 ;; any later version. | |
17 | |
18 ;; GNU Emacs is distributed in the hope that it will be useful, | |
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 ;; GNU General Public License for more details. | |
22 | |
23 ;; You should have received a copy of the GNU General Public License | |
24 ;; along with GNU Emacs; see the file COPYING. If not, write to | |
25 ;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | |
26 | |
27 ;;; Commentary: | |
28 | |
29 ;; Here is `mail-extr', a package for extracting full names and canonical | |
30 ;; addresses from RFC 822 mail headers. It is intended to be hooked into | |
31 ;; other Emacs Lisp packages that deal with RFC 822 format messages, such as | |
32 ;; Gnews, GNUS, RMAIL, MH-E, BBDB, VM, Supercite, etc. Thus, this release is | |
33 ;; mainly for Emacs Lisp developers. | |
34 | |
35 ;; There are two main benefits: | |
36 | |
37 ;; 1. Higher probability of getting the correct full name for a human than | |
38 ;; any other package I know of. (On the other hand, it will cheerfully | |
39 ;; mangle non-human names/comments.) | |
40 ;; 2. Address part is put in a canonical form. | |
41 | |
42 ;; The interface is not yet carved in stone; please give me suggestions. | |
43 | |
44 ;; I have an extensive test-case collection of funny addresses if you want to | |
45 ;; work with the code. Developing this code requires frequent testing to | |
46 ;; make sure you're not breaking functionality. I'm not posting the | |
47 ;; test-cases because they take over 100K. | |
48 | |
49 ;; If you find an address that mail-extr fails on, please send it to me along | |
50 ;; with what you think the correct results should be. I do not consider it a | |
51 ;; bug if mail-extr mangles a comment that does not correspond to a real | |
52 ;; human full name, although I would prefer that mail-extr would return the | |
53 ;; comment as-is. | |
54 | |
55 ;; Features: | |
56 | |
57 ;; * Full name handling: | |
58 | |
59 ;; * knows where full names can be found in an address. | |
60 ;; * avoids using empty comments and quoted text. | |
61 ;; * extracts full names from mailbox names. | |
62 ;; * recognizes common formats for comments after a full name. | |
63 ;; * puts a period and a space after each initial. | |
64 ;; * understands & referring to the mailbox name capitalized. | |
65 ;; * strips name prefixes like "Prof.", etc.. | |
66 ;; * understands what characters can occur in names (not just letters). | |
67 ;; * figures out middle initial from mailbox name. | |
68 ;; * removes funny nicknames. | |
69 ;; * keeps suffixes such as Jr., Sr., III, etc. | |
70 ;; * reorders "Last, First" type names. | |
71 | |
72 ;; * Address handling: | |
73 | |
74 ;; * parses rfc822 quoted text, comments, and domain literals. | |
75 ;; * parses rfc822 multi-line headers. | |
76 ;; * does something reasonable with rfc822 GROUP addresses. | |
77 ;; * handles many rfc822 noncompliant and garbage addresses. | |
78 ;; * canonicalizes addresses (after stripping comments/phrases outside <>). | |
79 ;; * converts ! addresses into .UUCP and %-style addresses. | |
80 ;; * converts rfc822 ROUTE addresses to %-style addresses. | |
81 ;; * truncates %-style addresses at leftmost fully qualified domain name. | |
82 ;; * handles local relative precedence of ! vs. % and @ (untested). | |
83 | |
84 ;; It does almost no string creation. It primarily uses the built-in | |
85 ;; parsing routines with the appropriate syntax tables. This should | |
86 ;; result in greater speed. | |
87 | |
88 ;; TODO: | |
89 | |
90 ;; * handle all test cases. (This will take forever.) | |
91 ;; * software to pick the correct header to use (eg., "Senders-Name:"). | |
92 ;; * multiple addresses in the "From:" header (almost all of the necessary | |
93 ;; code is there). | |
94 ;; * flag to not treat `,' as an address separator. (This is useful when | |
95 ;; there is a "From:" header but no "Sender:" header, because then there | |
96 ;; is only allowed to be one address.) | |
97 ;; * mailbox name does not necessarily contain full name. | |
98 ;; * fixing capitalization when it's all upper or lowercase. (Hard!) | |
99 ;; * some of the domain literal handling is missing. (But I've never even | |
100 ;; seen one of these in a mail address, so maybe no big deal.) | |
101 ;; * arrange to have syntax tables byte-compiled. | |
102 ;; * speed hacks. | |
103 ;; * delete unused variables. | |
104 ;; * arrange for testing with different relative precedences of ! vs. @ | |
105 ;; and %. | |
106 ;; * put variant-method back into mail-extract-address-components. | |
107 ;; * insert documentation strings! | |
108 ;; * handle X.400-gatewayed addresses according to RFC 1148. | |
109 | |
110 ;;; Change Log: | |
111 ;; | |
112 ;; Mon Apr 6 23:59:09 1992 Joe Wells (jbw at bigbird.bu.edu) | |
113 ;; | |
114 ;; * Cleaned up some more. Release version 1.0 to world. | |
115 ;; | |
116 ;; Sun Apr 5 19:39:08 1992 Joe Wells (jbw at bigbird.bu.edu) | |
117 ;; | |
118 ;; * Cleaned up full name extraction extensively. | |
119 ;; | |
120 ;; Sun Feb 2 14:45:24 1992 Joe Wells (jbw at bigbird.bu.edu) | |
121 ;; | |
122 ;; * Total rewrite. Integrated mail-canonicalize-address into | |
123 ;; mail-extract-address-components. Now handles GROUP addresses more | |
124 ;; or less correctly. Better handling of lots of different cases. | |
125 ;; | |
126 ;; Fri Jun 14 19:39:50 1991 | |
127 ;; * Created. | |
128 | |
129 ;;; Code: | |
130 | |
131 ;; Variable definitions. | |
132 | |
133 (defvar mail-@-binds-tighter-than-! nil) | |
134 | |
135 ;;---------------------------------------------------------------------- | |
136 ;; what orderings are meaningful????? | |
137 ;;(defvar mail-operator-precedence-list '(?! ?% ?@)) | |
138 ;; Right operand of a % or a @ must be a domain name, period. No other | |
139 ;; operators allowed. Left operand of a @ is an address relative to that | |
140 ;; site. | |
141 | |
142 ;; Left operand of a ! must be a domain name. Right operand is an | |
143 ;; arbitrary address. | |
144 ;;---------------------------------------------------------------------- | |
145 | |
146 (defconst mail-space-char 32) | |
147 | |
148 (defconst mail-whitespace " \t\n") | |
149 | |
150 ;; Any character that can occur in a name in an RFC822 address. | |
151 ;; Yes, there are weird people with digits in their names. | |
152 (defconst mail-all-letters "A-Za-z---{|}'~0-9`.") | |
153 | |
154 ;; Any character that can occur in a name, not counting characters that | |
155 ;; separate parts of a multipart name. | |
156 (defconst mail-all-letters-but-separators "A-Za-z{|}'~0-9`") | |
157 | |
158 ;; Any character that can start a name | |
159 (defconst mail-first-letters "A-Za-z") | |
160 | |
161 ;; Any character that can end a name. | |
162 (defconst mail-last-letters "A-Za-z`'.") | |
163 | |
164 ;; Matches an initial not followed by both a period and a space. | |
165 (defconst mail-bad-initials-pattern | |
166 (format "\\(\\([^%s]\\|\\`\\)[%s]\\)\\(\\.\\([^ ]\\)\\| \\|\\([^%s .]\\)\\|\\'\\)" | |
167 mail-all-letters mail-first-letters mail-all-letters)) | |
168 | |
169 (defconst mail-non-name-chars (concat "^" mail-all-letters ".")) | |
170 | |
171 (defconst mail-non-begin-name-chars (concat "^" mail-first-letters)) | |
172 | |
173 (defconst mail-non-end-name-chars (concat "^" mail-last-letters)) | |
174 | |
175 ;; Matches periods used instead of spaces. Must not match the period | |
176 ;; following an initial. | |
177 (defconst mail-bad-\.-pattern | |
178 (format "\\([%s][%s]\\)\\.+\\([%s]\\)" | |
179 mail-all-letters mail-last-letters mail-first-letters)) | |
180 | |
181 ;; Matches an embedded or leading nickname that should be removed. | |
182 (defconst mail-nickname-pattern | |
183 (format "\\([ .]\\|\\`\\)[\"'`\[\(]\\([ .%s]+\\)[\]\"'\)] " | |
184 mail-all-letters)) | |
185 | |
186 ;; Matches a leading title that is not part of the name (does not | |
187 ;; contribute to uniquely identifying the person). | |
188 (defconst mail-full-name-prefixes | |
189 '"\\` *\\(Prof\\|Dr\\|Mrs?\\|Rev\\|Rabbi\\|SysOp\\|LCDR\\)\\.? ") | |
190 | |
191 ;; Matches the occurrence of a generational name suffix, and the last | |
192 ;; character of the preceding name. | |
193 (defconst mail-full-name-suffix-pattern | |
194 (format | |
195 "\\(,? ?\\([JjSs]r\\.?\\|V?I+V?\\)\\)\\([^%s]\\([^%s]\\|\\'\\)\\|\\'\\)" | |
196 mail-all-letters mail-all-letters)) | |
197 | |
198 (defconst mail-roman-numeral-pattern | |
199 "V?I+V?\\b") | |
200 | |
201 ;; Matches a trailing uppercase (with other characters possible) acronym. | |
202 ;; Must not match a trailing uppercase last name or trailing initial | |
203 (defconst mail-weird-acronym-pattern "\\([A-Z]+[-_/]\\|[A-Z][A-Z][A-Z]?\\b\\)") | |
204 | |
205 ;; Matches a mixed-case or lowercase name (not an initial). | |
206 (defconst mail-mixed-case-name-pattern | |
207 (format | |
208 "\\b\\([a-z][%s]*[%s]\\|[%s][%s]*[a-z][%s]*[%s]\\|[%s][%s]*[a-z]\\)" | |
209 mail-all-letters mail-last-letters | |
210 mail-first-letters mail-all-letters mail-all-letters mail-last-letters | |
211 mail-first-letters mail-all-letters)) | |
212 | |
213 ;; Matches a trailing alternative address. | |
214 (defconst mail-alternative-address-pattern "[a-zA-Z.]+[!@][a-zA-Z.]") | |
215 | |
216 ;; Matches a variety of trailing comments not including comma-delimited | |
217 ;; comments. | |
218 (defconst mail-trailing-comment-start-pattern " [-{]\\|--\\|[+@#></\;]") | |
219 | |
220 ;; Matches a name (not an initial). | |
221 ;; This doesn't force a word boundary at the end because sometimes a | |
222 ;; comment is separated by a `-' with no preceding space. | |
223 (defconst mail-name-pattern | |
224 (format | |
225 "\\b[%s][%s]*[%s]" | |
226 mail-first-letters mail-all-letters mail-last-letters)) | |
227 | |
228 (defconst mail-initial-pattern | |
229 (format "\\b[%s]\\([. ]\\|\\b\\)" mail-first-letters)) | |
230 | |
231 ;; Matches a single name before a comma. | |
232 (defconst mail-last-name-first-pattern | |
233 (concat "\\`" mail-name-pattern ",")) | |
234 | |
235 ;; Matches telephone extensions. | |
236 (defconst mail-telephone-extension-pattern | |
237 "\\(\\([Ee]xt\\|[Tt]el\\|[Xx]\\).?\\)? *\\+?[0-9][- 0-9]+") | |
238 | |
239 ;; Matches ham radio call signs. | |
240 (defconst mail-ham-call-sign-pattern | |
241 "\\b[A-Z]+[0-9][A-Z0-9]*") | |
242 | |
243 ;; Matches normal single-part name | |
244 (defconst mail-normal-name-pattern | |
245 (format | |
246 "\\b[%s][%s]+[%s]" | |
247 mail-first-letters mail-all-letters-but-separators mail-last-letters)) | |
248 | |
249 ;; Matches normal two names with missing middle initial | |
250 (defconst mail-two-name-pattern | |
251 (concat "\\`\\(" mail-normal-name-pattern | |
252 "\\|" mail-initial-pattern | |
253 "\\) +\\(" mail-normal-name-pattern "\\)\\(,\\|\\'\\)")) | |
254 | |
255 (defvar address-syntax-table (make-syntax-table)) | |
256 (defvar address-comment-syntax-table (make-syntax-table)) | |
257 (defvar address-domain-literal-syntax-table (make-syntax-table)) | |
258 (defvar address-text-comment-syntax-table (make-syntax-table)) | |
259 (defvar address-text-syntax-table (make-syntax-table)) | |
260 (mapcar | |
261 (function | |
262 (lambda (pair) | |
263 (let ((syntax-table (symbol-value (car pair)))) | |
264 (mapcar | |
265 (function | |
266 (lambda (item) | |
267 (if (eq 2 (length item)) | |
268 (modify-syntax-entry (car item) (car (cdr item)) syntax-table) | |
269 (let ((char (car item)) | |
270 (bound (car (cdr item))) | |
271 (syntax (car (cdr (cdr item))))) | |
272 (while (<= char bound) | |
273 (modify-syntax-entry char syntax syntax-table) | |
274 (setq char (1+ char))))))) | |
275 (cdr pair))))) | |
276 '((address-syntax-table | |
277 (0 31 "w") ;control characters | |
278 (32 " ") ;SPC | |
279 (?! ?~ "w") ;printable characters | |
280 (127 "w") ;DEL | |
281 (128 255 "w") ;high-bit-on characters | |
282 (?\t " ") | |
283 (?\r " ") | |
284 (?\n " ") | |
285 (?\( ".") | |
286 (?\) ".") | |
287 (?< ".") | |
288 (?> ".") | |
289 (?@ ".") | |
290 (?, ".") | |
291 (?\; ".") | |
292 (?: ".") | |
293 (?\\ "\\") | |
294 (?\" "\"") | |
295 (?. ".") | |
296 (?\[ ".") | |
297 (?\] ".") | |
298 ;; % and ! aren't RFC822 characters, but it is convenient to pretend | |
299 (?% ".") | |
300 (?! ".") | |
301 ) | |
302 (address-comment-syntax-table | |
303 (0 255 "w") | |
304 (?\( "\(\)") | |
305 (?\) "\)\(") | |
306 (?\\ "\\")) | |
307 (address-domain-literal-syntax-table | |
308 (0 255 "w") | |
309 (?\[ "\(\]") ;?????? | |
310 (?\] "\)\[") ;?????? | |
311 (?\\ "\\")) | |
312 (address-text-comment-syntax-table | |
313 (0 255 "w") | |
314 (?\( "\(\)") | |
315 (?\) "\)\(") | |
316 (?\[ "\(\]") | |
317 (?\] "\)\[") | |
318 (?\{ "\(\}") | |
319 (?\} "\)\{") | |
320 (?\\ "\\") | |
321 (?\" "\"") | |
322 ;; (?\' "\)\`") | |
323 ;; (?\` "\(\'") | |
324 ) | |
325 (address-text-syntax-table | |
326 (0 255 ".") | |
327 (?A ?Z "w") | |
328 (?a ?z "w") | |
329 (?- "w") | |
330 (?\} "w") | |
331 (?\{ "w") | |
332 (?| "w") | |
333 (?\' "w") | |
334 (?~ "w") | |
335 (?0 ?9 "w")) | |
336 )) | |
337 | |
338 | |
339 ;; Utility functions and macros. | |
340 | |
341 (defmacro undo-backslash-quoting (beg end) | |
342 (`(save-excursion | |
343 (save-restriction | |
344 (narrow-to-region (, beg) (, end)) | |
345 (goto-char (point-min)) | |
346 ;; undo \ quoting | |
347 (while (re-search-forward "\\\\\\(.\\)" nil t) | |
348 (replace-match "\\1") | |
349 ;; CHECK: does this leave point after the replacement? | |
350 ))))) | |
351 | |
352 (defmacro mail-nuke-char-at (pos) | |
353 (` (save-excursion | |
354 (goto-char (, pos)) | |
355 (delete-char 1) | |
356 (insert mail-space-char)))) | |
357 | |
358 (defmacro mail-nuke-elements-outside-range (list-symbol beg-symbol end-symbol | |
359 &optional no-replace) | |
360 (` (progn | |
361 (setq temp (, list-symbol)) | |
362 (while temp | |
363 (cond ((or (> (car temp) (, end-symbol)) | |
364 (< (car temp) (, beg-symbol))) | |
365 (, (or no-replace | |
366 (` (mail-nuke-char-at (car temp))))) | |
367 (setcar temp nil))) | |
368 (setq temp (cdr temp))) | |
369 (setq (, list-symbol) (delq nil (, list-symbol)))))) | |
370 | |
371 (defun mail-demarkerize (marker) | |
372 (and marker | |
373 (if (markerp marker) | |
374 (let ((temp (marker-position marker))) | |
375 (set-marker marker nil) | |
376 temp) | |
377 marker))) | |
378 | |
379 (defun mail-markerize (pos) | |
380 (and pos | |
381 (if (markerp pos) | |
382 pos | |
383 (copy-marker pos)))) | |
384 | |
385 (defmacro mail-last-element (list) | |
386 "Return last element of LIST." | |
387 (` (let ((list (, list))) | |
388 (while (not (null (cdr list))) | |
389 (setq list (cdr list))) | |
390 (car list)))) | |
391 | |
392 (defmacro safe-move-sexp (arg) | |
393 "Safely skip over one balanced sexp, if there is one. Return t if success." | |
394 (` (condition-case error | |
395 (progn | |
396 (goto-char (scan-sexps (point) (, arg))) | |
397 t) | |
398 (error | |
399 (if (string-equal (nth 1 error) "Unbalanced parentheses") | |
400 nil | |
401 (while t | |
402 (signal (car error) (cdr error)))))))) | |
403 | |
404 | |
405 ;; The main function to grind addresses | |
406 | |
407 (defun mail-extract-address-components (address) | |
408 "Given an rfc 822 ADDRESS, extract full name and canonical address. | |
409 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS)." | |
410 (let ((canonicalization-buffer (get-buffer-create "*canonical address*")) | |
411 (extraction-buffer (get-buffer-create "*extract address components*")) | |
412 (foo 'bar) | |
413 char | |
414 multiple-addresses | |
415 <-pos >-pos @-pos :-pos ,-pos !-pos %-pos \;-pos | |
416 group-:-pos group-\;-pos route-addr-:-pos | |
417 record-pos-symbol | |
418 first-real-pos last-real-pos | |
419 phrase-beg phrase-end | |
420 comment-beg comment-end | |
421 quote-beg quote-end | |
422 atom-beg atom-end | |
423 mbox-beg mbox-end | |
424 \.-ends-name | |
425 temp | |
426 name-suffix | |
427 saved-point | |
428 fi mi li | |
429 saved-%-pos saved-!-pos saved-@-pos | |
430 domain-pos \.-pos insert-point) | |
431 | |
432 (save-excursion | |
433 (set-buffer extraction-buffer) | |
434 (buffer-flush-undo extraction-buffer) | |
435 (set-syntax-table address-syntax-table) | |
436 (widen) | |
437 (erase-buffer) | |
438 (setq case-fold-search nil) | |
439 | |
440 ;; Insert extra space at beginning to allow later replacement with < | |
441 ;; without having to move markers. | |
442 (insert mail-space-char address) | |
443 | |
444 ;; stolen from rfc822.el | |
445 ;; Unfold multiple lines. | |
446 (goto-char (point-min)) | |
447 (while (re-search-forward "\\([^\\]\\(\\\\\\\\\\)*\\)\n[ \t]" nil t) | |
448 (replace-match "\\1 " t)) | |
449 | |
450 ;; first pass grabs useful information about address | |
451 (goto-char (point-min)) | |
452 (while (progn | |
453 (skip-chars-forward mail-whitespace) | |
454 (not (eobp))) | |
455 (setq char (char-after (point))) | |
456 (or first-real-pos | |
457 (if (not (eq char ?\()) | |
458 (setq first-real-pos (point)))) | |
459 (cond | |
460 ;; comment | |
461 ((eq char ?\() | |
462 (set-syntax-table address-comment-syntax-table) | |
463 ;; only record the first non-empty comment's position | |
464 (if (and (not comment-beg) | |
465 (save-excursion | |
466 (forward-char 1) | |
467 (skip-chars-forward mail-whitespace) | |
468 (not (eq ?\) (char-after (point)))))) | |
469 (setq comment-beg (point))) | |
470 ;; TODO: don't record if unbalanced | |
471 (or (safe-move-sexp 1) | |
472 (forward-char 1)) | |
473 (set-syntax-table address-syntax-table) | |
474 (if (and comment-beg | |
475 (not comment-end)) | |
476 (setq comment-end (point)))) | |
477 ;; quoted text | |
478 ((eq char ?\") | |
479 ;; only record the first non-empty quote's position | |
480 (if (and (not quote-beg) | |
481 (save-excursion | |
482 (forward-char 1) | |
483 (skip-chars-forward mail-whitespace) | |
484 (not (eq ?\" (char-after (point)))))) | |
485 (setq quote-beg (point))) | |
486 ;; TODO: don't record if unbalanced | |
487 (or (safe-move-sexp 1) | |
488 (forward-char 1)) | |
489 (if (and quote-beg | |
490 (not quote-end)) | |
491 (setq quote-end (point)))) | |
492 ;; domain literals | |
493 ((eq char ?\[) | |
494 (set-syntax-table address-domain-literal-syntax-table) | |
495 (or (safe-move-sexp 1) | |
496 (forward-char 1)) | |
497 (set-syntax-table address-syntax-table)) | |
498 ;; commas delimit addresses when outside < > pairs. | |
499 ((and (eq char ?,) | |
500 (or (null <-pos) | |
501 (and >-pos | |
502 ;; handle weird munged addresses | |
503 (> (mail-last-element <-pos) (car >-pos))))) | |
504 (setq multiple-addresses t) | |
505 (delete-char 1) | |
506 (narrow-to-region (point-min) (point))) | |
507 ;; record the position of various interesting chars, determine | |
508 ;; legality later. | |
509 ((setq record-pos-symbol | |
510 (cdr (assq char | |
511 '((?< . <-pos) (?> . >-pos) (?@ . @-pos) | |
512 (?: . :-pos) (?, . ,-pos) (?! . !-pos) | |
513 (?% . %-pos) (?\; . \;-pos))))) | |
514 (set record-pos-symbol | |
515 (cons (point) (symbol-value record-pos-symbol))) | |
516 (forward-char 1)) | |
517 ((eq char ?.) | |
518 (forward-char 1)) | |
519 ((memq char '( | |
520 ;; comment terminator illegal | |
521 ?\) | |
522 ;; domain literal terminator illegal | |
523 ?\] | |
524 ;; \ allowed only within quoted strings, | |
525 ;; domain literals, and comments | |
526 ?\\ | |
527 )) | |
528 (mail-nuke-char-at (point)) | |
529 (forward-char 1)) | |
530 (t | |
531 (forward-word 1))) | |
532 (or (eq char ?\() | |
533 (setq last-real-pos (point)))) | |
534 | |
535 ;; Use only the leftmost <, if any. Replace all others with spaces. | |
536 (while (cdr <-pos) | |
537 (mail-nuke-char-at (car <-pos)) | |
538 (setq <-pos (cdr <-pos))) | |
539 | |
540 ;; Use only the rightmost >, if any. Replace all others with spaces. | |
541 (while (cdr >-pos) | |
542 (mail-nuke-char-at (nth 1 >-pos)) | |
543 (setcdr >-pos (nthcdr 2 >-pos))) | |
544 | |
545 ;; If multiple @s and a :, but no < and >, insert around buffer. | |
546 ;; This commonly happens on the UUCP "From " line. Ugh. | |
547 (cond ((and (> (length @-pos) 1) | |
548 :-pos ;TODO: check if between @s | |
549 (not <-pos)) | |
550 (goto-char (point-min)) | |
551 (delete-char 1) | |
552 (setq <-pos (list (point))) | |
553 (insert ?<))) | |
554 | |
555 ;; If < but no >, insert > in rightmost possible position | |
556 (cond ((and <-pos | |
557 (null >-pos)) | |
558 (goto-char (point-max)) | |
559 (setq >-pos (list (point))) | |
560 (insert ?>))) | |
561 | |
562 ;; If > but no <, replace > with space. | |
563 (cond ((and >-pos | |
564 (null <-pos)) | |
565 (mail-nuke-char-at (car >-pos)) | |
566 (setq >-pos nil))) | |
567 | |
568 ;; Turn >-pos and <-pos into non-lists | |
569 (setq >-pos (car >-pos) | |
570 <-pos (car <-pos)) | |
571 | |
572 ;; Trim other punctuation lists of items outside < > pair to handle | |
573 ;; stupid MTAs. | |
574 (cond (<-pos ; don't need to check >-pos also | |
575 ;; handle bozo software that violates RFC 822 by sticking | |
576 ;; punctuation marks outside of a < > pair | |
577 (mail-nuke-elements-outside-range @-pos <-pos >-pos t) | |
578 ;; RFC 822 says nothing about these two outside < >, but | |
579 ;; remove those positions from the lists to make things | |
580 ;; easier. | |
581 (mail-nuke-elements-outside-range !-pos <-pos >-pos t) | |
582 (mail-nuke-elements-outside-range %-pos <-pos >-pos t))) | |
583 | |
584 ;; Check for : that indicates GROUP list and for : part of | |
585 ;; ROUTE-ADDR spec. | |
586 ;; Can't possibly be more than two :. Nuke any extra. | |
587 (while :-pos | |
588 (setq temp (car :-pos) | |
589 :-pos (cdr :-pos)) | |
590 (cond ((and <-pos >-pos | |
591 (> temp <-pos) | |
592 (< temp >-pos)) | |
593 (if (or route-addr-:-pos | |
594 (< (length @-pos) 2) | |
595 (> temp (car @-pos)) | |
596 (< temp (nth 1 @-pos))) | |
597 (mail-nuke-char-at temp) | |
598 (setq route-addr-:-pos temp))) | |
599 ((or (not <-pos) | |
600 (and <-pos | |
601 (< temp <-pos))) | |
602 (setq group-:-pos temp)))) | |
603 | |
604 ;; Nuke any ; that is in or to the left of a < > pair or to the left | |
605 ;; of a GROUP starting :. Also, there may only be one ;. | |
606 (while \;-pos | |
607 (setq temp (car \;-pos) | |
608 \;-pos (cdr \;-pos)) | |
609 (cond ((and <-pos >-pos | |
610 (> temp <-pos) | |
611 (< temp >-pos)) | |
612 (mail-nuke-char-at temp)) | |
613 ((and (or (not group-:-pos) | |
614 (> temp group-:-pos)) | |
615 (not group-\;-pos)) | |
616 (setq group-\;-pos temp)))) | |
617 | |
618 ;; Handle junk like ";@host.company.dom" that sendmail adds. | |
619 ;; **** should I remember comment positions? | |
620 (and group-\;-pos | |
621 ;; this is fine for now | |
622 (mail-nuke-elements-outside-range !-pos group-:-pos group-\;-pos t) | |
623 (mail-nuke-elements-outside-range @-pos group-:-pos group-\;-pos t) | |
624 (mail-nuke-elements-outside-range %-pos group-:-pos group-\;-pos t) | |
625 (mail-nuke-elements-outside-range ,-pos group-:-pos group-\;-pos t) | |
626 (and last-real-pos | |
627 (> last-real-pos (1+ group-\;-pos)) | |
628 (setq last-real-pos (1+ group-\;-pos))) | |
629 (and comment-end | |
630 (> comment-end group-\;-pos) | |
631 (setq comment-end nil | |
632 comment-beg nil)) | |
633 (and quote-end | |
634 (> quote-end group-\;-pos) | |
635 (setq quote-end nil | |
636 quote-beg nil)) | |
637 (narrow-to-region (point-min) group-\;-pos)) | |
638 | |
639 ;; Any commas must be between < and : of ROUTE-ADDR. Nuke any | |
640 ;; others. | |
641 ;; Hell, go ahead an nuke all of the commas. | |
642 ;; **** This will cause problems when we start handling commas in | |
643 ;; the PHRASE part .... no it won't ... yes it will ... ????? | |
644 (mail-nuke-elements-outside-range ,-pos 1 1) | |
645 | |
646 ;; can only have multiple @s inside < >. The fact that some MTAs | |
647 ;; put de-bracketed ROUTE-ADDRs in the UUCP-style "From " line is | |
648 ;; handled above. | |
649 | |
650 ;; Locate PHRASE part of ROUTE-ADDR. | |
651 (cond (<-pos | |
652 (goto-char <-pos) | |
653 (skip-chars-backward mail-whitespace) | |
654 (setq phrase-end (point)) | |
655 (goto-char (or ;;group-:-pos | |
656 (point-min))) | |
657 (skip-chars-forward mail-whitespace) | |
658 (if (< (point) phrase-end) | |
659 (setq phrase-beg (point)) | |
660 (setq phrase-end nil)))) | |
661 | |
662 ;; handle ROUTE-ADDRS with real ROUTEs. | |
663 ;; If there are multiple @s, then we assume ROUTE-ADDR syntax, and | |
664 ;; any % or ! must be semantically meaningless. | |
665 ;; TODO: do this processing into canonicalization buffer | |
666 (cond (route-addr-:-pos | |
667 (setq !-pos nil | |
668 %-pos nil | |
669 >-pos (copy-marker >-pos) | |
670 route-addr-:-pos (copy-marker route-addr-:-pos)) | |
671 (goto-char >-pos) | |
672 (insert-before-markers ?X) | |
673 (goto-char (car @-pos)) | |
674 (while (setq @-pos (cdr @-pos)) | |
675 (delete-char 1) | |
676 (setq %-pos (cons (point-marker) %-pos)) | |
677 (insert "%") | |
678 (goto-char (1- >-pos)) | |
679 (save-excursion | |
680 (insert-buffer-substring extraction-buffer | |
681 (car @-pos) route-addr-:-pos) | |
682 (delete-region (car @-pos) route-addr-:-pos)) | |
683 (or (cdr @-pos) | |
684 (setq saved-@-pos (list (point))))) | |
685 (setq @-pos saved-@-pos) | |
686 (goto-char >-pos) | |
687 (delete-char -1) | |
688 (mail-nuke-char-at route-addr-:-pos) | |
689 (mail-demarkerize route-addr-:-pos) | |
690 (setq route-addr-:-pos nil | |
691 >-pos (mail-demarkerize >-pos) | |
692 %-pos (mapcar 'mail-demarkerize %-pos)))) | |
693 | |
694 ;; de-listify @-pos | |
695 (setq @-pos (car @-pos)) | |
696 | |
697 ;; TODO: remove comments in the middle of an address | |
698 | |
699 (set-buffer canonicalization-buffer) | |
700 | |
701 (buffer-flush-undo canonicalization-buffer) | |
702 (set-syntax-table address-syntax-table) | |
703 (setq case-fold-search nil) | |
704 | |
705 (widen) | |
706 (erase-buffer) | |
707 (insert-buffer-substring extraction-buffer) | |
708 | |
709 (if <-pos | |
710 (narrow-to-region (progn | |
711 (goto-char (1+ <-pos)) | |
712 (skip-chars-forward mail-whitespace) | |
713 (point)) | |
714 >-pos) | |
715 ;; ****** Oh no! What if the address is completely empty! | |
716 (narrow-to-region first-real-pos last-real-pos)) | |
717 | |
718 (and @-pos %-pos | |
719 (mail-nuke-elements-outside-range %-pos (point-min) @-pos)) | |
720 (and %-pos !-pos | |
721 (mail-nuke-elements-outside-range !-pos (point-min) (car %-pos))) | |
722 (and @-pos !-pos (not %-pos) | |
723 (mail-nuke-elements-outside-range !-pos (point-min) @-pos)) | |
724 | |
725 ;; Error condition:?? (and %-pos (not @-pos)) | |
726 | |
727 (cond (!-pos | |
728 ;; **** I don't understand this save-restriction and the | |
729 ;; narrow-to-region inside it. Why did I do that? | |
730 (save-restriction | |
731 (cond ((and @-pos | |
732 mail-@-binds-tighter-than-!) | |
733 (goto-char @-pos) | |
734 (setq %-pos (cons (point) %-pos) | |
735 @-pos nil) | |
736 (delete-char 1) | |
737 (insert "%") | |
738 (setq insert-point (point-max))) | |
739 (mail-@-binds-tighter-than-! | |
740 (setq insert-point (point-max))) | |
741 (%-pos | |
742 (setq insert-point (mail-last-element %-pos) | |
743 saved-%-pos (mapcar 'mail-markerize %-pos) | |
744 %-pos nil | |
745 @-pos (mail-markerize @-pos))) | |
746 (@-pos | |
747 (setq insert-point @-pos) | |
748 (setq @-pos (mail-markerize @-pos))) | |
749 (t | |
750 (setq insert-point (point-max)))) | |
751 (narrow-to-region (point-min) insert-point) | |
752 (setq saved-!-pos (car !-pos)) | |
753 (while !-pos | |
754 (goto-char (point-max)) | |
755 (cond ((and (not @-pos) | |
756 (not (cdr !-pos))) | |
757 (setq @-pos (point)) | |
758 (insert-before-markers "@ ")) | |
759 (t | |
760 (setq %-pos (cons (point) %-pos)) | |
761 (insert-before-markers "% "))) | |
762 (backward-char 1) | |
763 (insert-buffer-substring | |
764 (current-buffer) | |
765 (if (nth 1 !-pos) | |
766 (1+ (nth 1 !-pos)) | |
767 (point-min)) | |
768 (car !-pos)) | |
769 (delete-char 1) | |
770 (or (save-excursion | |
771 (safe-move-sexp -1) | |
772 (skip-chars-backward mail-whitespace) | |
773 (eq ?. (preceding-char))) | |
774 (insert-before-markers | |
775 (if (save-excursion | |
776 (skip-chars-backward mail-whitespace) | |
777 (eq ?. (preceding-char))) | |
778 "" | |
779 ".") | |
780 "uucp")) | |
781 (setq !-pos (cdr !-pos)))) | |
782 (and saved-%-pos | |
783 (setq %-pos (append (mapcar 'mail-demarkerize saved-%-pos) | |
784 %-pos))) | |
785 (setq @-pos (mail-demarkerize @-pos)) | |
786 (narrow-to-region (1+ saved-!-pos) (point-max)))) | |
787 (cond ((and %-pos | |
788 (not @-pos)) | |
789 (goto-char (car %-pos)) | |
790 (delete-char 1) | |
791 (setq @-pos (point)) | |
792 (insert "@") | |
793 (setq %-pos (cdr %-pos)))) | |
794 (setq %-pos (nreverse %-pos)) | |
795 ;; RFC 1034 doesn't approve of this, oh well: | |
796 (downcase-region (or (car %-pos) @-pos (point-max)) (point-max)) | |
797 (cond (%-pos ; implies @-pos valid | |
798 (setq temp %-pos) | |
799 (catch 'truncated | |
800 (while temp | |
801 (goto-char (or (nth 1 temp) | |
802 @-pos)) | |
803 (skip-chars-backward mail-whitespace) | |
804 (save-excursion | |
805 (safe-move-sexp -1) | |
806 (setq domain-pos (point)) | |
807 (skip-chars-backward mail-whitespace) | |
808 (setq \.-pos (eq ?. (preceding-char)))) | |
809 (cond ((and \.-pos | |
810 (get | |
811 (intern | |
812 (buffer-substring domain-pos (point))) | |
813 'domain-name)) | |
814 (narrow-to-region (point-min) (point)) | |
815 (goto-char (car temp)) | |
816 (delete-char 1) | |
817 (setq @-pos (point)) | |
818 (setcdr temp nil) | |
819 (setq %-pos (delq @-pos %-pos)) | |
820 (insert "@") | |
821 (throw 'truncated t))) | |
822 (setq temp (cdr temp)))))) | |
823 (setq mbox-beg (point-min) | |
824 mbox-end (if %-pos (car %-pos) | |
825 (or @-pos | |
826 (point-max)))) | |
827 | |
828 ;; Done canonicalizing address. | |
829 | |
830 (set-buffer extraction-buffer) | |
831 | |
832 ;; Find the full name | |
833 | |
834 (cond ((and phrase-beg | |
835 (eq quote-beg phrase-beg) | |
836 (<= quote-end phrase-end)) | |
837 (narrow-to-region (1+ quote-beg) (1- quote-end)) | |
838 (undo-backslash-quoting (point-min) (point-max))) | |
839 (phrase-beg | |
840 (narrow-to-region phrase-beg phrase-end)) | |
841 (comment-beg | |
842 (narrow-to-region (1+ comment-beg) (1- comment-end)) | |
843 (undo-backslash-quoting (point-min) (point-max))) | |
844 (t | |
845 ;; *** Work in canon buffer instead? No, can't. Hmm. | |
846 (delete-region (point-min) (point-max)) | |
847 (insert-buffer-substring canonicalization-buffer | |
848 mbox-beg mbox-end) | |
849 (goto-char (point-min)) | |
850 (setq \.-ends-name (search-forward "_" nil t)) | |
851 (goto-char (point-min)) | |
852 (while (progn | |
853 (skip-chars-forward mail-whitespace) | |
854 (not (eobp))) | |
855 (setq char (char-after (point))) | |
856 (cond | |
857 ((eq char ?\") | |
858 (setq quote-beg (point)) | |
859 (or (safe-move-sexp 1) | |
860 ;; TODO: handle this error condition!!!!! | |
861 (forward-char 1)) | |
862 ;; take into account deletions | |
863 (setq quote-end (- (point) 2)) | |
864 (save-excursion | |
865 (backward-char 1) | |
866 (delete-char 1) | |
867 (goto-char quote-beg) | |
868 (delete-char 1)) | |
869 (undo-backslash-quoting quote-beg quote-end) | |
870 (or (eq mail-space-char (char-after (point))) | |
871 (insert " ")) | |
872 (setq \.-ends-name t)) | |
873 ((eq char ?.) | |
874 (if (eq (char-after (1+ (point))) ?_) | |
875 (progn | |
876 (forward-char 1) | |
877 (delete-char 1) | |
878 (insert mail-space-char)) | |
879 (if \.-ends-name | |
880 (narrow-to-region (point-min) (point)) | |
881 (delete-char 1) | |
882 (insert " ")))) | |
883 ((memq (char-syntax char) '(?. ?\\)) | |
884 (delete-char 1) | |
885 (insert " ")) | |
886 (t | |
887 (setq atom-beg (point)) | |
888 (forward-word 1) | |
889 (setq atom-end (point)) | |
890 (save-restriction | |
891 (narrow-to-region atom-beg atom-end) | |
892 (goto-char (point-min)) | |
893 (while (re-search-forward "\\([^_]+\\)_" nil t) | |
894 (replace-match "\\1 ")) | |
895 (goto-char (point-max)))))))) | |
896 | |
897 (set-syntax-table address-text-syntax-table) | |
898 | |
899 (setq xxx (variant-method (buffer-string))) | |
900 (delete-region (point-min) (point-max)) | |
901 (insert xxx) | |
902 (goto-char (point-min)) | |
903 | |
904 ;; ;; Compress whitespace | |
905 ;; (goto-char (point-min)) | |
906 ;; (while (re-search-forward "[ \t\n]+" nil t) | |
907 ;; (replace-match " ")) | |
908 ;; | |
909 ;; ;; Fix . used as space | |
910 ;; (goto-char (point-min)) | |
911 ;; (while (re-search-forward mail-bad-\.-pattern nil t) | |
912 ;; (replace-match "\\1 \\2")) | |
913 ;; | |
914 ;; ;; Delete trailing parenthesized comment | |
915 ;; (goto-char (point-max)) | |
916 ;; (skip-chars-backward mail-whitespace) | |
917 ;; (cond ((memq (char-after (1- (point))) '(?\) ?\} ?\])) | |
918 ;; (setq comment-end (point)) | |
919 ;; (set-syntax-table address-text-comment-syntax-table) | |
920 ;; (or (safe-move-sexp -1) | |
921 ;; (backward-char 1)) | |
922 ;; (set-syntax-table address-text-syntax-table) | |
923 ;; (setq comment-beg (point)) | |
924 ;; (skip-chars-backward mail-whitespace) | |
925 ;; (if (bobp) | |
926 ;; (narrow-to-region (1+ comment-beg) (1- comment-end)) | |
927 ;; (narrow-to-region (point-min) (point))))) | |
928 ;; | |
929 ;; ;; Find, save, and delete any name suffix | |
930 ;; ;; *** Broken! | |
931 ;; (goto-char (point-min)) | |
932 ;; (cond ((re-search-forward mail-full-name-suffix-pattern nil t) | |
933 ;; (setq name-suffix (buffer-substring (match-beginning 3) | |
934 ;; (match-end 3))) | |
935 ;; (replace-match "\\1 \\4"))) | |
936 ;; | |
937 ;; ;; Delete ALL CAPS words and after, if preceded by mixed-case or | |
938 ;; ;; lowercase words. Eg. XT-DEM. | |
939 ;; (goto-char (point-min)) | |
940 ;; ;; ## This will lose on something like "SMITH MAX". | |
941 ;; ;; ## maybe it should be | |
942 ;; ;; ## " \\([A-Z]+[-_/][A-Z]+\\|[A-Z][A-Z][A-Z]\\)\\b.*[^A-Z \t]" | |
943 ;; ;; ## that is, three-letter-upper-case-word with non-upper-case | |
944 ;; ;; ## characters following it. | |
945 ;; (if (re-search-forward mail-mixed-case-name-pattern nil t) | |
946 ;; (if (re-search-forward mail-weird-acronym-pattern nil t) | |
947 ;; (narrow-to-region (point-min) (match-beginning 0)))) | |
948 ;; | |
949 ;; ;; Delete trailing alternative address | |
950 ;; (goto-char (point-min)) | |
951 ;; (if (re-search-forward mail-alternative-address-pattern nil t) | |
952 ;; (narrow-to-region (point-min) (match-beginning 0))) | |
953 ;; | |
954 ;; ;; Delete trailing comment | |
955 ;; (goto-char (point-min)) | |
956 ;; (if (re-search-forward mail-trailing-comment-start-pattern nil t) | |
957 ;; (or (progn | |
958 ;; (goto-char (match-beginning 0)) | |
959 ;; (skip-chars-backward mail-whitespace) | |
960 ;; (bobp)) | |
961 ;; (narrow-to-region (point-min) (match-beginning 0)))) | |
962 ;; | |
963 ;; ;; Delete trailing comma-separated comment | |
964 ;; (goto-char (point-min)) | |
965 ;; ;; ## doesn't this break "Smith, John"? Yes. | |
966 ;; (re-search-forward mail-last-name-first-pattern nil t) | |
967 ;; (while (search-forward "," nil t) | |
968 ;; (or (save-excursion | |
969 ;; (backward-char 2) | |
970 ;; (looking-at mail-full-name-suffix-pattern)) | |
971 ;; (narrow-to-region (point-min) (1- (point))))) | |
972 ;; | |
973 ;; ;; Delete telephone numbers and ham radio call signs | |
974 ;; (goto-char (point-min)) | |
975 ;; (if (re-search-forward mail-telephone-extension-pattern nil t) | |
976 ;; (narrow-to-region (point-min) (match-beginning 0))) | |
977 ;; (goto-char (point-min)) | |
978 ;; (if (re-search-forward mail-ham-call-sign-pattern nil t) | |
979 ;; (if (eq (match-beginning 0) (point-min)) | |
980 ;; (narrow-to-region (match-end 0) (point-max)) | |
981 ;; (narrow-to-region (point-min) (match-beginning 0)))) | |
982 ;; | |
983 ;; ;; Delete trailing word followed immediately by . | |
984 ;; (goto-char (point-min)) | |
985 ;; ;; ## what's this for? doesn't it mess up "Public, Harry Q."? No. | |
986 ;; (if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t) | |
987 ;; (narrow-to-region (point-min) (match-beginning 0))) | |
988 ;; | |
989 ;; ;; Handle & substitution | |
990 ;; ;; TODO: remember to disable middle initial guessing | |
991 ;; (goto-char (point-min)) | |
992 ;; (cond ((re-search-forward "\\( \\|\\`\\)&\\( \\|\\'\\)" nil t) | |
993 ;; (goto-char (match-end 1)) | |
994 ;; (delete-char 1) | |
995 ;; (capitalize-region | |
996 ;; (point) | |
997 ;; (progn | |
998 ;; (insert-buffer-substring canonicalization-buffer | |
999 ;; mbox-beg mbox-end) | |
1000 ;; (point))))) | |
1001 ;; | |
1002 ;; ;; Delete nickname | |
1003 ;; (goto-char (point-min)) | |
1004 ;; (if (re-search-forward mail-nickname-pattern nil t) | |
1005 ;; (replace-match (if (eq (match-beginning 2) (1- (match-end 2))) | |
1006 ;; " \\2 " | |
1007 ;; " "))) | |
1008 ;; | |
1009 ;; ;; Fixup initials | |
1010 ;; (while (progn | |
1011 ;; (goto-char (point-min)) | |
1012 ;; (re-search-forward mail-bad-initials-pattern nil t)) | |
1013 ;; (replace-match | |
1014 ;; (if (match-beginning 4) | |
1015 ;; "\\1. \\4" | |
1016 ;; (if (match-beginning 5) | |
1017 ;; "\\1. \\5" | |
1018 ;; "\\1. ")))) | |
1019 ;; | |
1020 ;; ;; Delete title | |
1021 ;; (goto-char (point-min)) | |
1022 ;; (if (re-search-forward mail-full-name-prefixes nil t) | |
1023 ;; (narrow-to-region (point) (point-max))) | |
1024 ;; | |
1025 ;; ;; Delete trailing and preceding non-name characters | |
1026 ;; (goto-char (point-min)) | |
1027 ;; (skip-chars-forward mail-non-begin-name-chars) | |
1028 ;; (narrow-to-region (point) (point-max)) | |
1029 ;; (goto-char (point-max)) | |
1030 ;; (skip-chars-backward mail-non-end-name-chars) | |
1031 ;; (narrow-to-region (point-min) (point)) | |
1032 | |
1033 ;; If name is "First Last" and userid is "F?L", then assume | |
1034 ;; the middle initial is the second letter in the userid. | |
1035 ;; initially by Jamie Zawinski <jwz@lucid.com> | |
1036 (cond ((and (eq 3 (- mbox-end mbox-beg)) | |
1037 (progn | |
1038 (goto-char (point-min)) | |
1039 (looking-at mail-two-name-pattern))) | |
1040 (setq fi (char-after (match-beginning 0)) | |
1041 li (char-after (match-beginning 3))) | |
1042 (save-excursion | |
1043 (set-buffer canonicalization-buffer) | |
1044 ;; char-equal is ignoring case here, so no need to upcase | |
1045 ;; or downcase. | |
1046 (let ((case-fold-search t)) | |
1047 (and (char-equal fi (char-after mbox-beg)) | |
1048 (char-equal li (char-after (1- mbox-end))) | |
1049 (setq mi (char-after (1+ mbox-beg)))))) | |
1050 (cond ((and mi | |
1051 ;; TODO: use better table than syntax table | |
1052 (eq ?w (char-syntax mi))) | |
1053 (goto-char (match-beginning 3)) | |
1054 (insert (upcase mi) ". "))))) | |
1055 | |
1056 ;; ;; Restore suffix | |
1057 ;; (cond (name-suffix | |
1058 ;; (goto-char (point-max)) | |
1059 ;; (insert ", " name-suffix) | |
1060 ;; (backward-word 1) | |
1061 ;; (cond ((memq (following-char) '(?j ?J ?s ?S)) | |
1062 ;; (capitalize-word 1) | |
1063 ;; (or (eq (following-char) ?.) | |
1064 ;; (insert ?.))) | |
1065 ;; (t | |
1066 ;; (upcase-word 1))))) | |
1067 | |
1068 ;; Result | |
1069 (list (buffer-string) | |
1070 (progn | |
1071 (set-buffer canonicalization-buffer) | |
1072 (buffer-string))) | |
1073 ))) | |
1074 | |
1075 ;; TODO: put this back in the above function now that it's proven: | |
1076 (defun variant-method (string) | |
1077 (let ((variant-buffer (get-buffer-create "*variant method buffer*")) | |
1078 (word-count 0) | |
1079 mixed-case-flag lower-case-flag upper-case-flag | |
1080 suffix-flag last-name-comma-flag | |
1081 comment-beg comment-end initial beg end | |
1082 ) | |
1083 (save-excursion | |
1084 (set-buffer variant-buffer) | |
1085 (buffer-flush-undo variant-buffer) | |
1086 (set-syntax-table address-text-syntax-table) | |
1087 (widen) | |
1088 (erase-buffer) | |
1089 (setq case-fold-search nil) | |
1090 | |
1091 (insert string) | |
1092 | |
1093 ;; Fix . used as space | |
1094 (goto-char (point-min)) | |
1095 (while (re-search-forward mail-bad-\.-pattern nil t) | |
1096 (replace-match "\\1 \\2")) | |
1097 | |
1098 ;; Skip any initial garbage. | |
1099 (goto-char (point-min)) | |
1100 (skip-chars-forward mail-non-begin-name-chars) | |
1101 (skip-chars-backward "& \"") | |
1102 (narrow-to-region (point) (point-max)) | |
1103 | |
1104 (catch 'stop | |
1105 (while t | |
1106 (skip-chars-forward mail-whitespace) | |
1107 | |
1108 (cond | |
1109 | |
1110 ;; Delete title | |
1111 ((and (eq word-count 0) | |
1112 (looking-at mail-full-name-prefixes)) | |
1113 (goto-char (match-end 0)) | |
1114 (narrow-to-region (point) (point-max))) | |
1115 | |
1116 ;; Stop after name suffix | |
1117 ((and (>= word-count 2) | |
1118 (looking-at mail-full-name-suffix-pattern)) | |
1119 (skip-chars-backward mail-whitespace) | |
1120 (setq suffix-flag (point)) | |
1121 (if (eq ?, (following-char)) | |
1122 (forward-char 1) | |
1123 (insert ?,)) | |
1124 ;; Enforce at least one space after comma | |
1125 (or (eq mail-space-char (following-char)) | |
1126 (insert mail-space-char)) | |
1127 (skip-chars-forward mail-whitespace) | |
1128 (cond ((memq (following-char) '(?j ?J ?s ?S)) | |
1129 (capitalize-word 1) | |
1130 (if (eq (following-char) ?.) | |
1131 (forward-char 1) | |
1132 (insert ?.))) | |
1133 (t | |
1134 (upcase-word 1))) | |
1135 (setq word-count (1+ word-count)) | |
1136 (throw 'stop t)) | |
1137 | |
1138 ;; Handle SCA names | |
1139 ((looking-at "MKA \\(.+\\)") ; "Mundanely Known As" | |
1140 (setq word-count 0) | |
1141 (goto-char (match-beginning 1)) | |
1142 (narrow-to-region (point) (point-max))) | |
1143 | |
1144 ;; Various stopping points | |
1145 ((or | |
1146 ;; Stop before ALL CAPS acronyms, if preceded by mixed-case or | |
1147 ;; lowercase words. Eg. XT-DEM. | |
1148 (and (>= word-count 2) | |
1149 (or mixed-case-flag lower-case-flag) | |
1150 (looking-at mail-weird-acronym-pattern) | |
1151 (not (looking-at mail-roman-numeral-pattern))) | |
1152 ;; Stop before 4-or-more letter lowercase words preceded by | |
1153 ;; mixed case or uppercase words. | |
1154 (and (>= word-count 2) | |
1155 (or upper-case-flag mixed-case-flag) | |
1156 (looking-at "[a-z][a-z][a-z][a-z]+\\b")) | |
1157 ;; Stop before trailing alternative address | |
1158 (looking-at mail-alternative-address-pattern) | |
1159 ;; Stop before trailing comment not introduced by comma | |
1160 (looking-at mail-trailing-comment-start-pattern) | |
1161 ;; Stop before telephone numbers | |
1162 (looking-at mail-telephone-extension-pattern)) | |
1163 (throw 'stop t)) | |
1164 | |
1165 ;; Check for initial last name followed by comma | |
1166 ((and (eq ?, (following-char)) | |
1167 (eq word-count 1)) | |
1168 (forward-char 1) | |
1169 (setq last-name-comma-flag t) | |
1170 (or (eq mail-space-char (following-char)) | |
1171 (insert mail-space-char))) | |
1172 | |
1173 ;; Stop before trailing comma-separated comment | |
1174 ((eq ?, (following-char)) | |
1175 (throw 'stop t)) | |
1176 | |
1177 ;; Delete parenthesized/quoted comment/nickname | |
1178 ((memq (following-char) '(?\( ?\{ ?\[ ?\" ?\' ?\`)) | |
1179 (setq comment-beg (point)) | |
1180 (set-syntax-table address-text-comment-syntax-table) | |
1181 (cond ((memq (following-char) '(?\' ?\`)) | |
1182 (if (eq ?\' (following-char)) | |
1183 (forward-char 1)) | |
1184 (or (search-forward "'" nil t) | |
1185 (delete-char 1))) | |
1186 (t | |
1187 (or (safe-move-sexp 1) | |
1188 (goto-char (point-max))))) | |
1189 (set-syntax-table address-text-syntax-table) | |
1190 (setq comment-end (point)) | |
1191 (cond | |
1192 ;; Handle case of entire name being quoted | |
1193 ((and (eq word-count 0) | |
1194 (looking-at " *\\'") | |
1195 (>= (- comment-end comment-beg) 2)) | |
1196 (narrow-to-region (1+ comment-beg) (1- comment-end)) | |
1197 (goto-char (point-min))) | |
1198 (t | |
1199 ;; Handle case of quoted initial | |
1200 (if (and (or (= 3 (- comment-end comment-beg)) | |
1201 (and (= 4 (- comment-end comment-beg)) | |
1202 (eq ?. (char-after (+ 2 comment-beg))))) | |
1203 (not (looking-at " *\\'"))) | |
1204 (setq initial (char-after (1+ comment-beg))) | |
1205 (setq initial nil)) | |
1206 (delete-region comment-beg comment-end) | |
1207 (if initial | |
1208 (insert initial ". "))))) | |
1209 | |
1210 ;; Delete ham radio call signs | |
1211 ((looking-at mail-ham-call-sign-pattern) | |
1212 (delete-region (match-beginning 0) (match-end 0))) | |
1213 | |
1214 ;; Handle & substitution | |
1215 ;; TODO: remember to disable middle initial guessing | |
1216 ((and (or (bobp) | |
1217 (eq mail-space-char (preceding-char))) | |
1218 (looking-at "&\\( \\|\\'\\)")) | |
1219 (delete-char 1) | |
1220 (capitalize-region | |
1221 (point) | |
1222 (progn | |
1223 (insert-buffer-substring canonicalization-buffer | |
1224 mbox-beg mbox-end) | |
1225 (point)))) | |
1226 | |
1227 ;; Fixup initials | |
1228 ((looking-at mail-initial-pattern) | |
1229 (or (eq (following-char) (upcase (following-char))) | |
1230 (setq lower-case-flag t)) | |
1231 (forward-char 1) | |
1232 (if (eq ?. (following-char)) | |
1233 (forward-char 1) | |
1234 (insert ?.)) | |
1235 (or (eq mail-space-char (following-char)) | |
1236 (insert mail-space-char)) | |
1237 (setq word-count (1+ word-count))) | |
1238 | |
1239 ;; Regular name words | |
1240 ((looking-at mail-name-pattern) | |
1241 (setq beg (point)) | |
1242 (setq end (match-end 0)) | |
1243 (set (if (re-search-forward "[a-z]" end t) | |
1244 (if (progn | |
1245 (goto-char beg) | |
1246 (re-search-forward "[A-Z]" end t)) | |
1247 'mixed-case-flag | |
1248 'lower-case-flag) | |
1249 'upper-case-flag) t) | |
1250 (goto-char end) | |
1251 (setq word-count (1+ word-count))) | |
1252 | |
1253 (t | |
1254 (throw 'stop t))))) | |
1255 | |
1256 (narrow-to-region (point-min) (point)) | |
1257 | |
1258 ;; Delete trailing word followed immediately by . | |
1259 (cond ((not suffix-flag) | |
1260 (goto-char (point-min)) | |
1261 (if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t) | |
1262 (narrow-to-region (point-min) (match-beginning 0))))) | |
1263 | |
1264 ;; If last name first put it at end (but before suffix) | |
1265 (cond (last-name-comma-flag | |
1266 (goto-char (point-min)) | |
1267 (search-forward ",") | |
1268 (setq end (1- (point))) | |
1269 (goto-char (or suffix-flag (point-max))) | |
1270 (or (eq mail-space-char (preceding-char)) | |
1271 (insert mail-space-char)) | |
1272 (insert-buffer-substring (current-buffer) (point-min) end) | |
1273 (narrow-to-region (1+ end) (point-max)))) | |
1274 | |
1275 (goto-char (point-max)) | |
1276 (skip-chars-backward mail-non-end-name-chars) | |
1277 (if (eq ?. (following-char)) | |
1278 (forward-char 1)) | |
1279 (narrow-to-region (point) | |
1280 (progn | |
1281 (goto-char (point-min)) | |
1282 (skip-chars-forward mail-non-begin-name-chars) | |
1283 (point))) | |
1284 | |
1285 ;; Compress whitespace | |
1286 (goto-char (point-min)) | |
1287 (while (re-search-forward "[ \t\n]+" nil t) | |
1288 (replace-match " ")) | |
1289 | |
1290 (buffer-substring (point-min) (point-max)) | |
1291 | |
1292 ))) | |
1293 | |
1294 ;; The country names are just in there for show right now, and because | |
1295 ;; Jamie thought it would be neat. They aren't used yet. | |
1296 | |
1297 ;; Keep in mind that the country abbreviations follow ISO-3166. There is | |
1298 ;; a U.S. FIPS that specifies a different set of two-letter country | |
1299 ;; abbreviations. | |
1300 | |
1301 ;; TODO: put this in its own obarray, instead of cluttering up the main | |
1302 ;; symbol table with junk. | |
1303 | |
1304 (mapcar | |
1305 (function | |
1306 (lambda (x) | |
1307 (if (symbolp x) | |
1308 (put x 'domain-name t) | |
1309 (put (car x) 'domain-name (nth 1 x))))) | |
1310 '((ag "Antigua") | |
1311 (ar "Argentina") ; Argentine Republic | |
1312 arpa ; Advanced Projects Research Agency | |
1313 (at "Austria") ; The Republic of _ | |
1314 (au "Australia") | |
1315 (bb "Barbados") | |
1316 (be "Belgium") ; The Kingdom of _ | |
1317 (bg "Bulgaria") | |
1318 bitnet ; Because It's Time NET | |
1319 (bo "Bolivia") ; Republic of _ | |
1320 (br "Brazil") ; The Federative Republic of _ | |
1321 (bs "Bahamas") | |
1322 (bz "Belize") | |
1323 (ca "Canada") | |
1324 (ch "Switzerland") ; The Swiss Confederation | |
1325 (cl "Chile") ; The Republic of _ | |
1326 (cn "China") ; The People's Republic of _ | |
1327 (co "Columbia") | |
1328 com ; Commercial | |
1329 (cr "Costa Rica") ; The Republic of _ | |
1330 (cs "Czechoslovakia") | |
1331 (de "Germany") | |
1332 (dk "Denmark") | |
1333 (dm "Dominica") | |
1334 (do "Dominican Republic") ; The _ | |
1335 (ec "Ecuador") ; The Republic of _ | |
1336 edu ; Educational | |
1337 (eg "Egypt") ; The Arab Republic of _ | |
1338 (es "Spain") ; The Kingdom of _ | |
1339 (fi "Finland") ; The Republic of _ | |
1340 (fj "Fiji") | |
1341 (fr "France") | |
1342 gov ; Government (U.S.A.) | |
1343 (gr "Greece") ; The Hellenic Republic | |
1344 (hk "Hong Kong") | |
1345 (hu "Hungary") ; The Hungarian People's Republic (???) | |
1346 (ie "Ireland") | |
1347 (il "Israel") ; The State of _ | |
1348 (in "India") ; The Republic of _ | |
1349 int ; something British, don't know what | |
1350 (is "Iceland") ; The Republic of _ | |
1351 (it "Italy") ; The Italian Republic | |
1352 (jm "Jamaica") | |
1353 (jp "Japan") | |
1354 (kn "St. Kitts and Nevis") | |
1355 (kr "South Korea") | |
1356 (lc "St. Lucia") | |
1357 (lk "Sri Lanka") ; The Democratic Socialist Republic of _ | |
1358 mil ; Military (U.S.A.) | |
1359 (mx "Mexico") ; The United Mexican States | |
1360 (my "Malaysia") ; changed to Myanmar???? | |
1361 (na "Namibia") | |
1362 nato ; North Atlantic Treaty Organization | |
1363 net ; Network | |
1364 (ni "Nicaragua") ; The Republic of _ | |
1365 (nl "Netherlands") ; The Kingdom of the _ | |
1366 (no "Norway") ; The Kingdom of _ | |
1367 (nz "New Zealand") | |
1368 org ; Organization | |
1369 (pe "Peru") | |
1370 (pg "Papua New Guinea") | |
1371 (ph "Philippines") ; The Republic of the _ | |
1372 (pl "Poland") | |
1373 (pr "Puerto Rico") | |
1374 (pt "Portugal") ; The Portugese Republic | |
1375 (py "Paraguay") | |
1376 (se "Sweden") ; The Kingdom of _ | |
1377 (sg "Singapore") ; The Republic of _ | |
1378 (sr "Suriname") | |
1379 (su "Soviet Union") | |
1380 (th "Thailand") ; The Kingdom of _ | |
1381 (tn "Tunisia") | |
1382 (tr "Turkey") ; The Republic of _ | |
1383 (tt "Trinidad and Tobago") | |
1384 (tw "Taiwan") | |
1385 (uk "United Kingdom") ; The _ of Great Britain | |
1386 unter-dom ; something German | |
1387 (us "U.S.A.") ; The United States of America | |
1388 uucp ; Unix to Unix CoPy | |
1389 (uy "Uruguay") ; The Eastern Republic of _ | |
1390 (vc "St. Vincent and the Grenadines") | |
1391 (ve "Venezuela") ; The Republic of _ | |
1392 (yu "Yugoslavia") ; The Socialist Federal Republic of _ | |
1393 ;; Also said to be Zambia ... | |
1394 (za "South Africa") ; The Republic of _ (why not Zaire???) | |
1395 (zw "Zimbabwe") ; Republic of _ | |
1396 )) | |
1397 ;; fipnet | |
1398 | |
1399 | |
1400 ;; Code for testing. | |
1401 | |
1402 (defun time-extract () | |
1403 (let (times list) | |
1404 (setq times (cons (current-time-string) times) | |
1405 list problem-address-alist) | |
1406 (while list | |
1407 (mail-extract-address-components (car (car list))) | |
1408 (setq list (cdr list))) | |
1409 (setq times (cons (current-time-string) times)) | |
1410 (nreverse times))) | |
1411 | |
1412 (defun test-extract (&optional starting-point) | |
1413 (interactive) | |
1414 (set-buffer (get-buffer-create "*Testing*")) | |
1415 (erase-buffer) | |
1416 (sit-for 0) | |
1417 (mapcar 'test-extract-internal | |
1418 (if starting-point | |
1419 (memq starting-point problem-address-alist) | |
1420 problem-address-alist))) | |
1421 | |
1422 (defvar failed-item) | |
1423 (defun test-extract-internal (item) | |
1424 (setq failed-item item) | |
1425 (let* ((address (car item)) | |
1426 (correct-name (nth 1 item)) | |
1427 (correct-canon (nth 2 item)) | |
1428 (result (mail-extract-address-components address)) | |
1429 (name (car result)) | |
1430 (canon (nth 1 result)) | |
1431 (name-correct (or (null correct-name) | |
1432 (string-equal (downcase correct-name) | |
1433 (downcase name)))) | |
1434 (canon-correct (or (null correct-canon) | |
1435 (string-equal correct-canon canon)))) | |
1436 (cond ((not (and name-correct canon-correct)) | |
1437 (pop-to-buffer "*Testing*") | |
1438 (select-window (get-buffer-window (current-buffer))) | |
1439 (goto-char (point-max)) | |
1440 (insert "Address: " address "\n") | |
1441 (if (not name-correct) | |
1442 (insert " Correct Name: [" correct-name | |
1443 "]\; Result: [" name "]\n")) | |
1444 (if (not canon-correct) | |
1445 (insert " Correct Canon: [" correct-canon | |
1446 "]\; Result: [" canon "]\n")) | |
1447 (insert "\n") | |
1448 (sit-for 0)))) | |
1449 (setq failed-item nil)) | |
1450 | |
1451 (defun test-continue-extract () | |
1452 (interactive) | |
1453 (test-extract failed-item)) | |
1454 | |
1455 | |
1456 ;; Assorted junk. | |
1457 | |
1458 ;; warsaw@nlm.nih.gov (A Bad Dude -- Barry Warsaw) | |
1459 | |
1460 ;;'(from | |
1461 ;; reply-to | |
1462 ;; return-path | |
1463 ;; x-uucp-from | |
1464 ;; sender | |
1465 ;; resent-from | |
1466 ;; resent-sender | |
1467 ;; resent-reply-to) | |
1468 | |
1469 ;;; mail-extr.el ends here |