comparison lisp/mail/mail-extr.el @ 809:8a0066235d56

Initial revision
author Eric S. Raymond <esr@snark.thyrsus.com>
date Fri, 17 Jul 1992 06:48:03 +0000
parents
children 20674ae6bf52
comparison
equal deleted inserted replaced
808:707866b2a190 809:8a0066235d56
1 ;;; mail-extr.el --- extract full name and address from RFC 822 mail header.
2
3 ;; Author: Joe Wells <jbw@cs.bu.edu>
4 ;; Last-Modified: 7 Apr 1992
5 ;; Version: 1.0
6 ;; Adapted-By: ESR
7 ;; Keywords: mail
8
9 ;; Copyright (C) 1992 Free Software Foundation, Inc.
10
11 ;; This file is part of GNU Emacs.
12
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation; either version 1, or (at your option)
16 ;; any later version.
17
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
22
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs; see the file COPYING. If not, write to
25 ;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 ;;; Commentary:
28
29 ;; Here is `mail-extr', a package for extracting full names and canonical
30 ;; addresses from RFC 822 mail headers. It is intended to be hooked into
31 ;; other Emacs Lisp packages that deal with RFC 822 format messages, such as
32 ;; Gnews, GNUS, RMAIL, MH-E, BBDB, VM, Supercite, etc. Thus, this release is
33 ;; mainly for Emacs Lisp developers.
34
35 ;; There are two main benefits:
36
37 ;; 1. Higher probability of getting the correct full name for a human than
38 ;; any other package I know of. (On the other hand, it will cheerfully
39 ;; mangle non-human names/comments.)
40 ;; 2. Address part is put in a canonical form.
41
42 ;; The interface is not yet carved in stone; please give me suggestions.
43
44 ;; I have an extensive test-case collection of funny addresses if you want to
45 ;; work with the code. Developing this code requires frequent testing to
46 ;; make sure you're not breaking functionality. I'm not posting the
47 ;; test-cases because they take over 100K.
48
49 ;; If you find an address that mail-extr fails on, please send it to me along
50 ;; with what you think the correct results should be. I do not consider it a
51 ;; bug if mail-extr mangles a comment that does not correspond to a real
52 ;; human full name, although I would prefer that mail-extr would return the
53 ;; comment as-is.
54
55 ;; Features:
56
57 ;; * Full name handling:
58
59 ;; * knows where full names can be found in an address.
60 ;; * avoids using empty comments and quoted text.
61 ;; * extracts full names from mailbox names.
62 ;; * recognizes common formats for comments after a full name.
63 ;; * puts a period and a space after each initial.
64 ;; * understands & referring to the mailbox name capitalized.
65 ;; * strips name prefixes like "Prof.", etc..
66 ;; * understands what characters can occur in names (not just letters).
67 ;; * figures out middle initial from mailbox name.
68 ;; * removes funny nicknames.
69 ;; * keeps suffixes such as Jr., Sr., III, etc.
70 ;; * reorders "Last, First" type names.
71
72 ;; * Address handling:
73
74 ;; * parses rfc822 quoted text, comments, and domain literals.
75 ;; * parses rfc822 multi-line headers.
76 ;; * does something reasonable with rfc822 GROUP addresses.
77 ;; * handles many rfc822 noncompliant and garbage addresses.
78 ;; * canonicalizes addresses (after stripping comments/phrases outside <>).
79 ;; * converts ! addresses into .UUCP and %-style addresses.
80 ;; * converts rfc822 ROUTE addresses to %-style addresses.
81 ;; * truncates %-style addresses at leftmost fully qualified domain name.
82 ;; * handles local relative precedence of ! vs. % and @ (untested).
83
84 ;; It does almost no string creation. It primarily uses the built-in
85 ;; parsing routines with the appropriate syntax tables. This should
86 ;; result in greater speed.
87
88 ;; TODO:
89
90 ;; * handle all test cases. (This will take forever.)
91 ;; * software to pick the correct header to use (eg., "Senders-Name:").
92 ;; * multiple addresses in the "From:" header (almost all of the necessary
93 ;; code is there).
94 ;; * flag to not treat `,' as an address separator. (This is useful when
95 ;; there is a "From:" header but no "Sender:" header, because then there
96 ;; is only allowed to be one address.)
97 ;; * mailbox name does not necessarily contain full name.
98 ;; * fixing capitalization when it's all upper or lowercase. (Hard!)
99 ;; * some of the domain literal handling is missing. (But I've never even
100 ;; seen one of these in a mail address, so maybe no big deal.)
101 ;; * arrange to have syntax tables byte-compiled.
102 ;; * speed hacks.
103 ;; * delete unused variables.
104 ;; * arrange for testing with different relative precedences of ! vs. @
105 ;; and %.
106 ;; * put variant-method back into mail-extract-address-components.
107 ;; * insert documentation strings!
108 ;; * handle X.400-gatewayed addresses according to RFC 1148.
109
110 ;;; Change Log:
111 ;;
112 ;; Mon Apr 6 23:59:09 1992 Joe Wells (jbw at bigbird.bu.edu)
113 ;;
114 ;; * Cleaned up some more. Release version 1.0 to world.
115 ;;
116 ;; Sun Apr 5 19:39:08 1992 Joe Wells (jbw at bigbird.bu.edu)
117 ;;
118 ;; * Cleaned up full name extraction extensively.
119 ;;
120 ;; Sun Feb 2 14:45:24 1992 Joe Wells (jbw at bigbird.bu.edu)
121 ;;
122 ;; * Total rewrite. Integrated mail-canonicalize-address into
123 ;; mail-extract-address-components. Now handles GROUP addresses more
124 ;; or less correctly. Better handling of lots of different cases.
125 ;;
126 ;; Fri Jun 14 19:39:50 1991
127 ;; * Created.
128
129 ;;; Code:
130
131 ;; Variable definitions.
132
133 (defvar mail-@-binds-tighter-than-! nil)
134
135 ;;----------------------------------------------------------------------
136 ;; what orderings are meaningful?????
137 ;;(defvar mail-operator-precedence-list '(?! ?% ?@))
138 ;; Right operand of a % or a @ must be a domain name, period. No other
139 ;; operators allowed. Left operand of a @ is an address relative to that
140 ;; site.
141
142 ;; Left operand of a ! must be a domain name. Right operand is an
143 ;; arbitrary address.
144 ;;----------------------------------------------------------------------
145
146 (defconst mail-space-char 32)
147
148 (defconst mail-whitespace " \t\n")
149
150 ;; Any character that can occur in a name in an RFC822 address.
151 ;; Yes, there are weird people with digits in their names.
152 (defconst mail-all-letters "A-Za-z---{|}'~0-9`.")
153
154 ;; Any character that can occur in a name, not counting characters that
155 ;; separate parts of a multipart name.
156 (defconst mail-all-letters-but-separators "A-Za-z{|}'~0-9`")
157
158 ;; Any character that can start a name
159 (defconst mail-first-letters "A-Za-z")
160
161 ;; Any character that can end a name.
162 (defconst mail-last-letters "A-Za-z`'.")
163
164 ;; Matches an initial not followed by both a period and a space.
165 (defconst mail-bad-initials-pattern
166 (format "\\(\\([^%s]\\|\\`\\)[%s]\\)\\(\\.\\([^ ]\\)\\| \\|\\([^%s .]\\)\\|\\'\\)"
167 mail-all-letters mail-first-letters mail-all-letters))
168
169 (defconst mail-non-name-chars (concat "^" mail-all-letters "."))
170
171 (defconst mail-non-begin-name-chars (concat "^" mail-first-letters))
172
173 (defconst mail-non-end-name-chars (concat "^" mail-last-letters))
174
175 ;; Matches periods used instead of spaces. Must not match the period
176 ;; following an initial.
177 (defconst mail-bad-\.-pattern
178 (format "\\([%s][%s]\\)\\.+\\([%s]\\)"
179 mail-all-letters mail-last-letters mail-first-letters))
180
181 ;; Matches an embedded or leading nickname that should be removed.
182 (defconst mail-nickname-pattern
183 (format "\\([ .]\\|\\`\\)[\"'`\[\(]\\([ .%s]+\\)[\]\"'\)] "
184 mail-all-letters))
185
186 ;; Matches a leading title that is not part of the name (does not
187 ;; contribute to uniquely identifying the person).
188 (defconst mail-full-name-prefixes
189 '"\\` *\\(Prof\\|Dr\\|Mrs?\\|Rev\\|Rabbi\\|SysOp\\|LCDR\\)\\.? ")
190
191 ;; Matches the occurrence of a generational name suffix, and the last
192 ;; character of the preceding name.
193 (defconst mail-full-name-suffix-pattern
194 (format
195 "\\(,? ?\\([JjSs]r\\.?\\|V?I+V?\\)\\)\\([^%s]\\([^%s]\\|\\'\\)\\|\\'\\)"
196 mail-all-letters mail-all-letters))
197
198 (defconst mail-roman-numeral-pattern
199 "V?I+V?\\b")
200
201 ;; Matches a trailing uppercase (with other characters possible) acronym.
202 ;; Must not match a trailing uppercase last name or trailing initial
203 (defconst mail-weird-acronym-pattern "\\([A-Z]+[-_/]\\|[A-Z][A-Z][A-Z]?\\b\\)")
204
205 ;; Matches a mixed-case or lowercase name (not an initial).
206 (defconst mail-mixed-case-name-pattern
207 (format
208 "\\b\\([a-z][%s]*[%s]\\|[%s][%s]*[a-z][%s]*[%s]\\|[%s][%s]*[a-z]\\)"
209 mail-all-letters mail-last-letters
210 mail-first-letters mail-all-letters mail-all-letters mail-last-letters
211 mail-first-letters mail-all-letters))
212
213 ;; Matches a trailing alternative address.
214 (defconst mail-alternative-address-pattern "[a-zA-Z.]+[!@][a-zA-Z.]")
215
216 ;; Matches a variety of trailing comments not including comma-delimited
217 ;; comments.
218 (defconst mail-trailing-comment-start-pattern " [-{]\\|--\\|[+@#></\;]")
219
220 ;; Matches a name (not an initial).
221 ;; This doesn't force a word boundary at the end because sometimes a
222 ;; comment is separated by a `-' with no preceding space.
223 (defconst mail-name-pattern
224 (format
225 "\\b[%s][%s]*[%s]"
226 mail-first-letters mail-all-letters mail-last-letters))
227
228 (defconst mail-initial-pattern
229 (format "\\b[%s]\\([. ]\\|\\b\\)" mail-first-letters))
230
231 ;; Matches a single name before a comma.
232 (defconst mail-last-name-first-pattern
233 (concat "\\`" mail-name-pattern ","))
234
235 ;; Matches telephone extensions.
236 (defconst mail-telephone-extension-pattern
237 "\\(\\([Ee]xt\\|[Tt]el\\|[Xx]\\).?\\)? *\\+?[0-9][- 0-9]+")
238
239 ;; Matches ham radio call signs.
240 (defconst mail-ham-call-sign-pattern
241 "\\b[A-Z]+[0-9][A-Z0-9]*")
242
243 ;; Matches normal single-part name
244 (defconst mail-normal-name-pattern
245 (format
246 "\\b[%s][%s]+[%s]"
247 mail-first-letters mail-all-letters-but-separators mail-last-letters))
248
249 ;; Matches normal two names with missing middle initial
250 (defconst mail-two-name-pattern
251 (concat "\\`\\(" mail-normal-name-pattern
252 "\\|" mail-initial-pattern
253 "\\) +\\(" mail-normal-name-pattern "\\)\\(,\\|\\'\\)"))
254
255 (defvar address-syntax-table (make-syntax-table))
256 (defvar address-comment-syntax-table (make-syntax-table))
257 (defvar address-domain-literal-syntax-table (make-syntax-table))
258 (defvar address-text-comment-syntax-table (make-syntax-table))
259 (defvar address-text-syntax-table (make-syntax-table))
260 (mapcar
261 (function
262 (lambda (pair)
263 (let ((syntax-table (symbol-value (car pair))))
264 (mapcar
265 (function
266 (lambda (item)
267 (if (eq 2 (length item))
268 (modify-syntax-entry (car item) (car (cdr item)) syntax-table)
269 (let ((char (car item))
270 (bound (car (cdr item)))
271 (syntax (car (cdr (cdr item)))))
272 (while (<= char bound)
273 (modify-syntax-entry char syntax syntax-table)
274 (setq char (1+ char)))))))
275 (cdr pair)))))
276 '((address-syntax-table
277 (0 31 "w") ;control characters
278 (32 " ") ;SPC
279 (?! ?~ "w") ;printable characters
280 (127 "w") ;DEL
281 (128 255 "w") ;high-bit-on characters
282 (?\t " ")
283 (?\r " ")
284 (?\n " ")
285 (?\( ".")
286 (?\) ".")
287 (?< ".")
288 (?> ".")
289 (?@ ".")
290 (?, ".")
291 (?\; ".")
292 (?: ".")
293 (?\\ "\\")
294 (?\" "\"")
295 (?. ".")
296 (?\[ ".")
297 (?\] ".")
298 ;; % and ! aren't RFC822 characters, but it is convenient to pretend
299 (?% ".")
300 (?! ".")
301 )
302 (address-comment-syntax-table
303 (0 255 "w")
304 (?\( "\(\)")
305 (?\) "\)\(")
306 (?\\ "\\"))
307 (address-domain-literal-syntax-table
308 (0 255 "w")
309 (?\[ "\(\]") ;??????
310 (?\] "\)\[") ;??????
311 (?\\ "\\"))
312 (address-text-comment-syntax-table
313 (0 255 "w")
314 (?\( "\(\)")
315 (?\) "\)\(")
316 (?\[ "\(\]")
317 (?\] "\)\[")
318 (?\{ "\(\}")
319 (?\} "\)\{")
320 (?\\ "\\")
321 (?\" "\"")
322 ;; (?\' "\)\`")
323 ;; (?\` "\(\'")
324 )
325 (address-text-syntax-table
326 (0 255 ".")
327 (?A ?Z "w")
328 (?a ?z "w")
329 (?- "w")
330 (?\} "w")
331 (?\{ "w")
332 (?| "w")
333 (?\' "w")
334 (?~ "w")
335 (?0 ?9 "w"))
336 ))
337
338
339 ;; Utility functions and macros.
340
341 (defmacro undo-backslash-quoting (beg end)
342 (`(save-excursion
343 (save-restriction
344 (narrow-to-region (, beg) (, end))
345 (goto-char (point-min))
346 ;; undo \ quoting
347 (while (re-search-forward "\\\\\\(.\\)" nil t)
348 (replace-match "\\1")
349 ;; CHECK: does this leave point after the replacement?
350 )))))
351
352 (defmacro mail-nuke-char-at (pos)
353 (` (save-excursion
354 (goto-char (, pos))
355 (delete-char 1)
356 (insert mail-space-char))))
357
358 (defmacro mail-nuke-elements-outside-range (list-symbol beg-symbol end-symbol
359 &optional no-replace)
360 (` (progn
361 (setq temp (, list-symbol))
362 (while temp
363 (cond ((or (> (car temp) (, end-symbol))
364 (< (car temp) (, beg-symbol)))
365 (, (or no-replace
366 (` (mail-nuke-char-at (car temp)))))
367 (setcar temp nil)))
368 (setq temp (cdr temp)))
369 (setq (, list-symbol) (delq nil (, list-symbol))))))
370
371 (defun mail-demarkerize (marker)
372 (and marker
373 (if (markerp marker)
374 (let ((temp (marker-position marker)))
375 (set-marker marker nil)
376 temp)
377 marker)))
378
379 (defun mail-markerize (pos)
380 (and pos
381 (if (markerp pos)
382 pos
383 (copy-marker pos))))
384
385 (defmacro mail-last-element (list)
386 "Return last element of LIST."
387 (` (let ((list (, list)))
388 (while (not (null (cdr list)))
389 (setq list (cdr list)))
390 (car list))))
391
392 (defmacro safe-move-sexp (arg)
393 "Safely skip over one balanced sexp, if there is one. Return t if success."
394 (` (condition-case error
395 (progn
396 (goto-char (scan-sexps (point) (, arg)))
397 t)
398 (error
399 (if (string-equal (nth 1 error) "Unbalanced parentheses")
400 nil
401 (while t
402 (signal (car error) (cdr error))))))))
403
404
405 ;; The main function to grind addresses
406
407 (defun mail-extract-address-components (address)
408 "Given an rfc 822 ADDRESS, extract full name and canonical address.
409 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS)."
410 (let ((canonicalization-buffer (get-buffer-create "*canonical address*"))
411 (extraction-buffer (get-buffer-create "*extract address components*"))
412 (foo 'bar)
413 char
414 multiple-addresses
415 <-pos >-pos @-pos :-pos ,-pos !-pos %-pos \;-pos
416 group-:-pos group-\;-pos route-addr-:-pos
417 record-pos-symbol
418 first-real-pos last-real-pos
419 phrase-beg phrase-end
420 comment-beg comment-end
421 quote-beg quote-end
422 atom-beg atom-end
423 mbox-beg mbox-end
424 \.-ends-name
425 temp
426 name-suffix
427 saved-point
428 fi mi li
429 saved-%-pos saved-!-pos saved-@-pos
430 domain-pos \.-pos insert-point)
431
432 (save-excursion
433 (set-buffer extraction-buffer)
434 (buffer-flush-undo extraction-buffer)
435 (set-syntax-table address-syntax-table)
436 (widen)
437 (erase-buffer)
438 (setq case-fold-search nil)
439
440 ;; Insert extra space at beginning to allow later replacement with <
441 ;; without having to move markers.
442 (insert mail-space-char address)
443
444 ;; stolen from rfc822.el
445 ;; Unfold multiple lines.
446 (goto-char (point-min))
447 (while (re-search-forward "\\([^\\]\\(\\\\\\\\\\)*\\)\n[ \t]" nil t)
448 (replace-match "\\1 " t))
449
450 ;; first pass grabs useful information about address
451 (goto-char (point-min))
452 (while (progn
453 (skip-chars-forward mail-whitespace)
454 (not (eobp)))
455 (setq char (char-after (point)))
456 (or first-real-pos
457 (if (not (eq char ?\())
458 (setq first-real-pos (point))))
459 (cond
460 ;; comment
461 ((eq char ?\()
462 (set-syntax-table address-comment-syntax-table)
463 ;; only record the first non-empty comment's position
464 (if (and (not comment-beg)
465 (save-excursion
466 (forward-char 1)
467 (skip-chars-forward mail-whitespace)
468 (not (eq ?\) (char-after (point))))))
469 (setq comment-beg (point)))
470 ;; TODO: don't record if unbalanced
471 (or (safe-move-sexp 1)
472 (forward-char 1))
473 (set-syntax-table address-syntax-table)
474 (if (and comment-beg
475 (not comment-end))
476 (setq comment-end (point))))
477 ;; quoted text
478 ((eq char ?\")
479 ;; only record the first non-empty quote's position
480 (if (and (not quote-beg)
481 (save-excursion
482 (forward-char 1)
483 (skip-chars-forward mail-whitespace)
484 (not (eq ?\" (char-after (point))))))
485 (setq quote-beg (point)))
486 ;; TODO: don't record if unbalanced
487 (or (safe-move-sexp 1)
488 (forward-char 1))
489 (if (and quote-beg
490 (not quote-end))
491 (setq quote-end (point))))
492 ;; domain literals
493 ((eq char ?\[)
494 (set-syntax-table address-domain-literal-syntax-table)
495 (or (safe-move-sexp 1)
496 (forward-char 1))
497 (set-syntax-table address-syntax-table))
498 ;; commas delimit addresses when outside < > pairs.
499 ((and (eq char ?,)
500 (or (null <-pos)
501 (and >-pos
502 ;; handle weird munged addresses
503 (> (mail-last-element <-pos) (car >-pos)))))
504 (setq multiple-addresses t)
505 (delete-char 1)
506 (narrow-to-region (point-min) (point)))
507 ;; record the position of various interesting chars, determine
508 ;; legality later.
509 ((setq record-pos-symbol
510 (cdr (assq char
511 '((?< . <-pos) (?> . >-pos) (?@ . @-pos)
512 (?: . :-pos) (?, . ,-pos) (?! . !-pos)
513 (?% . %-pos) (?\; . \;-pos)))))
514 (set record-pos-symbol
515 (cons (point) (symbol-value record-pos-symbol)))
516 (forward-char 1))
517 ((eq char ?.)
518 (forward-char 1))
519 ((memq char '(
520 ;; comment terminator illegal
521 ?\)
522 ;; domain literal terminator illegal
523 ?\]
524 ;; \ allowed only within quoted strings,
525 ;; domain literals, and comments
526 ?\\
527 ))
528 (mail-nuke-char-at (point))
529 (forward-char 1))
530 (t
531 (forward-word 1)))
532 (or (eq char ?\()
533 (setq last-real-pos (point))))
534
535 ;; Use only the leftmost <, if any. Replace all others with spaces.
536 (while (cdr <-pos)
537 (mail-nuke-char-at (car <-pos))
538 (setq <-pos (cdr <-pos)))
539
540 ;; Use only the rightmost >, if any. Replace all others with spaces.
541 (while (cdr >-pos)
542 (mail-nuke-char-at (nth 1 >-pos))
543 (setcdr >-pos (nthcdr 2 >-pos)))
544
545 ;; If multiple @s and a :, but no < and >, insert around buffer.
546 ;; This commonly happens on the UUCP "From " line. Ugh.
547 (cond ((and (> (length @-pos) 1)
548 :-pos ;TODO: check if between @s
549 (not <-pos))
550 (goto-char (point-min))
551 (delete-char 1)
552 (setq <-pos (list (point)))
553 (insert ?<)))
554
555 ;; If < but no >, insert > in rightmost possible position
556 (cond ((and <-pos
557 (null >-pos))
558 (goto-char (point-max))
559 (setq >-pos (list (point)))
560 (insert ?>)))
561
562 ;; If > but no <, replace > with space.
563 (cond ((and >-pos
564 (null <-pos))
565 (mail-nuke-char-at (car >-pos))
566 (setq >-pos nil)))
567
568 ;; Turn >-pos and <-pos into non-lists
569 (setq >-pos (car >-pos)
570 <-pos (car <-pos))
571
572 ;; Trim other punctuation lists of items outside < > pair to handle
573 ;; stupid MTAs.
574 (cond (<-pos ; don't need to check >-pos also
575 ;; handle bozo software that violates RFC 822 by sticking
576 ;; punctuation marks outside of a < > pair
577 (mail-nuke-elements-outside-range @-pos <-pos >-pos t)
578 ;; RFC 822 says nothing about these two outside < >, but
579 ;; remove those positions from the lists to make things
580 ;; easier.
581 (mail-nuke-elements-outside-range !-pos <-pos >-pos t)
582 (mail-nuke-elements-outside-range %-pos <-pos >-pos t)))
583
584 ;; Check for : that indicates GROUP list and for : part of
585 ;; ROUTE-ADDR spec.
586 ;; Can't possibly be more than two :. Nuke any extra.
587 (while :-pos
588 (setq temp (car :-pos)
589 :-pos (cdr :-pos))
590 (cond ((and <-pos >-pos
591 (> temp <-pos)
592 (< temp >-pos))
593 (if (or route-addr-:-pos
594 (< (length @-pos) 2)
595 (> temp (car @-pos))
596 (< temp (nth 1 @-pos)))
597 (mail-nuke-char-at temp)
598 (setq route-addr-:-pos temp)))
599 ((or (not <-pos)
600 (and <-pos
601 (< temp <-pos)))
602 (setq group-:-pos temp))))
603
604 ;; Nuke any ; that is in or to the left of a < > pair or to the left
605 ;; of a GROUP starting :. Also, there may only be one ;.
606 (while \;-pos
607 (setq temp (car \;-pos)
608 \;-pos (cdr \;-pos))
609 (cond ((and <-pos >-pos
610 (> temp <-pos)
611 (< temp >-pos))
612 (mail-nuke-char-at temp))
613 ((and (or (not group-:-pos)
614 (> temp group-:-pos))
615 (not group-\;-pos))
616 (setq group-\;-pos temp))))
617
618 ;; Handle junk like ";@host.company.dom" that sendmail adds.
619 ;; **** should I remember comment positions?
620 (and group-\;-pos
621 ;; this is fine for now
622 (mail-nuke-elements-outside-range !-pos group-:-pos group-\;-pos t)
623 (mail-nuke-elements-outside-range @-pos group-:-pos group-\;-pos t)
624 (mail-nuke-elements-outside-range %-pos group-:-pos group-\;-pos t)
625 (mail-nuke-elements-outside-range ,-pos group-:-pos group-\;-pos t)
626 (and last-real-pos
627 (> last-real-pos (1+ group-\;-pos))
628 (setq last-real-pos (1+ group-\;-pos)))
629 (and comment-end
630 (> comment-end group-\;-pos)
631 (setq comment-end nil
632 comment-beg nil))
633 (and quote-end
634 (> quote-end group-\;-pos)
635 (setq quote-end nil
636 quote-beg nil))
637 (narrow-to-region (point-min) group-\;-pos))
638
639 ;; Any commas must be between < and : of ROUTE-ADDR. Nuke any
640 ;; others.
641 ;; Hell, go ahead an nuke all of the commas.
642 ;; **** This will cause problems when we start handling commas in
643 ;; the PHRASE part .... no it won't ... yes it will ... ?????
644 (mail-nuke-elements-outside-range ,-pos 1 1)
645
646 ;; can only have multiple @s inside < >. The fact that some MTAs
647 ;; put de-bracketed ROUTE-ADDRs in the UUCP-style "From " line is
648 ;; handled above.
649
650 ;; Locate PHRASE part of ROUTE-ADDR.
651 (cond (<-pos
652 (goto-char <-pos)
653 (skip-chars-backward mail-whitespace)
654 (setq phrase-end (point))
655 (goto-char (or ;;group-:-pos
656 (point-min)))
657 (skip-chars-forward mail-whitespace)
658 (if (< (point) phrase-end)
659 (setq phrase-beg (point))
660 (setq phrase-end nil))))
661
662 ;; handle ROUTE-ADDRS with real ROUTEs.
663 ;; If there are multiple @s, then we assume ROUTE-ADDR syntax, and
664 ;; any % or ! must be semantically meaningless.
665 ;; TODO: do this processing into canonicalization buffer
666 (cond (route-addr-:-pos
667 (setq !-pos nil
668 %-pos nil
669 >-pos (copy-marker >-pos)
670 route-addr-:-pos (copy-marker route-addr-:-pos))
671 (goto-char >-pos)
672 (insert-before-markers ?X)
673 (goto-char (car @-pos))
674 (while (setq @-pos (cdr @-pos))
675 (delete-char 1)
676 (setq %-pos (cons (point-marker) %-pos))
677 (insert "%")
678 (goto-char (1- >-pos))
679 (save-excursion
680 (insert-buffer-substring extraction-buffer
681 (car @-pos) route-addr-:-pos)
682 (delete-region (car @-pos) route-addr-:-pos))
683 (or (cdr @-pos)
684 (setq saved-@-pos (list (point)))))
685 (setq @-pos saved-@-pos)
686 (goto-char >-pos)
687 (delete-char -1)
688 (mail-nuke-char-at route-addr-:-pos)
689 (mail-demarkerize route-addr-:-pos)
690 (setq route-addr-:-pos nil
691 >-pos (mail-demarkerize >-pos)
692 %-pos (mapcar 'mail-demarkerize %-pos))))
693
694 ;; de-listify @-pos
695 (setq @-pos (car @-pos))
696
697 ;; TODO: remove comments in the middle of an address
698
699 (set-buffer canonicalization-buffer)
700
701 (buffer-flush-undo canonicalization-buffer)
702 (set-syntax-table address-syntax-table)
703 (setq case-fold-search nil)
704
705 (widen)
706 (erase-buffer)
707 (insert-buffer-substring extraction-buffer)
708
709 (if <-pos
710 (narrow-to-region (progn
711 (goto-char (1+ <-pos))
712 (skip-chars-forward mail-whitespace)
713 (point))
714 >-pos)
715 ;; ****** Oh no! What if the address is completely empty!
716 (narrow-to-region first-real-pos last-real-pos))
717
718 (and @-pos %-pos
719 (mail-nuke-elements-outside-range %-pos (point-min) @-pos))
720 (and %-pos !-pos
721 (mail-nuke-elements-outside-range !-pos (point-min) (car %-pos)))
722 (and @-pos !-pos (not %-pos)
723 (mail-nuke-elements-outside-range !-pos (point-min) @-pos))
724
725 ;; Error condition:?? (and %-pos (not @-pos))
726
727 (cond (!-pos
728 ;; **** I don't understand this save-restriction and the
729 ;; narrow-to-region inside it. Why did I do that?
730 (save-restriction
731 (cond ((and @-pos
732 mail-@-binds-tighter-than-!)
733 (goto-char @-pos)
734 (setq %-pos (cons (point) %-pos)
735 @-pos nil)
736 (delete-char 1)
737 (insert "%")
738 (setq insert-point (point-max)))
739 (mail-@-binds-tighter-than-!
740 (setq insert-point (point-max)))
741 (%-pos
742 (setq insert-point (mail-last-element %-pos)
743 saved-%-pos (mapcar 'mail-markerize %-pos)
744 %-pos nil
745 @-pos (mail-markerize @-pos)))
746 (@-pos
747 (setq insert-point @-pos)
748 (setq @-pos (mail-markerize @-pos)))
749 (t
750 (setq insert-point (point-max))))
751 (narrow-to-region (point-min) insert-point)
752 (setq saved-!-pos (car !-pos))
753 (while !-pos
754 (goto-char (point-max))
755 (cond ((and (not @-pos)
756 (not (cdr !-pos)))
757 (setq @-pos (point))
758 (insert-before-markers "@ "))
759 (t
760 (setq %-pos (cons (point) %-pos))
761 (insert-before-markers "% ")))
762 (backward-char 1)
763 (insert-buffer-substring
764 (current-buffer)
765 (if (nth 1 !-pos)
766 (1+ (nth 1 !-pos))
767 (point-min))
768 (car !-pos))
769 (delete-char 1)
770 (or (save-excursion
771 (safe-move-sexp -1)
772 (skip-chars-backward mail-whitespace)
773 (eq ?. (preceding-char)))
774 (insert-before-markers
775 (if (save-excursion
776 (skip-chars-backward mail-whitespace)
777 (eq ?. (preceding-char)))
778 ""
779 ".")
780 "uucp"))
781 (setq !-pos (cdr !-pos))))
782 (and saved-%-pos
783 (setq %-pos (append (mapcar 'mail-demarkerize saved-%-pos)
784 %-pos)))
785 (setq @-pos (mail-demarkerize @-pos))
786 (narrow-to-region (1+ saved-!-pos) (point-max))))
787 (cond ((and %-pos
788 (not @-pos))
789 (goto-char (car %-pos))
790 (delete-char 1)
791 (setq @-pos (point))
792 (insert "@")
793 (setq %-pos (cdr %-pos))))
794 (setq %-pos (nreverse %-pos))
795 ;; RFC 1034 doesn't approve of this, oh well:
796 (downcase-region (or (car %-pos) @-pos (point-max)) (point-max))
797 (cond (%-pos ; implies @-pos valid
798 (setq temp %-pos)
799 (catch 'truncated
800 (while temp
801 (goto-char (or (nth 1 temp)
802 @-pos))
803 (skip-chars-backward mail-whitespace)
804 (save-excursion
805 (safe-move-sexp -1)
806 (setq domain-pos (point))
807 (skip-chars-backward mail-whitespace)
808 (setq \.-pos (eq ?. (preceding-char))))
809 (cond ((and \.-pos
810 (get
811 (intern
812 (buffer-substring domain-pos (point)))
813 'domain-name))
814 (narrow-to-region (point-min) (point))
815 (goto-char (car temp))
816 (delete-char 1)
817 (setq @-pos (point))
818 (setcdr temp nil)
819 (setq %-pos (delq @-pos %-pos))
820 (insert "@")
821 (throw 'truncated t)))
822 (setq temp (cdr temp))))))
823 (setq mbox-beg (point-min)
824 mbox-end (if %-pos (car %-pos)
825 (or @-pos
826 (point-max))))
827
828 ;; Done canonicalizing address.
829
830 (set-buffer extraction-buffer)
831
832 ;; Find the full name
833
834 (cond ((and phrase-beg
835 (eq quote-beg phrase-beg)
836 (<= quote-end phrase-end))
837 (narrow-to-region (1+ quote-beg) (1- quote-end))
838 (undo-backslash-quoting (point-min) (point-max)))
839 (phrase-beg
840 (narrow-to-region phrase-beg phrase-end))
841 (comment-beg
842 (narrow-to-region (1+ comment-beg) (1- comment-end))
843 (undo-backslash-quoting (point-min) (point-max)))
844 (t
845 ;; *** Work in canon buffer instead? No, can't. Hmm.
846 (delete-region (point-min) (point-max))
847 (insert-buffer-substring canonicalization-buffer
848 mbox-beg mbox-end)
849 (goto-char (point-min))
850 (setq \.-ends-name (search-forward "_" nil t))
851 (goto-char (point-min))
852 (while (progn
853 (skip-chars-forward mail-whitespace)
854 (not (eobp)))
855 (setq char (char-after (point)))
856 (cond
857 ((eq char ?\")
858 (setq quote-beg (point))
859 (or (safe-move-sexp 1)
860 ;; TODO: handle this error condition!!!!!
861 (forward-char 1))
862 ;; take into account deletions
863 (setq quote-end (- (point) 2))
864 (save-excursion
865 (backward-char 1)
866 (delete-char 1)
867 (goto-char quote-beg)
868 (delete-char 1))
869 (undo-backslash-quoting quote-beg quote-end)
870 (or (eq mail-space-char (char-after (point)))
871 (insert " "))
872 (setq \.-ends-name t))
873 ((eq char ?.)
874 (if (eq (char-after (1+ (point))) ?_)
875 (progn
876 (forward-char 1)
877 (delete-char 1)
878 (insert mail-space-char))
879 (if \.-ends-name
880 (narrow-to-region (point-min) (point))
881 (delete-char 1)
882 (insert " "))))
883 ((memq (char-syntax char) '(?. ?\\))
884 (delete-char 1)
885 (insert " "))
886 (t
887 (setq atom-beg (point))
888 (forward-word 1)
889 (setq atom-end (point))
890 (save-restriction
891 (narrow-to-region atom-beg atom-end)
892 (goto-char (point-min))
893 (while (re-search-forward "\\([^_]+\\)_" nil t)
894 (replace-match "\\1 "))
895 (goto-char (point-max))))))))
896
897 (set-syntax-table address-text-syntax-table)
898
899 (setq xxx (variant-method (buffer-string)))
900 (delete-region (point-min) (point-max))
901 (insert xxx)
902 (goto-char (point-min))
903
904 ;; ;; Compress whitespace
905 ;; (goto-char (point-min))
906 ;; (while (re-search-forward "[ \t\n]+" nil t)
907 ;; (replace-match " "))
908 ;;
909 ;; ;; Fix . used as space
910 ;; (goto-char (point-min))
911 ;; (while (re-search-forward mail-bad-\.-pattern nil t)
912 ;; (replace-match "\\1 \\2"))
913 ;;
914 ;; ;; Delete trailing parenthesized comment
915 ;; (goto-char (point-max))
916 ;; (skip-chars-backward mail-whitespace)
917 ;; (cond ((memq (char-after (1- (point))) '(?\) ?\} ?\]))
918 ;; (setq comment-end (point))
919 ;; (set-syntax-table address-text-comment-syntax-table)
920 ;; (or (safe-move-sexp -1)
921 ;; (backward-char 1))
922 ;; (set-syntax-table address-text-syntax-table)
923 ;; (setq comment-beg (point))
924 ;; (skip-chars-backward mail-whitespace)
925 ;; (if (bobp)
926 ;; (narrow-to-region (1+ comment-beg) (1- comment-end))
927 ;; (narrow-to-region (point-min) (point)))))
928 ;;
929 ;; ;; Find, save, and delete any name suffix
930 ;; ;; *** Broken!
931 ;; (goto-char (point-min))
932 ;; (cond ((re-search-forward mail-full-name-suffix-pattern nil t)
933 ;; (setq name-suffix (buffer-substring (match-beginning 3)
934 ;; (match-end 3)))
935 ;; (replace-match "\\1 \\4")))
936 ;;
937 ;; ;; Delete ALL CAPS words and after, if preceded by mixed-case or
938 ;; ;; lowercase words. Eg. XT-DEM.
939 ;; (goto-char (point-min))
940 ;; ;; ## This will lose on something like "SMITH MAX".
941 ;; ;; ## maybe it should be
942 ;; ;; ## " \\([A-Z]+[-_/][A-Z]+\\|[A-Z][A-Z][A-Z]\\)\\b.*[^A-Z \t]"
943 ;; ;; ## that is, three-letter-upper-case-word with non-upper-case
944 ;; ;; ## characters following it.
945 ;; (if (re-search-forward mail-mixed-case-name-pattern nil t)
946 ;; (if (re-search-forward mail-weird-acronym-pattern nil t)
947 ;; (narrow-to-region (point-min) (match-beginning 0))))
948 ;;
949 ;; ;; Delete trailing alternative address
950 ;; (goto-char (point-min))
951 ;; (if (re-search-forward mail-alternative-address-pattern nil t)
952 ;; (narrow-to-region (point-min) (match-beginning 0)))
953 ;;
954 ;; ;; Delete trailing comment
955 ;; (goto-char (point-min))
956 ;; (if (re-search-forward mail-trailing-comment-start-pattern nil t)
957 ;; (or (progn
958 ;; (goto-char (match-beginning 0))
959 ;; (skip-chars-backward mail-whitespace)
960 ;; (bobp))
961 ;; (narrow-to-region (point-min) (match-beginning 0))))
962 ;;
963 ;; ;; Delete trailing comma-separated comment
964 ;; (goto-char (point-min))
965 ;; ;; ## doesn't this break "Smith, John"? Yes.
966 ;; (re-search-forward mail-last-name-first-pattern nil t)
967 ;; (while (search-forward "," nil t)
968 ;; (or (save-excursion
969 ;; (backward-char 2)
970 ;; (looking-at mail-full-name-suffix-pattern))
971 ;; (narrow-to-region (point-min) (1- (point)))))
972 ;;
973 ;; ;; Delete telephone numbers and ham radio call signs
974 ;; (goto-char (point-min))
975 ;; (if (re-search-forward mail-telephone-extension-pattern nil t)
976 ;; (narrow-to-region (point-min) (match-beginning 0)))
977 ;; (goto-char (point-min))
978 ;; (if (re-search-forward mail-ham-call-sign-pattern nil t)
979 ;; (if (eq (match-beginning 0) (point-min))
980 ;; (narrow-to-region (match-end 0) (point-max))
981 ;; (narrow-to-region (point-min) (match-beginning 0))))
982 ;;
983 ;; ;; Delete trailing word followed immediately by .
984 ;; (goto-char (point-min))
985 ;; ;; ## what's this for? doesn't it mess up "Public, Harry Q."? No.
986 ;; (if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t)
987 ;; (narrow-to-region (point-min) (match-beginning 0)))
988 ;;
989 ;; ;; Handle & substitution
990 ;; ;; TODO: remember to disable middle initial guessing
991 ;; (goto-char (point-min))
992 ;; (cond ((re-search-forward "\\( \\|\\`\\)&\\( \\|\\'\\)" nil t)
993 ;; (goto-char (match-end 1))
994 ;; (delete-char 1)
995 ;; (capitalize-region
996 ;; (point)
997 ;; (progn
998 ;; (insert-buffer-substring canonicalization-buffer
999 ;; mbox-beg mbox-end)
1000 ;; (point)))))
1001 ;;
1002 ;; ;; Delete nickname
1003 ;; (goto-char (point-min))
1004 ;; (if (re-search-forward mail-nickname-pattern nil t)
1005 ;; (replace-match (if (eq (match-beginning 2) (1- (match-end 2)))
1006 ;; " \\2 "
1007 ;; " ")))
1008 ;;
1009 ;; ;; Fixup initials
1010 ;; (while (progn
1011 ;; (goto-char (point-min))
1012 ;; (re-search-forward mail-bad-initials-pattern nil t))
1013 ;; (replace-match
1014 ;; (if (match-beginning 4)
1015 ;; "\\1. \\4"
1016 ;; (if (match-beginning 5)
1017 ;; "\\1. \\5"
1018 ;; "\\1. "))))
1019 ;;
1020 ;; ;; Delete title
1021 ;; (goto-char (point-min))
1022 ;; (if (re-search-forward mail-full-name-prefixes nil t)
1023 ;; (narrow-to-region (point) (point-max)))
1024 ;;
1025 ;; ;; Delete trailing and preceding non-name characters
1026 ;; (goto-char (point-min))
1027 ;; (skip-chars-forward mail-non-begin-name-chars)
1028 ;; (narrow-to-region (point) (point-max))
1029 ;; (goto-char (point-max))
1030 ;; (skip-chars-backward mail-non-end-name-chars)
1031 ;; (narrow-to-region (point-min) (point))
1032
1033 ;; If name is "First Last" and userid is "F?L", then assume
1034 ;; the middle initial is the second letter in the userid.
1035 ;; initially by Jamie Zawinski <jwz@lucid.com>
1036 (cond ((and (eq 3 (- mbox-end mbox-beg))
1037 (progn
1038 (goto-char (point-min))
1039 (looking-at mail-two-name-pattern)))
1040 (setq fi (char-after (match-beginning 0))
1041 li (char-after (match-beginning 3)))
1042 (save-excursion
1043 (set-buffer canonicalization-buffer)
1044 ;; char-equal is ignoring case here, so no need to upcase
1045 ;; or downcase.
1046 (let ((case-fold-search t))
1047 (and (char-equal fi (char-after mbox-beg))
1048 (char-equal li (char-after (1- mbox-end)))
1049 (setq mi (char-after (1+ mbox-beg))))))
1050 (cond ((and mi
1051 ;; TODO: use better table than syntax table
1052 (eq ?w (char-syntax mi)))
1053 (goto-char (match-beginning 3))
1054 (insert (upcase mi) ". ")))))
1055
1056 ;; ;; Restore suffix
1057 ;; (cond (name-suffix
1058 ;; (goto-char (point-max))
1059 ;; (insert ", " name-suffix)
1060 ;; (backward-word 1)
1061 ;; (cond ((memq (following-char) '(?j ?J ?s ?S))
1062 ;; (capitalize-word 1)
1063 ;; (or (eq (following-char) ?.)
1064 ;; (insert ?.)))
1065 ;; (t
1066 ;; (upcase-word 1)))))
1067
1068 ;; Result
1069 (list (buffer-string)
1070 (progn
1071 (set-buffer canonicalization-buffer)
1072 (buffer-string)))
1073 )))
1074
1075 ;; TODO: put this back in the above function now that it's proven:
1076 (defun variant-method (string)
1077 (let ((variant-buffer (get-buffer-create "*variant method buffer*"))
1078 (word-count 0)
1079 mixed-case-flag lower-case-flag upper-case-flag
1080 suffix-flag last-name-comma-flag
1081 comment-beg comment-end initial beg end
1082 )
1083 (save-excursion
1084 (set-buffer variant-buffer)
1085 (buffer-flush-undo variant-buffer)
1086 (set-syntax-table address-text-syntax-table)
1087 (widen)
1088 (erase-buffer)
1089 (setq case-fold-search nil)
1090
1091 (insert string)
1092
1093 ;; Fix . used as space
1094 (goto-char (point-min))
1095 (while (re-search-forward mail-bad-\.-pattern nil t)
1096 (replace-match "\\1 \\2"))
1097
1098 ;; Skip any initial garbage.
1099 (goto-char (point-min))
1100 (skip-chars-forward mail-non-begin-name-chars)
1101 (skip-chars-backward "& \"")
1102 (narrow-to-region (point) (point-max))
1103
1104 (catch 'stop
1105 (while t
1106 (skip-chars-forward mail-whitespace)
1107
1108 (cond
1109
1110 ;; Delete title
1111 ((and (eq word-count 0)
1112 (looking-at mail-full-name-prefixes))
1113 (goto-char (match-end 0))
1114 (narrow-to-region (point) (point-max)))
1115
1116 ;; Stop after name suffix
1117 ((and (>= word-count 2)
1118 (looking-at mail-full-name-suffix-pattern))
1119 (skip-chars-backward mail-whitespace)
1120 (setq suffix-flag (point))
1121 (if (eq ?, (following-char))
1122 (forward-char 1)
1123 (insert ?,))
1124 ;; Enforce at least one space after comma
1125 (or (eq mail-space-char (following-char))
1126 (insert mail-space-char))
1127 (skip-chars-forward mail-whitespace)
1128 (cond ((memq (following-char) '(?j ?J ?s ?S))
1129 (capitalize-word 1)
1130 (if (eq (following-char) ?.)
1131 (forward-char 1)
1132 (insert ?.)))
1133 (t
1134 (upcase-word 1)))
1135 (setq word-count (1+ word-count))
1136 (throw 'stop t))
1137
1138 ;; Handle SCA names
1139 ((looking-at "MKA \\(.+\\)") ; "Mundanely Known As"
1140 (setq word-count 0)
1141 (goto-char (match-beginning 1))
1142 (narrow-to-region (point) (point-max)))
1143
1144 ;; Various stopping points
1145 ((or
1146 ;; Stop before ALL CAPS acronyms, if preceded by mixed-case or
1147 ;; lowercase words. Eg. XT-DEM.
1148 (and (>= word-count 2)
1149 (or mixed-case-flag lower-case-flag)
1150 (looking-at mail-weird-acronym-pattern)
1151 (not (looking-at mail-roman-numeral-pattern)))
1152 ;; Stop before 4-or-more letter lowercase words preceded by
1153 ;; mixed case or uppercase words.
1154 (and (>= word-count 2)
1155 (or upper-case-flag mixed-case-flag)
1156 (looking-at "[a-z][a-z][a-z][a-z]+\\b"))
1157 ;; Stop before trailing alternative address
1158 (looking-at mail-alternative-address-pattern)
1159 ;; Stop before trailing comment not introduced by comma
1160 (looking-at mail-trailing-comment-start-pattern)
1161 ;; Stop before telephone numbers
1162 (looking-at mail-telephone-extension-pattern))
1163 (throw 'stop t))
1164
1165 ;; Check for initial last name followed by comma
1166 ((and (eq ?, (following-char))
1167 (eq word-count 1))
1168 (forward-char 1)
1169 (setq last-name-comma-flag t)
1170 (or (eq mail-space-char (following-char))
1171 (insert mail-space-char)))
1172
1173 ;; Stop before trailing comma-separated comment
1174 ((eq ?, (following-char))
1175 (throw 'stop t))
1176
1177 ;; Delete parenthesized/quoted comment/nickname
1178 ((memq (following-char) '(?\( ?\{ ?\[ ?\" ?\' ?\`))
1179 (setq comment-beg (point))
1180 (set-syntax-table address-text-comment-syntax-table)
1181 (cond ((memq (following-char) '(?\' ?\`))
1182 (if (eq ?\' (following-char))
1183 (forward-char 1))
1184 (or (search-forward "'" nil t)
1185 (delete-char 1)))
1186 (t
1187 (or (safe-move-sexp 1)
1188 (goto-char (point-max)))))
1189 (set-syntax-table address-text-syntax-table)
1190 (setq comment-end (point))
1191 (cond
1192 ;; Handle case of entire name being quoted
1193 ((and (eq word-count 0)
1194 (looking-at " *\\'")
1195 (>= (- comment-end comment-beg) 2))
1196 (narrow-to-region (1+ comment-beg) (1- comment-end))
1197 (goto-char (point-min)))
1198 (t
1199 ;; Handle case of quoted initial
1200 (if (and (or (= 3 (- comment-end comment-beg))
1201 (and (= 4 (- comment-end comment-beg))
1202 (eq ?. (char-after (+ 2 comment-beg)))))
1203 (not (looking-at " *\\'")))
1204 (setq initial (char-after (1+ comment-beg)))
1205 (setq initial nil))
1206 (delete-region comment-beg comment-end)
1207 (if initial
1208 (insert initial ". ")))))
1209
1210 ;; Delete ham radio call signs
1211 ((looking-at mail-ham-call-sign-pattern)
1212 (delete-region (match-beginning 0) (match-end 0)))
1213
1214 ;; Handle & substitution
1215 ;; TODO: remember to disable middle initial guessing
1216 ((and (or (bobp)
1217 (eq mail-space-char (preceding-char)))
1218 (looking-at "&\\( \\|\\'\\)"))
1219 (delete-char 1)
1220 (capitalize-region
1221 (point)
1222 (progn
1223 (insert-buffer-substring canonicalization-buffer
1224 mbox-beg mbox-end)
1225 (point))))
1226
1227 ;; Fixup initials
1228 ((looking-at mail-initial-pattern)
1229 (or (eq (following-char) (upcase (following-char)))
1230 (setq lower-case-flag t))
1231 (forward-char 1)
1232 (if (eq ?. (following-char))
1233 (forward-char 1)
1234 (insert ?.))
1235 (or (eq mail-space-char (following-char))
1236 (insert mail-space-char))
1237 (setq word-count (1+ word-count)))
1238
1239 ;; Regular name words
1240 ((looking-at mail-name-pattern)
1241 (setq beg (point))
1242 (setq end (match-end 0))
1243 (set (if (re-search-forward "[a-z]" end t)
1244 (if (progn
1245 (goto-char beg)
1246 (re-search-forward "[A-Z]" end t))
1247 'mixed-case-flag
1248 'lower-case-flag)
1249 'upper-case-flag) t)
1250 (goto-char end)
1251 (setq word-count (1+ word-count)))
1252
1253 (t
1254 (throw 'stop t)))))
1255
1256 (narrow-to-region (point-min) (point))
1257
1258 ;; Delete trailing word followed immediately by .
1259 (cond ((not suffix-flag)
1260 (goto-char (point-min))
1261 (if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t)
1262 (narrow-to-region (point-min) (match-beginning 0)))))
1263
1264 ;; If last name first put it at end (but before suffix)
1265 (cond (last-name-comma-flag
1266 (goto-char (point-min))
1267 (search-forward ",")
1268 (setq end (1- (point)))
1269 (goto-char (or suffix-flag (point-max)))
1270 (or (eq mail-space-char (preceding-char))
1271 (insert mail-space-char))
1272 (insert-buffer-substring (current-buffer) (point-min) end)
1273 (narrow-to-region (1+ end) (point-max))))
1274
1275 (goto-char (point-max))
1276 (skip-chars-backward mail-non-end-name-chars)
1277 (if (eq ?. (following-char))
1278 (forward-char 1))
1279 (narrow-to-region (point)
1280 (progn
1281 (goto-char (point-min))
1282 (skip-chars-forward mail-non-begin-name-chars)
1283 (point)))
1284
1285 ;; Compress whitespace
1286 (goto-char (point-min))
1287 (while (re-search-forward "[ \t\n]+" nil t)
1288 (replace-match " "))
1289
1290 (buffer-substring (point-min) (point-max))
1291
1292 )))
1293
1294 ;; The country names are just in there for show right now, and because
1295 ;; Jamie thought it would be neat. They aren't used yet.
1296
1297 ;; Keep in mind that the country abbreviations follow ISO-3166. There is
1298 ;; a U.S. FIPS that specifies a different set of two-letter country
1299 ;; abbreviations.
1300
1301 ;; TODO: put this in its own obarray, instead of cluttering up the main
1302 ;; symbol table with junk.
1303
1304 (mapcar
1305 (function
1306 (lambda (x)
1307 (if (symbolp x)
1308 (put x 'domain-name t)
1309 (put (car x) 'domain-name (nth 1 x)))))
1310 '((ag "Antigua")
1311 (ar "Argentina") ; Argentine Republic
1312 arpa ; Advanced Projects Research Agency
1313 (at "Austria") ; The Republic of _
1314 (au "Australia")
1315 (bb "Barbados")
1316 (be "Belgium") ; The Kingdom of _
1317 (bg "Bulgaria")
1318 bitnet ; Because It's Time NET
1319 (bo "Bolivia") ; Republic of _
1320 (br "Brazil") ; The Federative Republic of _
1321 (bs "Bahamas")
1322 (bz "Belize")
1323 (ca "Canada")
1324 (ch "Switzerland") ; The Swiss Confederation
1325 (cl "Chile") ; The Republic of _
1326 (cn "China") ; The People's Republic of _
1327 (co "Columbia")
1328 com ; Commercial
1329 (cr "Costa Rica") ; The Republic of _
1330 (cs "Czechoslovakia")
1331 (de "Germany")
1332 (dk "Denmark")
1333 (dm "Dominica")
1334 (do "Dominican Republic") ; The _
1335 (ec "Ecuador") ; The Republic of _
1336 edu ; Educational
1337 (eg "Egypt") ; The Arab Republic of _
1338 (es "Spain") ; The Kingdom of _
1339 (fi "Finland") ; The Republic of _
1340 (fj "Fiji")
1341 (fr "France")
1342 gov ; Government (U.S.A.)
1343 (gr "Greece") ; The Hellenic Republic
1344 (hk "Hong Kong")
1345 (hu "Hungary") ; The Hungarian People's Republic (???)
1346 (ie "Ireland")
1347 (il "Israel") ; The State of _
1348 (in "India") ; The Republic of _
1349 int ; something British, don't know what
1350 (is "Iceland") ; The Republic of _
1351 (it "Italy") ; The Italian Republic
1352 (jm "Jamaica")
1353 (jp "Japan")
1354 (kn "St. Kitts and Nevis")
1355 (kr "South Korea")
1356 (lc "St. Lucia")
1357 (lk "Sri Lanka") ; The Democratic Socialist Republic of _
1358 mil ; Military (U.S.A.)
1359 (mx "Mexico") ; The United Mexican States
1360 (my "Malaysia") ; changed to Myanmar????
1361 (na "Namibia")
1362 nato ; North Atlantic Treaty Organization
1363 net ; Network
1364 (ni "Nicaragua") ; The Republic of _
1365 (nl "Netherlands") ; The Kingdom of the _
1366 (no "Norway") ; The Kingdom of _
1367 (nz "New Zealand")
1368 org ; Organization
1369 (pe "Peru")
1370 (pg "Papua New Guinea")
1371 (ph "Philippines") ; The Republic of the _
1372 (pl "Poland")
1373 (pr "Puerto Rico")
1374 (pt "Portugal") ; The Portugese Republic
1375 (py "Paraguay")
1376 (se "Sweden") ; The Kingdom of _
1377 (sg "Singapore") ; The Republic of _
1378 (sr "Suriname")
1379 (su "Soviet Union")
1380 (th "Thailand") ; The Kingdom of _
1381 (tn "Tunisia")
1382 (tr "Turkey") ; The Republic of _
1383 (tt "Trinidad and Tobago")
1384 (tw "Taiwan")
1385 (uk "United Kingdom") ; The _ of Great Britain
1386 unter-dom ; something German
1387 (us "U.S.A.") ; The United States of America
1388 uucp ; Unix to Unix CoPy
1389 (uy "Uruguay") ; The Eastern Republic of _
1390 (vc "St. Vincent and the Grenadines")
1391 (ve "Venezuela") ; The Republic of _
1392 (yu "Yugoslavia") ; The Socialist Federal Republic of _
1393 ;; Also said to be Zambia ...
1394 (za "South Africa") ; The Republic of _ (why not Zaire???)
1395 (zw "Zimbabwe") ; Republic of _
1396 ))
1397 ;; fipnet
1398
1399
1400 ;; Code for testing.
1401
1402 (defun time-extract ()
1403 (let (times list)
1404 (setq times (cons (current-time-string) times)
1405 list problem-address-alist)
1406 (while list
1407 (mail-extract-address-components (car (car list)))
1408 (setq list (cdr list)))
1409 (setq times (cons (current-time-string) times))
1410 (nreverse times)))
1411
1412 (defun test-extract (&optional starting-point)
1413 (interactive)
1414 (set-buffer (get-buffer-create "*Testing*"))
1415 (erase-buffer)
1416 (sit-for 0)
1417 (mapcar 'test-extract-internal
1418 (if starting-point
1419 (memq starting-point problem-address-alist)
1420 problem-address-alist)))
1421
1422 (defvar failed-item)
1423 (defun test-extract-internal (item)
1424 (setq failed-item item)
1425 (let* ((address (car item))
1426 (correct-name (nth 1 item))
1427 (correct-canon (nth 2 item))
1428 (result (mail-extract-address-components address))
1429 (name (car result))
1430 (canon (nth 1 result))
1431 (name-correct (or (null correct-name)
1432 (string-equal (downcase correct-name)
1433 (downcase name))))
1434 (canon-correct (or (null correct-canon)
1435 (string-equal correct-canon canon))))
1436 (cond ((not (and name-correct canon-correct))
1437 (pop-to-buffer "*Testing*")
1438 (select-window (get-buffer-window (current-buffer)))
1439 (goto-char (point-max))
1440 (insert "Address: " address "\n")
1441 (if (not name-correct)
1442 (insert " Correct Name: [" correct-name
1443 "]\; Result: [" name "]\n"))
1444 (if (not canon-correct)
1445 (insert " Correct Canon: [" correct-canon
1446 "]\; Result: [" canon "]\n"))
1447 (insert "\n")
1448 (sit-for 0))))
1449 (setq failed-item nil))
1450
1451 (defun test-continue-extract ()
1452 (interactive)
1453 (test-extract failed-item))
1454
1455
1456 ;; Assorted junk.
1457
1458 ;; warsaw@nlm.nih.gov (A Bad Dude -- Barry Warsaw)
1459
1460 ;;'(from
1461 ;; reply-to
1462 ;; return-path
1463 ;; x-uucp-from
1464 ;; sender
1465 ;; resent-from
1466 ;; resent-sender
1467 ;; resent-reply-to)
1468
1469 ;;; mail-extr.el ends here