(hilit-submit-feedback): Fix message spelling.
[bpt/emacs.git] / lisp / mail / mail-extr.el
CommitLineData
72c0ae01
ER
1;;; mail-extr.el --- extract full name and address from RFC 822 mail header.
2
154b3e39 3;; Copyright (C) 1991, 1992, 1993, 1994 Free Software Foundation, Inc.
3a801d0c 4
72c0ae01 5;; Author: Joe Wells <jbw@cs.bu.edu>
154b3e39
RS
6;; Maintainer: Jamie Zawinski <jwz@lucid.com>
7;; Version: 1.8
72c0ae01
ER
8;; Keywords: mail
9
72c0ae01
ER
10;; This file is part of GNU Emacs.
11
12;; GNU Emacs is free software; you can redistribute it and/or modify
13;; it under the terms of the GNU General Public License as published by
154b3e39 14;; the Free Software Foundation; either version 2, or (at your option)
72c0ae01
ER
15;; any later version.
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
23;; along with GNU Emacs; see the file COPYING. If not, write to
24;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
25
26;;; Commentary:
27
154b3e39
RS
28;; The entry point of this code is
29;;
30;; mail-extract-address-components: (address)
31;;
32;; Given an RFC-822 ADDRESS, extract full name and canonical address.
33;; Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
34;; If no name can be extracted, FULL-NAME will be nil.
35;; ADDRESS may be a string or a buffer. If it is a buffer, the visible
36;; (narrowed) portion of the buffer will be interpreted as the address.
37;; (This feature exists so that the clever caller might be able to avoid
38;; consing a string.)
39;; If ADDRESS contains more than one RFC-822 address, only the first is
40;; returned.
41;;
42;; This code is more correct (and more heuristic) parser than the code in
43;; rfc822.el. And despite its size, it's fairly fast.
44;;
72c0ae01 45;; There are two main benefits:
154b3e39 46;;
72c0ae01 47;; 1. Higher probability of getting the correct full name for a human than
154b3e39 48;; any other package we know of. (On the other hand, it will cheerfully
72c0ae01
ER
49;; mangle non-human names/comments.)
50;; 2. Address part is put in a canonical form.
154b3e39
RS
51;;
52;; The interface is not yet carved in stone; please give us suggestions.
53;;
54;; We have an extensive test-case collection of funny addresses if you want to
72c0ae01 55;; work with the code. Developing this code requires frequent testing to
154b3e39
RS
56;; make sure you're not breaking functionality. The test cases aren't included
57;; because they are over 100K.
58;;
59;; If you find an address that mail-extr fails on, please send it to the
60;; maintainer along with what you think the correct results should be. We do
61;; not consider it a bug if mail-extr mangles a comment that does not
62;; correspond to a real human full name, although we would prefer that
63;; mail-extr would return the comment as-is.
64;;
72c0ae01 65;; Features:
154b3e39 66;;
72c0ae01 67;; * Full name handling:
154b3e39 68;;
72c0ae01
ER
69;; * knows where full names can be found in an address.
70;; * avoids using empty comments and quoted text.
71;; * extracts full names from mailbox names.
72;; * recognizes common formats for comments after a full name.
73;; * puts a period and a space after each initial.
154b3e39
RS
74;; * understands & referring to the mailbox name, capitalized.
75;; * strips name prefixes like "Prof.", etc.
72c0ae01
ER
76;; * understands what characters can occur in names (not just letters).
77;; * figures out middle initial from mailbox name.
78;; * removes funny nicknames.
79;; * keeps suffixes such as Jr., Sr., III, etc.
80;; * reorders "Last, First" type names.
154b3e39 81;;
72c0ae01 82;; * Address handling:
154b3e39 83;;
72c0ae01
ER
84;; * parses rfc822 quoted text, comments, and domain literals.
85;; * parses rfc822 multi-line headers.
86;; * does something reasonable with rfc822 GROUP addresses.
87;; * handles many rfc822 noncompliant and garbage addresses.
88;; * canonicalizes addresses (after stripping comments/phrases outside <>).
89;; * converts ! addresses into .UUCP and %-style addresses.
90;; * converts rfc822 ROUTE addresses to %-style addresses.
91;; * truncates %-style addresses at leftmost fully qualified domain name.
92;; * handles local relative precedence of ! vs. % and @ (untested).
154b3e39 93;;
72c0ae01
ER
94;; It does almost no string creation. It primarily uses the built-in
95;; parsing routines with the appropriate syntax tables. This should
96;; result in greater speed.
154b3e39 97;;
72c0ae01 98;; TODO:
154b3e39 99;;
72c0ae01
ER
100;; * handle all test cases. (This will take forever.)
101;; * software to pick the correct header to use (eg., "Senders-Name:").
102;; * multiple addresses in the "From:" header (almost all of the necessary
103;; code is there).
104;; * flag to not treat `,' as an address separator. (This is useful when
105;; there is a "From:" header but no "Sender:" header, because then there
106;; is only allowed to be one address.)
107;; * mailbox name does not necessarily contain full name.
108;; * fixing capitalization when it's all upper or lowercase. (Hard!)
109;; * some of the domain literal handling is missing. (But I've never even
110;; seen one of these in a mail address, so maybe no big deal.)
111;; * arrange to have syntax tables byte-compiled.
112;; * speed hacks.
113;; * delete unused variables.
114;; * arrange for testing with different relative precedences of ! vs. @
115;; and %.
72c0ae01
ER
116;; * insert documentation strings!
117;; * handle X.400-gatewayed addresses according to RFC 1148.
118
119;;; Change Log:
120;;
154b3e39
RS
121;; Thu Feb 17 17:57:33 1994 Jamie Zawinski (jwz@lucid.com)
122;;
123;; * merged with jbw's latest version
124;;
125;; Wed Feb 9 21:56:27 1994 Jamie Zawinski (jwz@lucid.com)
126;;
127;; * high-bit chars in comments weren't treated as word syntax
128;;
129;; Sat Feb 5 03:13:40 1994 Jamie Zawinski (jwz@lucid.com)
130;;
131;; * call replace-match with fixed-case arg
132;;
133;; Thu Dec 16 21:56:45 1993 Jamie Zawinski (jwz@lucid.com)
134;;
135;; * some more cleanup, doc, added provide
136;;
137;; Tue Mar 23 21:23:18 1993 Joe Wells (jbw at csd.bu.edu)
138;;
139;; * Made mail-full-name-prefixes a user-customizable variable.
140;; Allow passing the address as a buffer as well as as a string.
141;; Allow [ and ] as name characters (Finnish character set).
142;;
143;; Mon Mar 22 21:20:56 1993 Joe Wells (jbw at bigbird.bu.edu)
144;;
145;; * Handle "null" addresses. Handle = used for spacing in mailbox
146;; name. Fix bug in handling of ROUTE-ADDR-type addresses that are
147;; missing their brackets. Handle uppercase "JR". Extract full
148;; names from X.400 addresses encoded in RFC-822. Fix bug in
149;; handling of multiple addresses where first has trailing comment.
150;; Handle more kinds of telephone extension lead-ins.
151;;
152;; Mon Mar 22 20:16:57 1993 Joe Wells (jbw at bigbird.bu.edu)
153;;
154;; * Handle HZ encoding for embedding GB encoded chinese characters.
155;;
156;; Mon Mar 22 00:46:12 1993 Joe Wells (jbw at bigbird.bu.edu)
157;;
158;; * Fixed too broad matching of ham radio call signs. Fixed bug in
159;; handling an unmatched ' in a name string. Enhanced recognition
160;; of when . in the mailbox name terminates the name portion.
161;; Narrowed conversion of . to space to only the necessary
162;; situation. Deal with VMS's stupid date stamps. Handle a unique
163;; way of introducing an alternate address. Fixed spacing bug I
164;; introduced in switching last name order. Fixed bug in handling
165;; address with ! and % but no @. Narrowed the cases in which
166;; certain trailing words are discarded.
167;;
168;; Sun Mar 21 21:41:06 1993 Joe Wells (jbw at bigbird.bu.edu)
169;;
170;; * Fixed bugs in handling GROUP addresses. Certain words in the
171;; middle of a name no longer terminate it. Handle LISTSERV list
172;; names. Ignore comment field containing mailbox name.
173;;
174;; Sun Mar 21 14:39:38 1993 Joe Wells (jbw at bigbird.bu.edu)
175;;
176;; * Moved variant-method code back into main function. Handle
177;; underscores as spaces in comments. Handle leading nickname. Add
178;; flag to ignore single-word names. Other changes.
179;;
180;; Mon Feb 1 22:23:31 1993 Joe Wells (jbw at bigbird.bu.edu)
181;;
182;; * Added in changes by Rod Whitby and Jamie Zawinski. This
183;; includes the flag mail-extr-guess-middle-initial and the fix for
6b07c06e
RS
184;; handling multiple addresses correctly. (Whitby just changed
185;; a > to a <.)
154b3e39 186;;
72c0ae01
ER
187;; Mon Apr 6 23:59:09 1992 Joe Wells (jbw at bigbird.bu.edu)
188;;
189;; * Cleaned up some more. Release version 1.0 to world.
190;;
191;; Sun Apr 5 19:39:08 1992 Joe Wells (jbw at bigbird.bu.edu)
192;;
193;; * Cleaned up full name extraction extensively.
194;;
195;; Sun Feb 2 14:45:24 1992 Joe Wells (jbw at bigbird.bu.edu)
196;;
197;; * Total rewrite. Integrated mail-canonicalize-address into
198;; mail-extract-address-components. Now handles GROUP addresses more
199;; or less correctly. Better handling of lots of different cases.
200;;
201;; Fri Jun 14 19:39:50 1991
202;; * Created.
203
204;;; Code:
205\f
72c0ae01 206
154b3e39
RS
207;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
208;;
209;; User configuration variable definitions.
210;;
211
212(defvar mail-extr-guess-middle-initial nil
213 "*Whether to try to guess middle initial from mail address.
214If true, then when we see an address like \"John Smith <jqs@host.com>\"
215we will assume that \"John Q. Smith\" is the fellow's name.")
216
217(defvar mail-extr-ignore-single-names t
218 "*Whether to ignore a name that is just a single word.
219If true, then when we see an address like \"Idiot <dumb@stupid.com>\"
220we will act as though we couldn't find a full name in the address.")
221
222;; Matches a leading title that is not part of the name (does not
223;; contribute to uniquely identifying the person).
224(defvar mail-extr-full-name-prefixes
225 (purecopy
226 "\\(Prof\\|D[Rr]\\|Mrs?\\|Rev\\|Rabbi\\|SysOp\\|LCDR\\)\\.?[ \t\n]")
227 "*Matches prefixes to the full name that identify a person's position.
228These are stripped from the full name because they do not contribute to
229uniquely identifying the person.")
230
231(defvar mail-extr-@-binds-tighter-than-! nil
232 "*Whether the local mail transport agent looks at ! before @.")
233
234(defvar mail-extr-mangle-uucp nil
235 "*Whether to throw away information in UUCP addresses
236by translating things like \"foo!bar!baz@host\" into \"baz@bar.UUCP\".")
72c0ae01
ER
237
238;;----------------------------------------------------------------------
239;; what orderings are meaningful?????
240;;(defvar mail-operator-precedence-list '(?! ?% ?@))
241;; Right operand of a % or a @ must be a domain name, period. No other
242;; operators allowed. Left operand of a @ is an address relative to that
243;; site.
244
245;; Left operand of a ! must be a domain name. Right operand is an
246;; arbitrary address.
247;;----------------------------------------------------------------------
248
154b3e39 249\f
72c0ae01 250
154b3e39
RS
251;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
252;;
253;; Constant definitions.
254;;
255
256;; Codes in
257;; Names in ISO 8859-1 Name
258;; ISO 10XXX ISO 8859-2 in
259;; ISO 6937 ISO 10646 RFC Swedish
260;; etc. Hex Oct 1345 TeX Split ASCII Description
261;; --------- ---------- ---- --- ----- ----- -------------------------------
262;; %a E4 344 a: \"a ae { latin small a + diaeresis d
263;; %o F6 366 o: \"o oe | latin small o + diaeresis v
264;; @a E5 345 aa \oa aa } latin small a + ring above e
265;; %u FC 374 u: \"u ue ~ latin small u + diaeresis |
266;; /e E9 351 e' \'e ` latin small e + acute i
267;; %A C4 304 A: \"A AE [ latin capital a + diaeresis D
268;; %O D6 326 O: \"O OE \ latin capital o + diaeresis V
269;; @A C5 305 AA \oA AA ] latin capital a + ring above E
270;; %U DC 334 U: \"U UE ^ latin capital u + diaeresis \
271;; /E C9 311 E' \'E @ latin capital e + acute I
272
273;; NOTE: @a and @A are not in ISO 8859-2 (the codes mentioned above invoke
274;; /l and /L). Some of this data was retrieved from
275;; listserv@jhuvm.hcf.jhu.edu.
72c0ae01
ER
276
277;; Any character that can occur in a name, not counting characters that
154b3e39
RS
278;; separate parts of a multipart name (hyphen and period).
279;; Yes, there are weird people with digits in their names.
280;; You will also notice the consideration for the
281;; Swedish/Finnish/Norwegian character set.
154b3e39 282(defconst mail-extr-all-letters-but-separators
fa25dbbc 283 (purecopy "][A-Za-z{|}'~0-9`\200-\377"))
154b3e39
RS
284
285;; Any character that can occur in a name in an RFC822 address including
286;; the separator (hyphen and possibly period) for multipart names.
287;; #### should . be in here?
288(defconst mail-extr-all-letters
289 (purecopy (concat mail-extr-all-letters-but-separators "---")))
290
291;; Any character that can start a name.
292;; Keep this set as minimal as possible.
fa25dbbc 293(defconst mail-extr-first-letters (purecopy "A-Za-z\200-\377"))
72c0ae01
ER
294
295;; Any character that can end a name.
154b3e39 296;; Keep this set as minimal as possible.
fa25dbbc 297(defconst mail-extr-last-letters (purecopy "A-Za-z\200-\377`'."))
72c0ae01 298
154b3e39
RS
299(defconst mail-extr-leading-garbage
300 (purecopy (format "[^%s]+" mail-extr-first-letters)))
72c0ae01 301
154b3e39
RS
302;; (defconst mail-extr-non-name-chars
303;; (purecopy (concat "^" mail-extr-all-letters ".")))
304;; (defconst mail-extr-non-begin-name-chars
305;; (purecopy (concat "^" mail-extr-first-letters)))
306;; (defconst mail-extr-non-end-name-chars
307;; (purecopy (concat "^" mail-extr-last-letters)))
72c0ae01 308
154b3e39
RS
309;; Matches an initial not followed by both a period and a space.
310;; (defconst mail-extr-bad-initials-pattern
311;; (purecopy
312;; (format "\\(\\([^%s]\\|\\`\\)[%s]\\)\\(\\.\\([^ ]\\)\\| \\|\\([^%s .]\\)\\|\\'\\)"
313;; mail-extr-all-letters mail-extr-first-letters mail-extr-all-letters)))
72c0ae01
ER
314
315;; Matches periods used instead of spaces. Must not match the period
316;; following an initial.
154b3e39
RS
317(defconst mail-extr-bad-dot-pattern
318 (purecopy
319 (format "\\([%s][%s]\\)\\.+\\([%s]\\)"
320 mail-extr-all-letters
321 mail-extr-last-letters
322 mail-extr-first-letters)))
72c0ae01
ER
323
324;; Matches an embedded or leading nickname that should be removed.
154b3e39
RS
325;; (defconst mail-extr-nickname-pattern
326;; (purecopy
327;; (format "\\([ .]\\|\\`\\)[\"'`\[\(]\\([ .%s]+\\)[\]\"'\)] "
328;; mail-extr-all-letters)))
72c0ae01
ER
329
330;; Matches the occurrence of a generational name suffix, and the last
154b3e39
RS
331;; character of the preceding name. This is important because we want to
332;; keep such suffixes: they help to uniquely identify the person.
333;; *** Perhaps this should be a user-customizable variable. However, the
334;; *** regular expression is fairly tricky to alter, so maybe not.
335(defconst mail-extr-full-name-suffix-pattern
336 (purecopy
337 (format
338 "\\(,? ?\\([JjSs][Rr]\\.?\\|V?I+V?\\)\\)\\([^%s]\\([^%s]\\|\\'\\)\\|\\'\\)"
339 mail-extr-all-letters mail-extr-all-letters)))
340
341(defconst mail-extr-roman-numeral-pattern (purecopy "V?I+V?\\b"))
72c0ae01
ER
342
343;; Matches a trailing uppercase (with other characters possible) acronym.
344;; Must not match a trailing uppercase last name or trailing initial
154b3e39
RS
345(defconst mail-extr-weird-acronym-pattern
346 (purecopy "\\([A-Z]+[-_/]\\|[A-Z][A-Z][A-Z]?\\b\\)"))
72c0ae01
ER
347
348;; Matches a mixed-case or lowercase name (not an initial).
154b3e39
RS
349;; #### Match Latin1 lower case letters here too?
350;; (defconst mail-extr-mixed-case-name-pattern
351;; (purecopy
352;; (format
353;; "\\b\\([a-z][%s]*[%s]\\|[%s][%s]*[a-z][%s]*[%s]\\|[%s][%s]*[a-z]\\)"
354;; mail-extr-all-letters mail-extr-last-letters
355;; mail-extr-first-letters mail-extr-all-letters mail-extr-all-letters
356;; mail-extr-last-letters mail-extr-first-letters mail-extr-all-letters)))
72c0ae01
ER
357
358;; Matches a trailing alternative address.
154b3e39
RS
359;; #### Match Latin1 letters here too?
360;; #### Match _ before @ here too?
361(defconst mail-extr-alternative-address-pattern
362 (purecopy "\\(aka *\\)?[a-zA-Z.]+[!@][a-zA-Z.]"))
72c0ae01
ER
363
364;; Matches a variety of trailing comments not including comma-delimited
365;; comments.
154b3e39
RS
366(defconst mail-extr-trailing-comment-start-pattern
367 (purecopy " [-{]\\|--\\|[+@#></\;]"))
72c0ae01
ER
368
369;; Matches a name (not an initial).
370;; This doesn't force a word boundary at the end because sometimes a
371;; comment is separated by a `-' with no preceding space.
154b3e39
RS
372(defconst mail-extr-name-pattern
373 (purecopy (format "\\b[%s][%s]*[%s]"
374 mail-extr-first-letters
375 mail-extr-all-letters
376 mail-extr-last-letters)))
72c0ae01 377
154b3e39
RS
378(defconst mail-extr-initial-pattern
379 (purecopy (format "\\b[%s]\\([. ]\\|\\b\\)" mail-extr-first-letters)))
72c0ae01
ER
380
381;; Matches a single name before a comma.
154b3e39
RS
382;; (defconst mail-extr-last-name-first-pattern
383;; (purecopy (concat "\\`" mail-extr-name-pattern ",")))
72c0ae01
ER
384
385;; Matches telephone extensions.
154b3e39
RS
386(defconst mail-extr-telephone-extension-pattern
387 (purecopy
388 "\\(\\([Ee]xt\\|\\|[Tt]ph\\|[Tt]el\\|[Xx]\\).?\\)? *\\+?[0-9][- 0-9]+"))
72c0ae01
ER
389
390;; Matches ham radio call signs.
154b3e39
RS
391;; Help from: Mat Maessen N2NJZ <maessm@rpi.edu>, Mark Feit
392;; <mark@era.com>, Michael Covington <mcovingt@ai.uga.edu>.
393;; Examples: DX504 DX515 K5MRU K8DHK KA9WGN KA9WGN KD3FU KD6EUI KD6HBW
394;; KE9TV KF0NV N1API N3FU N3GZE N3IGS N4KCC N7IKQ N9HHU W4YHF W6ANK WA2SUH
395;; WB7VZI N2NJZ NR3G KJ4KK AB4UM AL7NI KH6OH WN3KBT N4TMI W1A N0NZO
396(defconst mail-extr-ham-call-sign-pattern
397 (purecopy "\\b\\(DX[0-9]+\\|[AKNW][A-Z]?[0-9][A-Z][A-Z]?[A-Z]?\\)"))
398
399;; Possible trailing suffixes: "\\(/\\(KT\\|A[AEG]\\|[R0-9]\\)\\)?"
400;; /KT == Temporary Technician (has CSC but not "real" license)
401;; /AA == Temporary Advanced
402;; /AE == Temporary Extra
403;; /AG == Temporary General
404;; /R == repeater
405;; /# == stations operating out of home district
406;; I don't include these in the regexp above because I can't imagine
407;; anyone putting them with their name in an e-mail address.
72c0ae01
ER
408
409;; Matches normal single-part name
154b3e39
RS
410(defconst mail-extr-normal-name-pattern
411 (purecopy (format "\\b[%s][%s]+[%s]"
412 mail-extr-first-letters
413 mail-extr-all-letters-but-separators
414 mail-extr-last-letters)))
415
416;; Matches a single word name.
417;; (defconst mail-extr-one-name-pattern
418;; (purecopy (concat "\\`" mail-extr-normal-name-pattern "\\'")))
419
72c0ae01 420;; Matches normal two names with missing middle initial
154b3e39
RS
421;; The first name is not allowed to have a hyphen because this can cause
422;; false matches where the "middle initial" is actually the first letter
423;; of the second part of the first name.
424(defconst mail-extr-two-name-pattern
425 (purecopy
426 (concat "\\`\\(" mail-extr-normal-name-pattern
427 "\\|" mail-extr-initial-pattern
428 "\\) +\\(" mail-extr-name-pattern "\\)\\(,\\|\\'\\)")))
429
430(defconst mail-extr-listserv-list-name-pattern
431 (purecopy "Multiple recipients of list \\([-A-Z]+\\)"))
432
433(defconst mail-extr-stupid-vms-date-stamp-pattern
434 (purecopy
435 "[0-9][0-9]-[JFMASOND][aepuco][nbrylgptvc]-[0-9][0-9][0-9][0-9] [0-9]+ *"))
436
437;;; HZ -- GB (PRC Chinese character encoding) in ASCII embedding protocol
438;;
439;; In ASCII mode, a byte is interpreted as an ASCII character, unless a '~' is
440;; encountered. The character '~' is an escape character. By convention, it
441;; must be immediately followed ONLY by '~', '{' or '\n' (<LF>), with the
442;; following special meaning.
443;;
444;; o The escape sequence '~~' is interpreted as a '~'.
445;; o The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
446;; o The escape sequence '~\n' is a line-continuation marker to be consumed
447;; with no output produced.
448;;
449;; In GB mode, characters are interpreted two bytes at a time as (pure) GB
450;; codes until the escape-from-GB code '~}' is read. This code switches the
451;; mode from GB back to ASCII. (Note that the escape-from-GB code '~}'
452;; ($7E7D) is outside the defined GB range.)
453(defconst mail-extr-hz-embedded-gb-encoded-chinese-pattern
454 (purecopy "~{\\([^~].\\|~[^\}]\\)+~}"))
455
456;; The leading optional lowercase letters are for a bastardized version of
457;; the encoding, as is the optional nature of the final slash.
458(defconst mail-extr-x400-encoded-address-pattern
459 (purecopy "[a-z]?[a-z]?\\(/[A-Za-z]+\\(\\.[A-Za-z]+\\)?=[^/]+\\)+/?\\'"))
460
461(defconst mail-extr-x400-encoded-address-field-pattern-format
462 (purecopy "/%s=\\([^/]+\\)\\(/\\|\\'\\)"))
463
464(defconst mail-extr-x400-encoded-address-surname-pattern
465 ;; S stands for Surname (family name).
466 (purecopy
467 (format mail-extr-x400-encoded-address-field-pattern-format "[Ss]")))
468
469(defconst mail-extr-x400-encoded-address-given-name-pattern
470 ;; G stands for Given name.
471 (purecopy
472 (format mail-extr-x400-encoded-address-field-pattern-format "[Gg]")))
473
474(defconst mail-extr-x400-encoded-address-full-name-pattern
475 ;; PN stands for Personal Name. When used it represents the combination
476 ;; of the G and S fields.
477 ;; "The one system I used having this field asked it with the prompt
478 ;; `Personal Name'. But they mapped it into G and S on outgoing real
479 ;; X.400 addresses. As they mapped G and S into PN on incoming..."
480 (purecopy
481 (format mail-extr-x400-encoded-address-field-pattern-format "[Pp][Nn]")))
482
483\f
484
485;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
486;;
487;; Syntax tables used for quick parsing.
488;;
489
490(defconst mail-extr-address-syntax-table (make-syntax-table))
491(defconst mail-extr-address-comment-syntax-table (make-syntax-table))
492(defconst mail-extr-address-domain-literal-syntax-table (make-syntax-table))
493(defconst mail-extr-address-text-comment-syntax-table (make-syntax-table))
494(defconst mail-extr-address-text-syntax-table (make-syntax-table))
72c0ae01
ER
495(mapcar
496 (function
497 (lambda (pair)
498 (let ((syntax-table (symbol-value (car pair))))
499 (mapcar
500 (function
501 (lambda (item)
502 (if (eq 2 (length item))
154b3e39 503 ;; modifying syntax of a single character
72c0ae01 504 (modify-syntax-entry (car item) (car (cdr item)) syntax-table)
154b3e39
RS
505 ;; modifying syntax of a range of characters
506 (let ((char (nth 0 item))
507 (bound (nth 1 item))
508 (syntax (nth 2 item)))
72c0ae01
ER
509 (while (<= char bound)
510 (modify-syntax-entry char syntax syntax-table)
511 (setq char (1+ char)))))))
512 (cdr pair)))))
154b3e39
RS
513 '((mail-extr-address-syntax-table
514 (?\000 ?\037 "w") ;control characters
515 (?\040 " ") ;SPC
516 (?! ?~ "w") ;printable characters
517 (?\177 "w") ;DEL
518 (?\200 ?\377 "w") ;high-bit-on characters
519 (?\240 " ") ;nobreakspace
72c0ae01
ER
520 (?\t " ")
521 (?\r " ")
522 (?\n " ")
523 (?\( ".")
524 (?\) ".")
525 (?< ".")
526 (?> ".")
527 (?@ ".")
528 (?, ".")
529 (?\; ".")
530 (?: ".")
531 (?\\ "\\")
532 (?\" "\"")
533 (?. ".")
534 (?\[ ".")
535 (?\] ".")
536 ;; % and ! aren't RFC822 characters, but it is convenient to pretend
537 (?% ".")
154b3e39 538 (?! ".") ;; this needs to be word-constituent when not in .UUCP mode
72c0ae01 539 )
154b3e39
RS
540 (mail-extr-address-comment-syntax-table
541 (?\000 ?\377 "w")
542 (?\040 " ")
543 (?\240 " ")
544 (?\t " ")
545 (?\r " ")
546 (?\n " ")
72c0ae01
ER
547 (?\( "\(\)")
548 (?\) "\)\(")
549 (?\\ "\\"))
154b3e39
RS
550 (mail-extr-address-domain-literal-syntax-table
551 (?\000 ?\377 "w")
552 (?\040 " ")
553 (?\240 " ")
554 (?\t " ")
555 (?\r " ")
556 (?\n " ")
72c0ae01
ER
557 (?\[ "\(\]") ;??????
558 (?\] "\)\[") ;??????
559 (?\\ "\\"))
154b3e39
RS
560 (mail-extr-address-text-comment-syntax-table
561 (?\000 ?\377 "w")
562 (?\040 " ")
563 (?\240 " ")
564 (?\t " ")
565 (?\r " ")
566 (?\n " ")
72c0ae01
ER
567 (?\( "\(\)")
568 (?\) "\)\(")
569 (?\[ "\(\]")
570 (?\] "\)\[")
571 (?\{ "\(\}")
572 (?\} "\)\{")
573 (?\\ "\\")
574 (?\" "\"")
575 ;; (?\' "\)\`")
576 ;; (?\` "\(\'")
577 )
154b3e39
RS
578 (mail-extr-address-text-syntax-table
579 (?\000 ?\177 ".")
580 (?\200 ?\377 "w")
581 (?\040 " ")
582 (?\t " ")
583 (?\r " ")
584 (?\n " ")
72c0ae01
ER
585 (?A ?Z "w")
586 (?a ?z "w")
587 (?- "w")
588 (?\} "w")
589 (?\{ "w")
590 (?| "w")
591 (?\' "w")
592 (?~ "w")
593 (?0 ?9 "w"))
594 ))
595
596\f
154b3e39
RS
597;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
598;;
72c0ae01 599;; Utility functions and macros.
154b3e39
RS
600;;
601
602(defmacro mail-extr-delete-char (n)
603 ;; in v19, delete-char is compiled as a function call, but delete-region
604 ;; is byte-coded, so it's much much faster.
605 (list 'delete-region '(point) (list '+ '(point) n)))
606
607(defmacro mail-extr-skip-whitespace-forward ()
608 ;; v19 fn skip-syntax-forward is more tasteful, but not byte-coded.
609 '(skip-chars-forward " \t\n\r\240"))
610
611(defmacro mail-extr-skip-whitespace-backward ()
612 ;; v19 fn skip-syntax-backward is more tasteful, but not byte-coded.
613 '(skip-chars-backward " \t\n\r\240"))
614
72c0ae01 615
154b3e39 616(defmacro mail-extr-undo-backslash-quoting (beg end)
72c0ae01
ER
617 (`(save-excursion
618 (save-restriction
619 (narrow-to-region (, beg) (, end))
620 (goto-char (point-min))
621 ;; undo \ quoting
154b3e39
RS
622 (while (search-forward "\\" nil t)
623 (mail-extr-delete-char -1)
624 (or (eobp)
625 (forward-char 1))
72c0ae01
ER
626 )))))
627
154b3e39 628(defmacro mail-extr-nuke-char-at (pos)
72c0ae01
ER
629 (` (save-excursion
630 (goto-char (, pos))
154b3e39
RS
631 (mail-extr-delete-char 1)
632 (insert ?\ ))))
633
634(put 'mail-extr-nuke-outside-range
635 'edebug-form-spec '(symbolp &optional form form atom))
636
637(defmacro mail-extr-nuke-outside-range (list-symbol
638 beg-symbol end-symbol
639 &optional no-replace)
640 ;; LIST-SYMBOL names a variable holding a list of buffer positions
641 ;; BEG-SYMBOL and END-SYMBOL name variables delimiting a range
642 ;; Each element of LIST-SYMBOL which lies outside of the range is
643 ;; deleted from the list.
644 ;; Unless NO-REPLACE is true, at each of the positions in LIST-SYMBOL
645 ;; which lie outside of the range, one character at that position is
646 ;; replaced with a SPC.
647 (or (memq no-replace '(t nil))
648 (error "no-replace must be t or nil, evalable at macroexpand-time."))
649 (` (let ((temp (, list-symbol))
650 ch)
72c0ae01 651 (while temp
154b3e39
RS
652 (setq ch (car temp))
653 (cond ((or (> ch (, end-symbol))
654 (< ch (, beg-symbol)))
655 (,@ (if no-replace
656 nil
657 (` ((mail-extr-nuke-char-at ch)))))
72c0ae01
ER
658 (setcar temp nil)))
659 (setq temp (cdr temp)))
660 (setq (, list-symbol) (delq nil (, list-symbol))))))
661
154b3e39
RS
662(defun mail-extr-demarkerize (marker)
663 ;; if arg is a marker, destroys the marker, then returns the old value.
664 ;; otherwise returns the arg.
665 (if (markerp marker)
666 (let ((temp (marker-position marker)))
667 (set-marker marker nil)
668 temp)
669 marker))
670
671(defun mail-extr-markerize (pos)
672 ;; coerces pos to a marker if non-nil.
673 (if (or (markerp pos) (null pos))
674 pos
675 (copy-marker pos)))
676
677(defmacro mail-extr-last (list)
678 ;; Returns last element of LIST.
679 ;; Could be a subst.
72c0ae01
ER
680 (` (let ((list (, list)))
681 (while (not (null (cdr list)))
682 (setq list (cdr list)))
683 (car list))))
684
154b3e39
RS
685(defmacro mail-extr-safe-move-sexp (arg)
686 ;; Safely skip over one balanced sexp, if there is one. Return t if success.
72c0ae01
ER
687 (` (condition-case error
688 (progn
689 (goto-char (scan-sexps (point) (, arg)))
690 t)
691 (error
154b3e39 692 ;; #### kludge kludge kludge kludge kludge kludge kludge !!!
72c0ae01
ER
693 (if (string-equal (nth 1 error) "Unbalanced parentheses")
694 nil
695 (while t
696 (signal (car error) (cdr error))))))))
72c0ae01 697\f
154b3e39
RS
698;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
699;;
72c0ae01 700;; The main function to grind addresses
154b3e39
RS
701;;
702
703(defvar disable-initial-guessing-flag) ; dynamic assignment
704(defvar cbeg) ; dynamic assignment
705(defvar cend) ; dynamic assignment
72c0ae01 706
154b3e39 707;;;###autoload
72c0ae01 708(defun mail-extract-address-components (address)
154b3e39
RS
709 "Given an RFC-822 ADDRESS, extract full name and canonical address.
710Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
711If no name can be extracted, FULL-NAME will be nil.
712ADDRESS may be a string or a buffer. If it is a buffer, the visible
713 (narrowed) portion of the buffer will be interpreted as the address.
714 (This feature exists so that the clever caller might be able to avoid
715 consing a string.)
716If ADDRESS contains more than one RFC-822 address, only the first is
717 returned. Some day this function may be extended to extract multiple
718 addresses, or perhaps return the position at which parsing stopped."
719 (let ((canonicalization-buffer (get-buffer-create " *canonical address*"))
720 (extraction-buffer (get-buffer-create " *extract address components*"))
72c0ae01 721 char
154b3e39 722;; multiple-addresses
72c0ae01
ER
723 <-pos >-pos @-pos :-pos ,-pos !-pos %-pos \;-pos
724 group-:-pos group-\;-pos route-addr-:-pos
725 record-pos-symbol
726 first-real-pos last-real-pos
727 phrase-beg phrase-end
154b3e39 728 cbeg cend ; dynamically set from -voodoo
72c0ae01
ER
729 quote-beg quote-end
730 atom-beg atom-end
731 mbox-beg mbox-end
732 \.-ends-name
733 temp
154b3e39
RS
734;; name-suffix
735 fi mi li ; first, middle, last initial
72c0ae01 736 saved-%-pos saved-!-pos saved-@-pos
154b3e39
RS
737 domain-pos \.-pos insert-point
738;; mailbox-name-processed-flag
739 disable-initial-guessing-flag ; dynamically set from -voodoo
740 )
72c0ae01
ER
741
742 (save-excursion
743 (set-buffer extraction-buffer)
154b3e39
RS
744 (fundamental-mode)
745 (kill-all-local-variables)
e8a57935 746 (buffer-disable-undo extraction-buffer)
154b3e39 747 (set-syntax-table mail-extr-address-syntax-table)
72c0ae01
ER
748 (widen)
749 (erase-buffer)
750 (setq case-fold-search nil)
751
752 ;; Insert extra space at beginning to allow later replacement with <
753 ;; without having to move markers.
154b3e39
RS
754 (insert ?\ )
755
756 ;; Insert the address itself.
757 (cond ((stringp address)
758 (insert address))
759 ((bufferp address)
760 (insert-buffer-substring address))
761 (t
762 (error "Illegal address: %s" address)))
72c0ae01
ER
763
764 ;; stolen from rfc822.el
765 ;; Unfold multiple lines.
766 (goto-char (point-min))
767 (while (re-search-forward "\\([^\\]\\(\\\\\\\\\\)*\\)\n[ \t]" nil t)
768 (replace-match "\\1 " t))
769
770 ;; first pass grabs useful information about address
771 (goto-char (point-min))
772 (while (progn
154b3e39 773 (mail-extr-skip-whitespace-forward)
72c0ae01
ER
774 (not (eobp)))
775 (setq char (char-after (point)))
776 (or first-real-pos
777 (if (not (eq char ?\())
778 (setq first-real-pos (point))))
779 (cond
780 ;; comment
781 ((eq char ?\()
154b3e39 782 (set-syntax-table mail-extr-address-comment-syntax-table)
72c0ae01 783 ;; only record the first non-empty comment's position
154b3e39 784 (if (and (not cbeg)
72c0ae01
ER
785 (save-excursion
786 (forward-char 1)
154b3e39 787 (mail-extr-skip-whitespace-forward)
72c0ae01 788 (not (eq ?\) (char-after (point))))))
154b3e39 789 (setq cbeg (point)))
72c0ae01 790 ;; TODO: don't record if unbalanced
154b3e39 791 (or (mail-extr-safe-move-sexp 1)
72c0ae01 792 (forward-char 1))
154b3e39
RS
793 (set-syntax-table mail-extr-address-syntax-table)
794 (if (and cbeg
795 (not cend))
796 (setq cend (point))))
72c0ae01
ER
797 ;; quoted text
798 ((eq char ?\")
799 ;; only record the first non-empty quote's position
800 (if (and (not quote-beg)
801 (save-excursion
802 (forward-char 1)
154b3e39 803 (mail-extr-skip-whitespace-forward)
72c0ae01
ER
804 (not (eq ?\" (char-after (point))))))
805 (setq quote-beg (point)))
806 ;; TODO: don't record if unbalanced
154b3e39 807 (or (mail-extr-safe-move-sexp 1)
72c0ae01
ER
808 (forward-char 1))
809 (if (and quote-beg
810 (not quote-end))
811 (setq quote-end (point))))
812 ;; domain literals
813 ((eq char ?\[)
154b3e39
RS
814 (set-syntax-table mail-extr-address-domain-literal-syntax-table)
815 (or (mail-extr-safe-move-sexp 1)
72c0ae01 816 (forward-char 1))
154b3e39 817 (set-syntax-table mail-extr-address-syntax-table))
72c0ae01
ER
818 ;; commas delimit addresses when outside < > pairs.
819 ((and (eq char ?,)
154b3e39
RS
820 (or (and (null <-pos)
821 ;; Handle ROUTE-ADDR address that is missing its <.
822 (not (eq ?@ (char-after (1+ (point))))))
72c0ae01
ER
823 (and >-pos
824 ;; handle weird munged addresses
154b3e39
RS
825 ;; BUG FIX: This test was reversed. Thanks to the
826 ;; brilliant Rod Whitby <rwhitby@research.canon.oz.au>
827 ;; for discovering this!
828 (< (mail-extr-last <-pos) (car >-pos)))))
829;; It'd be great if some day this worked, but for now, punt.
830;; (setq multiple-addresses t)
831;; ;; *** Why do I want this:
832;; (mail-extr-delete-char 1)
833;; (narrow-to-region (point-min) (point))
834 (delete-region (point) (point-max))
835 (setq char ?\() ; HAVE I NO SHAME??
836 )
72c0ae01
ER
837 ;; record the position of various interesting chars, determine
838 ;; legality later.
839 ((setq record-pos-symbol
840 (cdr (assq char
841 '((?< . <-pos) (?> . >-pos) (?@ . @-pos)
842 (?: . :-pos) (?, . ,-pos) (?! . !-pos)
843 (?% . %-pos) (?\; . \;-pos)))))
844 (set record-pos-symbol
845 (cons (point) (symbol-value record-pos-symbol)))
846 (forward-char 1))
847 ((eq char ?.)
848 (forward-char 1))
849 ((memq char '(
850 ;; comment terminator illegal
851 ?\)
852 ;; domain literal terminator illegal
853 ?\]
854 ;; \ allowed only within quoted strings,
855 ;; domain literals, and comments
856 ?\\
857 ))
154b3e39 858 (mail-extr-nuke-char-at (point))
72c0ae01
ER
859 (forward-char 1))
860 (t
861 (forward-word 1)))
862 (or (eq char ?\()
154b3e39
RS
863 ;; At the end of first address of a multiple address header.
864 (and (eq char ?,)
865 (eobp))
72c0ae01
ER
866 (setq last-real-pos (point))))
867
868 ;; Use only the leftmost <, if any. Replace all others with spaces.
869 (while (cdr <-pos)
154b3e39 870 (mail-extr-nuke-char-at (car <-pos))
72c0ae01
ER
871 (setq <-pos (cdr <-pos)))
872
873 ;; Use only the rightmost >, if any. Replace all others with spaces.
874 (while (cdr >-pos)
154b3e39 875 (mail-extr-nuke-char-at (nth 1 >-pos))
72c0ae01
ER
876 (setcdr >-pos (nthcdr 2 >-pos)))
877
878 ;; If multiple @s and a :, but no < and >, insert around buffer.
154b3e39 879 ;; Example: @foo.bar.dom,@xxx.yyy.zzz:mailbox@aaa.bbb.ccc
72c0ae01
ER
880 ;; This commonly happens on the UUCP "From " line. Ugh.
881 (cond ((and (> (length @-pos) 1)
154b3e39
RS
882 (eq 1 (length :-pos)) ;TODO: check if between last two @s
883 (not \;-pos)
72c0ae01
ER
884 (not <-pos))
885 (goto-char (point-min))
154b3e39 886 (mail-extr-delete-char 1)
72c0ae01
ER
887 (setq <-pos (list (point)))
888 (insert ?<)))
889
890 ;; If < but no >, insert > in rightmost possible position
891 (cond ((and <-pos
892 (null >-pos))
893 (goto-char (point-max))
894 (setq >-pos (list (point)))
895 (insert ?>)))
896
897 ;; If > but no <, replace > with space.
898 (cond ((and >-pos
899 (null <-pos))
154b3e39 900 (mail-extr-nuke-char-at (car >-pos))
72c0ae01
ER
901 (setq >-pos nil)))
902
903 ;; Turn >-pos and <-pos into non-lists
904 (setq >-pos (car >-pos)
905 <-pos (car <-pos))
906
907 ;; Trim other punctuation lists of items outside < > pair to handle
908 ;; stupid MTAs.
909 (cond (<-pos ; don't need to check >-pos also
910 ;; handle bozo software that violates RFC 822 by sticking
911 ;; punctuation marks outside of a < > pair
154b3e39 912 (mail-extr-nuke-outside-range @-pos <-pos >-pos t)
72c0ae01
ER
913 ;; RFC 822 says nothing about these two outside < >, but
914 ;; remove those positions from the lists to make things
915 ;; easier.
154b3e39
RS
916 (mail-extr-nuke-outside-range !-pos <-pos >-pos t)
917 (mail-extr-nuke-outside-range %-pos <-pos >-pos t)))
72c0ae01
ER
918
919 ;; Check for : that indicates GROUP list and for : part of
920 ;; ROUTE-ADDR spec.
921 ;; Can't possibly be more than two :. Nuke any extra.
922 (while :-pos
923 (setq temp (car :-pos)
924 :-pos (cdr :-pos))
925 (cond ((and <-pos >-pos
926 (> temp <-pos)
927 (< temp >-pos))
928 (if (or route-addr-:-pos
929 (< (length @-pos) 2)
930 (> temp (car @-pos))
931 (< temp (nth 1 @-pos)))
154b3e39 932 (mail-extr-nuke-char-at temp)
72c0ae01
ER
933 (setq route-addr-:-pos temp)))
934 ((or (not <-pos)
935 (and <-pos
936 (< temp <-pos)))
937 (setq group-:-pos temp))))
938
939 ;; Nuke any ; that is in or to the left of a < > pair or to the left
940 ;; of a GROUP starting :. Also, there may only be one ;.
941 (while \;-pos
942 (setq temp (car \;-pos)
943 \;-pos (cdr \;-pos))
944 (cond ((and <-pos >-pos
945 (> temp <-pos)
946 (< temp >-pos))
154b3e39 947 (mail-extr-nuke-char-at temp))
72c0ae01
ER
948 ((and (or (not group-:-pos)
949 (> temp group-:-pos))
950 (not group-\;-pos))
951 (setq group-\;-pos temp))))
952
154b3e39
RS
953 ;; Nuke unmatched GROUP syntax characters.
954 (cond ((and group-:-pos (not group-\;-pos))
955 ;; *** Do I really need to erase it?
956 (mail-extr-nuke-char-at group-:-pos)
957 (setq group-:-pos nil)))
958 (cond ((and group-\;-pos (not group-:-pos))
959 ;; *** Do I really need to erase it?
960 (mail-extr-nuke-char-at group-\;-pos)
961 (setq group-\;-pos nil)))
962
72c0ae01
ER
963 ;; Handle junk like ";@host.company.dom" that sendmail adds.
964 ;; **** should I remember comment positions?
154b3e39
RS
965 (cond
966 (group-\;-pos
967 ;; this is fine for now
968 (mail-extr-nuke-outside-range !-pos group-:-pos group-\;-pos t)
969 (mail-extr-nuke-outside-range @-pos group-:-pos group-\;-pos t)
970 (mail-extr-nuke-outside-range %-pos group-:-pos group-\;-pos t)
971 (mail-extr-nuke-outside-range ,-pos group-:-pos group-\;-pos t)
972 (and last-real-pos
973 (> last-real-pos (1+ group-\;-pos))
974 (setq last-real-pos (1+ group-\;-pos)))
975 ;; *** This may be wrong:
976 (and cend
977 (> cend group-\;-pos)
978 (setq cend nil
979 cbeg nil))
980 (and quote-end
981 (> quote-end group-\;-pos)
982 (setq quote-end nil
983 quote-beg nil))
984 ;; This was both wrong and unnecessary:
985 ;;(narrow-to-region (point-min) group-\;-pos)
986
987 ;; *** The entire handling of GROUP addresses seems rather lame.
988 ;; *** It deserves a complete rethink, except that these addresses
989 ;; *** are hardly ever seen.
990 ))
72c0ae01
ER
991
992 ;; Any commas must be between < and : of ROUTE-ADDR. Nuke any
993 ;; others.
994 ;; Hell, go ahead an nuke all of the commas.
995 ;; **** This will cause problems when we start handling commas in
996 ;; the PHRASE part .... no it won't ... yes it will ... ?????
154b3e39 997 (mail-extr-nuke-outside-range ,-pos 1 1)
72c0ae01
ER
998
999 ;; can only have multiple @s inside < >. The fact that some MTAs
1000 ;; put de-bracketed ROUTE-ADDRs in the UUCP-style "From " line is
1001 ;; handled above.
1002
1003 ;; Locate PHRASE part of ROUTE-ADDR.
1004 (cond (<-pos
1005 (goto-char <-pos)
154b3e39 1006 (mail-extr-skip-whitespace-backward)
72c0ae01
ER
1007 (setq phrase-end (point))
1008 (goto-char (or ;;group-:-pos
1009 (point-min)))
154b3e39 1010 (mail-extr-skip-whitespace-forward)
72c0ae01
ER
1011 (if (< (point) phrase-end)
1012 (setq phrase-beg (point))
1013 (setq phrase-end nil))))
1014
1015 ;; handle ROUTE-ADDRS with real ROUTEs.
1016 ;; If there are multiple @s, then we assume ROUTE-ADDR syntax, and
1017 ;; any % or ! must be semantically meaningless.
1018 ;; TODO: do this processing into canonicalization buffer
1019 (cond (route-addr-:-pos
1020 (setq !-pos nil
1021 %-pos nil
1022 >-pos (copy-marker >-pos)
1023 route-addr-:-pos (copy-marker route-addr-:-pos))
1024 (goto-char >-pos)
1025 (insert-before-markers ?X)
1026 (goto-char (car @-pos))
1027 (while (setq @-pos (cdr @-pos))
154b3e39 1028 (mail-extr-delete-char 1)
72c0ae01
ER
1029 (setq %-pos (cons (point-marker) %-pos))
1030 (insert "%")
1031 (goto-char (1- >-pos))
1032 (save-excursion
1033 (insert-buffer-substring extraction-buffer
1034 (car @-pos) route-addr-:-pos)
1035 (delete-region (car @-pos) route-addr-:-pos))
1036 (or (cdr @-pos)
1037 (setq saved-@-pos (list (point)))))
1038 (setq @-pos saved-@-pos)
1039 (goto-char >-pos)
154b3e39
RS
1040 (mail-extr-delete-char -1)
1041 (mail-extr-nuke-char-at route-addr-:-pos)
1042 (mail-extr-demarkerize route-addr-:-pos)
72c0ae01 1043 (setq route-addr-:-pos nil
154b3e39
RS
1044 >-pos (mail-extr-demarkerize >-pos)
1045 %-pos (mapcar 'mail-extr-demarkerize %-pos))))
72c0ae01
ER
1046
1047 ;; de-listify @-pos
1048 (setq @-pos (car @-pos))
1049
1050 ;; TODO: remove comments in the middle of an address
1051
1052 (set-buffer canonicalization-buffer)
154b3e39
RS
1053 (fundamental-mode)
1054 (kill-all-local-variables)
e8a57935 1055 (buffer-disable-undo canonicalization-buffer)
154b3e39 1056 (set-syntax-table mail-extr-address-syntax-table)
72c0ae01
ER
1057 (setq case-fold-search nil)
1058
1059 (widen)
1060 (erase-buffer)
1061 (insert-buffer-substring extraction-buffer)
1062
1063 (if <-pos
1064 (narrow-to-region (progn
1065 (goto-char (1+ <-pos))
154b3e39 1066 (mail-extr-skip-whitespace-forward)
72c0ae01
ER
1067 (point))
1068 >-pos)
154b3e39
RS
1069 (if (and first-real-pos last-real-pos)
1070 (narrow-to-region first-real-pos last-real-pos)
1071 ;; ****** Oh no! What if the address is completely empty!
1072 ;; *** Is this correct?
1073 (narrow-to-region (point-max) (point-max))
1074 ))
72c0ae01
ER
1075
1076 (and @-pos %-pos
154b3e39 1077 (mail-extr-nuke-outside-range %-pos (point-min) @-pos))
72c0ae01 1078 (and %-pos !-pos
154b3e39 1079 (mail-extr-nuke-outside-range !-pos (point-min) (car %-pos)))
72c0ae01 1080 (and @-pos !-pos (not %-pos)
154b3e39 1081 (mail-extr-nuke-outside-range !-pos (point-min) @-pos))
72c0ae01
ER
1082
1083 ;; Error condition:?? (and %-pos (not @-pos))
154b3e39
RS
1084
1085 ;; WARNING: THIS CODE IS DUPLICATED BELOW.
1086 (cond ((and %-pos
1087 (not @-pos))
1088 (goto-char (car %-pos))
1089 (mail-extr-delete-char 1)
1090 (setq @-pos (point))
1091 (insert "@")
1092 (setq %-pos (cdr %-pos))))
72c0ae01 1093
154b3e39 1094 (if mail-extr-mangle-uucp
72c0ae01
ER
1095 (cond (!-pos
1096 ;; **** I don't understand this save-restriction and the
1097 ;; narrow-to-region inside it. Why did I do that?
1098 (save-restriction
1099 (cond ((and @-pos
154b3e39 1100 mail-extr-@-binds-tighter-than-!)
72c0ae01
ER
1101 (goto-char @-pos)
1102 (setq %-pos (cons (point) %-pos)
1103 @-pos nil)
154b3e39 1104 (mail-extr-delete-char 1)
72c0ae01
ER
1105 (insert "%")
1106 (setq insert-point (point-max)))
154b3e39 1107 (mail-extr-@-binds-tighter-than-!
72c0ae01
ER
1108 (setq insert-point (point-max)))
1109 (%-pos
154b3e39
RS
1110 (setq insert-point (mail-extr-last %-pos)
1111 saved-%-pos (mapcar 'mail-extr-markerize %-pos)
72c0ae01 1112 %-pos nil
154b3e39 1113 @-pos (mail-extr-markerize @-pos)))
72c0ae01
ER
1114 (@-pos
1115 (setq insert-point @-pos)
154b3e39 1116 (setq @-pos (mail-extr-markerize @-pos)))
72c0ae01
ER
1117 (t
1118 (setq insert-point (point-max))))
1119 (narrow-to-region (point-min) insert-point)
1120 (setq saved-!-pos (car !-pos))
1121 (while !-pos
1122 (goto-char (point-max))
1123 (cond ((and (not @-pos)
1124 (not (cdr !-pos)))
1125 (setq @-pos (point))
1126 (insert-before-markers "@ "))
1127 (t
1128 (setq %-pos (cons (point) %-pos))
1129 (insert-before-markers "% ")))
1130 (backward-char 1)
1131 (insert-buffer-substring
1132 (current-buffer)
1133 (if (nth 1 !-pos)
1134 (1+ (nth 1 !-pos))
1135 (point-min))
1136 (car !-pos))
154b3e39 1137 (mail-extr-delete-char 1)
72c0ae01 1138 (or (save-excursion
154b3e39
RS
1139 (mail-extr-safe-move-sexp -1)
1140 (mail-extr-skip-whitespace-backward)
72c0ae01
ER
1141 (eq ?. (preceding-char)))
1142 (insert-before-markers
1143 (if (save-excursion
154b3e39 1144 (mail-extr-skip-whitespace-backward)
72c0ae01
ER
1145 (eq ?. (preceding-char)))
1146 ""
1147 ".")
1148 "uucp"))
1149 (setq !-pos (cdr !-pos))))
1150 (and saved-%-pos
154b3e39
RS
1151 (setq %-pos (append (mapcar 'mail-extr-demarkerize
1152 saved-%-pos)
1153 %-pos)))
1154 (setq @-pos (mail-extr-demarkerize @-pos))
1155 (narrow-to-region (1+ saved-!-pos) (point-max)))))
1156
1157 ;; WARNING: THIS CODE IS DUPLICATED ABOVE.
72c0ae01
ER
1158 (cond ((and %-pos
1159 (not @-pos))
1160 (goto-char (car %-pos))
154b3e39 1161 (mail-extr-delete-char 1)
72c0ae01
ER
1162 (setq @-pos (point))
1163 (insert "@")
1164 (setq %-pos (cdr %-pos))))
154b3e39 1165
72c0ae01
ER
1166 (setq %-pos (nreverse %-pos))
1167 ;; RFC 1034 doesn't approve of this, oh well:
1168 (downcase-region (or (car %-pos) @-pos (point-max)) (point-max))
1169 (cond (%-pos ; implies @-pos valid
1170 (setq temp %-pos)
1171 (catch 'truncated
1172 (while temp
1173 (goto-char (or (nth 1 temp)
1174 @-pos))
154b3e39 1175 (mail-extr-skip-whitespace-backward)
72c0ae01 1176 (save-excursion
154b3e39 1177 (mail-extr-safe-move-sexp -1)
72c0ae01 1178 (setq domain-pos (point))
154b3e39 1179 (mail-extr-skip-whitespace-backward)
72c0ae01
ER
1180 (setq \.-pos (eq ?. (preceding-char))))
1181 (cond ((and \.-pos
154b3e39
RS
1182 ;; #### string consing
1183 (let ((s (intern-soft
1184 (buffer-substring domain-pos (point))
1185 mail-extr-all-top-level-domains)))
1186 (and s (get s 'domain-name))))
72c0ae01
ER
1187 (narrow-to-region (point-min) (point))
1188 (goto-char (car temp))
154b3e39 1189 (mail-extr-delete-char 1)
72c0ae01
ER
1190 (setq @-pos (point))
1191 (setcdr temp nil)
1192 (setq %-pos (delq @-pos %-pos))
1193 (insert "@")
1194 (throw 'truncated t)))
1195 (setq temp (cdr temp))))))
1196 (setq mbox-beg (point-min)
1197 mbox-end (if %-pos (car %-pos)
1198 (or @-pos
1199 (point-max))))
1200
1201 ;; Done canonicalizing address.
1202
1203 (set-buffer extraction-buffer)
1204
154b3e39
RS
1205 ;; Decide what part of the address to search to find the full name.
1206 (cond (
1207 ;; Example: "First M. Last" <fml@foo.bar.dom>
1208 (and phrase-beg
72c0ae01
ER
1209 (eq quote-beg phrase-beg)
1210 (<= quote-end phrase-end))
1211 (narrow-to-region (1+ quote-beg) (1- quote-end))
154b3e39
RS
1212 (mail-extr-undo-backslash-quoting (point-min) (point-max)))
1213
1214 ;; Example: First Last <fml@foo.bar.dom>
72c0ae01
ER
1215 (phrase-beg
1216 (narrow-to-region phrase-beg phrase-end))
154b3e39
RS
1217
1218 ;; Example: fml@foo.bar.dom (First M. Last)
1219 (cbeg
1220 (narrow-to-region (1+ cbeg) (1- cend))
1221 (mail-extr-undo-backslash-quoting (point-min) (point-max))
1222
1223 ;; Deal with spacing problems
1224 (goto-char (point-min))
1225; (cond ((not (search-forward " " nil t))
1226; (goto-char (point-min))
1227; (cond ((search-forward "_" nil t)
1228; ;; Handle the *idiotic* use of underlines as spaces.
1229; ;; Example: fml@foo.bar.dom (First_M._Last)
1230; (goto-char (point-min))
1231; (while (search-forward "_" nil t)
1232; (replace-match " " t)))
1233; ((search-forward "." nil t)
1234; ;; Fix . used as space
1235; ;; Example: danj1@cb.att.com (daniel.jacobson)
1236; (goto-char (point-min))
1237; (while (re-search-forward mail-extr-bad-dot-pattern nil t)
1238; (replace-match "\\1 \\2" t))))))
1239 )
1240
1241 ;; Otherwise we try to get the name from the mailbox portion
1242 ;; of the address.
1243 ;; Example: First_M_Last@foo.bar.dom
72c0ae01
ER
1244 (t
1245 ;; *** Work in canon buffer instead? No, can't. Hmm.
154b3e39
RS
1246 (goto-char (point-max))
1247 (narrow-to-region (point) (point))
72c0ae01
ER
1248 (insert-buffer-substring canonicalization-buffer
1249 mbox-beg mbox-end)
1250 (goto-char (point-min))
154b3e39
RS
1251
1252 ;; Example: First_Last.XXX@foo.bar.dom
1253 (setq \.-ends-name (re-search-forward "[_0-9]" nil t))
1254
72c0ae01 1255 (goto-char (point-min))
154b3e39
RS
1256
1257 (if (not mail-extr-mangle-uucp)
1258 (modify-syntax-entry ?! "w" (syntax-table)))
1259
72c0ae01 1260 (while (progn
154b3e39 1261 (mail-extr-skip-whitespace-forward)
72c0ae01
ER
1262 (not (eobp)))
1263 (setq char (char-after (point)))
1264 (cond
1265 ((eq char ?\")
1266 (setq quote-beg (point))
154b3e39 1267 (or (mail-extr-safe-move-sexp 1)
72c0ae01
ER
1268 ;; TODO: handle this error condition!!!!!
1269 (forward-char 1))
1270 ;; take into account deletions
1271 (setq quote-end (- (point) 2))
1272 (save-excursion
1273 (backward-char 1)
154b3e39 1274 (mail-extr-delete-char 1)
72c0ae01 1275 (goto-char quote-beg)
154b3e39
RS
1276 (mail-extr-delete-char 1))
1277 (mail-extr-undo-backslash-quoting quote-beg quote-end)
1278 (or (eq ?\ (char-after (point)))
72c0ae01 1279 (insert " "))
154b3e39 1280;; (setq mailbox-name-processed-flag t)
72c0ae01
ER
1281 (setq \.-ends-name t))
1282 ((eq char ?.)
154b3e39 1283 (if (memq (char-after (1+ (point))) '(?_ ?=))
72c0ae01
ER
1284 (progn
1285 (forward-char 1)
154b3e39
RS
1286 (mail-extr-delete-char 1)
1287 (insert ?\ ))
72c0ae01
ER
1288 (if \.-ends-name
1289 (narrow-to-region (point-min) (point))
154b3e39
RS
1290 (mail-extr-delete-char 1)
1291 (insert " ")))
1292;; (setq mailbox-name-processed-flag t)
1293 )
72c0ae01 1294 ((memq (char-syntax char) '(?. ?\\))
154b3e39
RS
1295 (mail-extr-delete-char 1)
1296 (insert " ")
1297;; (setq mailbox-name-processed-flag t)
1298 )
72c0ae01
ER
1299 (t
1300 (setq atom-beg (point))
1301 (forward-word 1)
1302 (setq atom-end (point))
154b3e39 1303 (goto-char atom-beg)
72c0ae01
ER
1304 (save-restriction
1305 (narrow-to-region atom-beg atom-end)
154b3e39
RS
1306 (cond
1307
1308 ;; Handle X.400 addresses encoded in RFC-822.
1309 ;; *** Shit! This has to handle the case where it is
1310 ;; *** embedded in a quote too!
1311 ;; *** Shit! The input is being broken up into atoms
1312 ;; *** by periods!
1313 ((looking-at mail-extr-x400-encoded-address-pattern)
1314
1315 ;; Copy the contents of the individual fields that
1316 ;; might hold name data to the beginning.
1317 (mapcar
1318 (function
1319 (lambda (field-pattern)
1320 (cond
1321 ((save-excursion
1322 (re-search-forward field-pattern nil t))
1323 (insert-buffer-substring (current-buffer)
1324 (match-beginning 1)
1325 (match-end 1))
1326 (insert " ")))))
1327 (list mail-extr-x400-encoded-address-given-name-pattern
1328 mail-extr-x400-encoded-address-surname-pattern
1329 mail-extr-x400-encoded-address-full-name-pattern))
1330
1331 ;; Discard the rest, since it contains stuff like
1332 ;; routing information, not part of a name.
1333 (mail-extr-skip-whitespace-backward)
1334 (delete-region (point) (point-max))
1335
1336 ;; Handle periods used for spacing.
1337 (while (re-search-forward mail-extr-bad-dot-pattern nil t)
1338 (replace-match "\\1 \\2" t))
1339
1340;; (setq mailbox-name-processed-flag t)
1341 )
1342
1343 ;; Handle normal addresses.
1344 (t
1345 (goto-char (point-min))
1346 ;; Handle _ and = used for spacing.
1347 (while (re-search-forward "\\([^_=]+\\)[_=]" nil t)
1348 (replace-match "\\1 " t)
1349;; (setq mailbox-name-processed-flag t)
1350 )
1351 (goto-char (point-max))))))))
1352
1353 ;; undo the dirty deed
1354 (if (not mail-extr-mangle-uucp)
1355 (modify-syntax-entry ?! "." (syntax-table)))
1356 ;;
1357 ;; If we derived the name from the mailbox part of the address,
1358 ;; and we only got one word out of it, don't treat that as a
1359 ;; name. "foo@bar" --> (nil "foo@bar"), not ("foo" "foo@bar")
1360 ;; (if (not mailbox-name-processed-flag)
1361 ;; (delete-region (point-min) (point-max)))
1362 ))
72c0ae01 1363
154b3e39 1364 (set-syntax-table mail-extr-address-text-syntax-table)
72c0ae01 1365
154b3e39 1366 (mail-extr-voodoo mbox-beg mbox-end canonicalization-buffer)
72c0ae01
ER
1367 (goto-char (point-min))
1368
72c0ae01
ER
1369 ;; If name is "First Last" and userid is "F?L", then assume
1370 ;; the middle initial is the second letter in the userid.
154b3e39
RS
1371 ;; Initial code by Jamie Zawinski <jwz@lucid.com>
1372 ;; *** Make it work when there's a suffix as well.
1373 (goto-char (point-min))
1374 (cond ((and mail-extr-guess-middle-initial
1375 (not disable-initial-guessing-flag)
1376 (eq 3 (- mbox-end mbox-beg))
72c0ae01
ER
1377 (progn
1378 (goto-char (point-min))
154b3e39 1379 (looking-at mail-extr-two-name-pattern)))
72c0ae01
ER
1380 (setq fi (char-after (match-beginning 0))
1381 li (char-after (match-beginning 3)))
1382 (save-excursion
1383 (set-buffer canonicalization-buffer)
1384 ;; char-equal is ignoring case here, so no need to upcase
1385 ;; or downcase.
1386 (let ((case-fold-search t))
1387 (and (char-equal fi (char-after mbox-beg))
1388 (char-equal li (char-after (1- mbox-end)))
1389 (setq mi (char-after (1+ mbox-beg))))))
1390 (cond ((and mi
1391 ;; TODO: use better table than syntax table
1392 (eq ?w (char-syntax mi)))
1393 (goto-char (match-beginning 3))
1394 (insert (upcase mi) ". ")))))
1395
154b3e39
RS
1396 ;; Nuke name if it is the same as mailbox name.
1397 (let ((buffer-length (- (point-max) (point-min)))
1398 (i 0)
1399 (names-match-flag t))
1400 (cond ((and (> buffer-length 0)
1401 (eq buffer-length (- mbox-end mbox-beg)))
1402 (goto-char (point-max))
1403 (insert-buffer-substring canonicalization-buffer
1404 mbox-beg mbox-end)
1405 (while (and names-match-flag
1406 (< i buffer-length))
1407 (or (eq (downcase (char-after (+ i (point-min))))
1408 (downcase
1409 (char-after (+ i buffer-length (point-min)))))
1410 (setq names-match-flag nil))
1411 (setq i (1+ i)))
1412 (delete-region (+ (point-min) buffer-length) (point-max))
1413 (if names-match-flag
1414 (narrow-to-region (point) (point))))))
1415
1416 ;; Nuke name if it's just one word.
1417 (goto-char (point-min))
1418 (and mail-extr-ignore-single-names
1419 (not (re-search-forward "[- ]" nil t))
1420 (narrow-to-region (point) (point)))
72c0ae01
ER
1421
1422 ;; Result
154b3e39
RS
1423 (list (if (not (= (point-min) (point-max)))
1424 (buffer-string))
72c0ae01
ER
1425 (progn
1426 (set-buffer canonicalization-buffer)
154b3e39
RS
1427 (if (not (= (point-min) (point-max)))
1428 (buffer-string))))
72c0ae01
ER
1429 )))
1430
154b3e39
RS
1431(defun mail-extr-voodoo (mbox-beg mbox-end canonicalization-buffer)
1432 (let ((word-count 0)
1433 (case-fold-search nil)
1434 mixed-case-flag lower-case-flag ;;upper-case-flag
72c0ae01 1435 suffix-flag last-name-comma-flag
154b3e39
RS
1436 ;;cbeg cend
1437 initial
1438 begin-again-flag
1439 drop-this-word-if-trailing-flag
1440 drop-last-word-if-trailing-flag
1441 word-found-flag
1442 this-word-beg last-word-beg
1443 name-beg name-end
1444 name-done-flag
72c0ae01
ER
1445 )
1446 (save-excursion
154b3e39 1447 (set-syntax-table mail-extr-address-text-syntax-table)
72c0ae01 1448
154b3e39 1449 ;; This was moved above.
72c0ae01 1450 ;; Fix . used as space
154b3e39
RS
1451 ;; But it belongs here because it occurs not only as
1452 ;; rypens@reks.uia.ac.be (Piet.Rypens)
1453 ;; but also as
1454 ;; "Piet.Rypens" <rypens@reks.uia.ac.be>
1455 ;;(goto-char (point-min))
1456 ;;(while (re-search-forward mail-extr-bad-dot-pattern nil t)
1457 ;; (replace-match "\\1 \\2" t))
1458
1459 (cond ((not (search-forward " " nil t))
1460 (goto-char (point-min))
1461 (cond ((search-forward "_" nil t)
1462 ;; Handle the *idiotic* use of underlines as spaces.
1463 ;; Example: fml@foo.bar.dom (First_M._Last)
1464 (goto-char (point-min))
1465 (while (search-forward "_" nil t)
1466 (replace-match " " t)))
1467 ((search-forward "." nil t)
1468 ;; Fix . used as space
1469 ;; Example: danj1@cb.att.com (daniel.jacobson)
1470 (goto-char (point-min))
1471 (while (re-search-forward mail-extr-bad-dot-pattern nil t)
1472 (replace-match "\\1 \\2" t))))))
1473
72c0ae01 1474
154b3e39 1475 ;; Loop over the words (and other junk) in the name.
72c0ae01 1476 (goto-char (point-min))
154b3e39
RS
1477 (while (not name-done-flag)
1478
1479 (cond (word-found-flag
1480 ;; Last time through this loop we skipped over a word.
1481 (setq last-word-beg this-word-beg)
1482 (setq drop-last-word-if-trailing-flag
1483 drop-this-word-if-trailing-flag)
1484 (setq word-found-flag nil)))
1485
1486 (cond (begin-again-flag
1487 ;; Last time through the loop we found something that
1488 ;; indicates we should pretend we are beginning again from
1489 ;; the start.
1490 (setq word-count 0)
1491 (setq last-word-beg nil)
1492 (setq drop-last-word-if-trailing-flag nil)
1493 (setq mixed-case-flag nil)
1494 (setq lower-case-flag nil)
1495;; (setq upper-case-flag nil)
1496 (setq begin-again-flag nil)
1497 ))
1498
1499 ;; Initialize for this iteration of the loop.
1500 (mail-extr-skip-whitespace-forward)
1501 (if (eq word-count 0) (narrow-to-region (point) (point-max)))
1502 (setq this-word-beg (point))
1503 (setq drop-this-word-if-trailing-flag nil)
1504
1505 ;; Decide what to do based on what we are looking at.
1506 (cond
1507
1508 ;; Delete title
1509 ((and (eq word-count 0)
1510 (looking-at mail-extr-full-name-prefixes))
1511 (goto-char (match-end 0))
1512 (narrow-to-region (point) (point-max)))
1513
1514 ;; Stop after name suffix
1515 ((and (>= word-count 2)
1516 (looking-at mail-extr-full-name-suffix-pattern))
1517 (mail-extr-skip-whitespace-backward)
1518 (setq suffix-flag (point))
1519 (if (eq ?, (following-char))
1520 (forward-char 1)
1521 (insert ?,))
1522 ;; Enforce at least one space after comma
1523 (or (eq ?\ (following-char))
1524 (insert ?\ ))
1525 (mail-extr-skip-whitespace-forward)
1526 (cond ((memq (following-char) '(?j ?J ?s ?S))
1527 (capitalize-word 1)
1528 (if (eq (following-char) ?.)
1529 (forward-char 1)
1530 (insert ?.)))
1531 (t
1532 (upcase-word 1)))
1533 (setq word-found-flag t)
1534 (setq name-done-flag t))
1535
1536 ;; Handle SCA names
1537 ((looking-at "MKA \\(.+\\)") ; "Mundanely Known As"
1538 (goto-char (match-beginning 1))
1539 (narrow-to-region (point) (point-max))
1540 (setq begin-again-flag t))
1541
1542 ;; Check for initial last name followed by comma
1543 ((and (eq ?, (following-char))
1544 (eq word-count 1))
1545 (forward-char 1)
1546 (setq last-name-comma-flag t)
1547 (or (eq ?\ (following-char))
1548 (insert ?\ )))
1549
1550 ;; Stop before trailing comma-separated comment
1551 ;; THIS CASE MUST BE AFTER THE PRECEDING CASES.
1552 ;; *** This case is redundant???
1553 ;;((eq ?, (following-char))
1554 ;; (setq name-done-flag t))
1555
1556 ;; Delete parenthesized/quoted comment/nickname
1557 ((memq (following-char) '(?\( ?\{ ?\[ ?\" ?\' ?\`))
1558 (setq cbeg (point))
1559 (set-syntax-table mail-extr-address-text-comment-syntax-table)
1560 (cond ((memq (following-char) '(?\' ?\`))
1561 (or (search-forward "'" nil t
1562 (if (eq ?\' (following-char)) 2 1))
1563 (mail-extr-delete-char 1)))
1564 (t
1565 (or (mail-extr-safe-move-sexp 1)
1566 (goto-char (point-max)))))
1567 (set-syntax-table mail-extr-address-text-syntax-table)
1568 (setq cend (point))
72c0ae01 1569 (cond
154b3e39 1570 ;; Handle case of entire name being quoted
72c0ae01 1571 ((and (eq word-count 0)
154b3e39
RS
1572 (looking-at " *\\'")
1573 (>= (- cend cbeg) 2))
1574 (narrow-to-region (1+ cbeg) (1- cend))
1575 (goto-char (point-min)))
1576 (t
1577 ;; Handle case of quoted initial
1578 (if (and (or (= 3 (- cend cbeg))
1579 (and (= 4 (- cend cbeg))
1580 (eq ?. (char-after (+ 2 cbeg)))))
1581 (not (looking-at " *\\'")))
1582 (setq initial (char-after (1+ cbeg)))
1583 (setq initial nil))
1584 (delete-region cbeg cend)
1585 (if initial
1586 (insert initial ". ")))))
1587
1588 ;; Handle & substitution
1589 ((and (or (bobp)
1590 (eq ?\ (preceding-char)))
1591 (looking-at "&\\( \\|\\'\\)"))
1592 (mail-extr-delete-char 1)
1593 (capitalize-region
1594 (point)
1595 (progn
1596 (insert-buffer-substring canonicalization-buffer
1597 mbox-beg mbox-end)
1598 (point)))
1599 (setq disable-initial-guessing-flag t)
1600 (setq word-found-flag t))
1601
1602 ;; Handle *Stupid* VMS date stamps
1603 ((looking-at mail-extr-stupid-vms-date-stamp-pattern)
1604 (replace-match "" t))
1605
1606 ;; Handle Chinese characters.
1607 ((looking-at mail-extr-hz-embedded-gb-encoded-chinese-pattern)
1608 (goto-char (match-end 0))
1609 (setq word-found-flag t))
1610
1611 ;; Skip initial garbage characters.
1612 ;; THIS CASE MUST BE AFTER THE PRECEDING CASES.
1613 ((and (eq word-count 0)
1614 (looking-at mail-extr-leading-garbage))
1615 (goto-char (match-end 0))
1616 ;; *** Skip backward over these???
1617 ;; (skip-chars-backward "& \"")
1618 (narrow-to-region (point) (point-max)))
1619
1620 ;; Various stopping points
1621 ((or
72c0ae01 1622
154b3e39
RS
1623 ;; Stop before ALL CAPS acronyms, if preceded by mixed-case
1624 ;; words. Example: XT-DEM.
1625 (and (>= word-count 2)
1626 mixed-case-flag
1627 (looking-at mail-extr-weird-acronym-pattern)
1628 (not (looking-at mail-extr-roman-numeral-pattern)))
72c0ae01 1629
154b3e39
RS
1630 ;; Stop before trailing alternative address
1631 (looking-at mail-extr-alternative-address-pattern)
72c0ae01 1632
154b3e39
RS
1633 ;; Stop before trailing comment not introduced by comma
1634 ;; THIS CASE MUST BE AFTER AN EARLIER CASE.
1635 (looking-at mail-extr-trailing-comment-start-pattern)
72c0ae01 1636
154b3e39
RS
1637 ;; Stop before telephone numbers
1638 (looking-at mail-extr-telephone-extension-pattern))
1639 (setq name-done-flag t))
1640
1641 ;; Delete ham radio call signs
1642 ((looking-at mail-extr-ham-call-sign-pattern)
1643 (delete-region (match-beginning 0) (match-end 0)))
1644
1645 ;; Fixup initials
1646 ((looking-at mail-extr-initial-pattern)
1647 (or (eq (following-char) (upcase (following-char)))
1648 (setq lower-case-flag t))
1649 (forward-char 1)
1650 (if (eq ?. (following-char))
1651 (forward-char 1)
1652 (insert ?.))
1653 (or (eq ?\ (following-char))
1654 (insert ?\ ))
1655 (setq word-found-flag t))
1656
1657 ;; Handle BITNET LISTSERV list names.
1658 ((and (eq word-count 0)
1659 (looking-at mail-extr-listserv-list-name-pattern))
1660 (narrow-to-region (match-beginning 1) (match-end 1))
1661 (setq word-found-flag t)
1662 (setq name-done-flag t))
1663
1664 ;; Regular name words
1665 ((looking-at mail-extr-name-pattern)
1666 (setq name-beg (point))
1667 (setq name-end (match-end 0))
1668
1669 ;; Certain words will be dropped if they are at the end.
1670 (and (>= word-count 2)
1671 (not lower-case-flag)
1672 (or
1673 ;; A trailing 4-or-more letter lowercase words preceded by
1674 ;; mixed case or uppercase words will be dropped.
1675 (looking-at "[a-z][a-z][a-z][a-z]+[ \t]*\\'")
1676 ;; Drop a trailing word which is terminated with a period.
1677 (eq ?. (char-after (1- name-end))))
1678 (setq drop-this-word-if-trailing-flag t))
1679
1680 ;; Set the flags that indicate whether we have seen a lowercase
1681 ;; word, a mixed case word, and an uppercase word.
1682 (if (re-search-forward "[a-z]" name-end t)
1683 (if (progn
1684 (goto-char name-beg)
1685 (re-search-forward "[A-Z]" name-end t))
1686 (setq mixed-case-flag t)
72c0ae01 1687 (setq lower-case-flag t))
154b3e39
RS
1688;; (setq upper-case-flag t)
1689 )
1690
1691 (goto-char name-end)
1692 (setq word-found-flag t))
72c0ae01 1693
154b3e39
RS
1694 (t
1695 (setq name-done-flag t)
1696 ))
1697
1698 ;; Count any word that we skipped over.
1699 (if word-found-flag
1700 (setq word-count (1+ word-count))))
72c0ae01 1701
154b3e39
RS
1702 ;; If the last thing in the name is 2 or more periods, or one or more
1703 ;; other sentence terminators (but not a single period) then keep them
1704 ;; and the preceeding word. This is for the benefit of whole sentences
1705 ;; in the name field: it's better behavior than dropping the last word
1706 ;; of the sentence...
1707 (if (and (not suffix-flag)
1708 (looking-at "\\(\\.+\\|[?!;:.][?!;:.]+\\|[?!;:][?!;:.]*\\)\\'"))
1709 (goto-char (setq suffix-flag (point-max))))
1710
1711 ;; Drop everything after point and certain trailing words.
1712 (narrow-to-region (point-min)
1713 (or (and drop-last-word-if-trailing-flag
1714 last-word-beg)
1715 (point)))
1716
1717 ;; Xerox's mailers SUCK!!!!!!
1718 ;; We simply refuse to believe that any last name is PARC or ADOC.
1719 ;; If it looks like that is the last name, that there is no meaningful
1720 ;; here at all. Actually I guess it would be best to map patterns
1721 ;; like foo.hoser@xerox.com into foo@hoser.xerox.com, but I don't
1722 ;; actually know that that is what's going on.
72c0ae01
ER
1723 (cond ((not suffix-flag)
1724 (goto-char (point-min))
154b3e39
RS
1725 (let ((case-fold-search t))
1726 (if (looking-at "[-A-Za-z_]+[. ]\\(PARC\\|ADOC\\)\\'")
1727 (erase-buffer)))))
1728
72c0ae01
ER
1729 ;; If last name first put it at end (but before suffix)
1730 (cond (last-name-comma-flag
1731 (goto-char (point-min))
1732 (search-forward ",")
154b3e39 1733 (setq name-end (1- (point)))
72c0ae01 1734 (goto-char (or suffix-flag (point-max)))
154b3e39
RS
1735 (or (eq ?\ (preceding-char))
1736 (insert ?\ ))
1737 (insert-buffer-substring (current-buffer) (point-min) name-end)
1738 (goto-char name-end)
1739 (skip-chars-forward "\t ,")
1740 (narrow-to-region (point) (point-max))))
72c0ae01 1741
154b3e39
RS
1742 ;; Delete leading and trailing junk characters.
1743 ;; *** This is probably completly unneeded now.
1744 ;;(goto-char (point-max))
1745 ;;(skip-chars-backward mail-extr-non-end-name-chars)
1746 ;;(if (eq ?. (following-char))
1747 ;; (forward-char 1))
1748 ;;(narrow-to-region (point)
1749 ;; (progn
1750 ;; (goto-char (point-min))
1751 ;; (skip-chars-forward mail-extr-non-begin-name-chars)
1752 ;; (point)))
72c0ae01
ER
1753
1754 ;; Compress whitespace
1755 (goto-char (point-min))
1756 (while (re-search-forward "[ \t\n]+" nil t)
154b3e39 1757 (replace-match (if (eobp) "" " ") t))
72c0ae01
ER
1758 )))
1759
154b3e39 1760\f
72c0ae01 1761
154b3e39
RS
1762;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1763;;
1764;; Table of top-level domain names.
1765;;
1766;; This is used during address canonicalization; be careful of format changes.
72c0ae01
ER
1767;; Keep in mind that the country abbreviations follow ISO-3166. There is
1768;; a U.S. FIPS that specifies a different set of two-letter country
1769;; abbreviations.
1770
154b3e39
RS
1771(defconst mail-extr-all-top-level-domains
1772 (let ((ob (make-vector 509 0)))
1773 (mapcar
1774 (function
1775 (lambda (x)
1776 (put (intern (downcase (car x)) ob)
1777 'domain-name
1778 (if (nth 2 x)
1779 (format (nth 2 x) (nth 1 x))
1780 (nth 1 x)))))
e1a34b58
RS
1781 '(
1782 ;; ISO 3166 codes:
1783 ("ae" "United Arab Emirates")
1784 ("ag" "Antigua and Barbuda")
1785 ("al" "Albania")
1786 ("ao" "Angola")
1787 ("aq" "Antarctica") ; continent
154b3e39 1788 ("ar" "Argentina" "Argentine Republic")
154b3e39
RS
1789 ("at" "Austria" "The Republic of %s")
1790 ("au" "Australia")
e1a34b58 1791 ("az" "Azerbaijan")
154b3e39 1792 ("bb" "Barbados")
e1a34b58 1793 ("bd" "Bangladesh")
154b3e39 1794 ("be" "Belgium" "The Kingdom of %s")
e1a34b58 1795 ("bf" "Burkina Faso")
154b3e39 1796 ("bg" "Bulgaria")
e1a34b58
RS
1797 ("bh" "Bahrain")
1798 ("bm" "Bermuda")
154b3e39
RS
1799 ("bo" "Bolivia" "Republic of %s")
1800 ("br" "Brazil" "The Federative Republic of %s")
1801 ("bs" "Bahamas")
e1a34b58
RS
1802 ("bw" "Botswana")
1803 ("by" "Belarus")
154b3e39
RS
1804 ("bz" "Belize")
1805 ("ca" "Canada")
e1a34b58 1806 ("cg" "Congo")
154b3e39 1807 ("ch" "Switzerland" "The Swiss Confederation")
e1a34b58 1808 ("ci" "Ivory Coast")
154b3e39 1809 ("cl" "Chile" "The Republic of %s")
e1a34b58 1810 ("cm" "Cameroon") ; In .fr domain
154b3e39 1811 ("cn" "China" "The People's Republic of %s")
e1a34b58 1812 ("co" "Colombia")
154b3e39
RS
1813 ("cr" "Costa Rica" "The Republic of %s")
1814 ("cs" "Czechoslovakia")
e1a34b58
RS
1815 ("cu" "Cuba")
1816 ("cy" "Cyprus")
1817 ("cz" "Czech Republic")
154b3e39
RS
1818 ("de" "Germany")
1819 ("dk" "Denmark")
1820 ("dm" "Dominica")
1821 ("do" "Dominican Republic" "The %s")
e1a34b58 1822 ("dz" "Algeria")
154b3e39 1823 ("ec" "Ecuador" "The Republic of %s")
e1a34b58 1824 ("ee" "Estonia")
154b3e39 1825 ("eg" "Egypt" "The Arab Republic of %s")
e1a34b58 1826 ("er" "Eritrea")
154b3e39
RS
1827 ("es" "Spain" "The Kingdom of %s")
1828 ("fi" "Finland" "The Republic of %s")
1829 ("fj" "Fiji")
e1a34b58 1830 ("fo" "Faroe Islands")
154b3e39 1831 ("fr" "France")
e1a34b58
RS
1832 ("gb" "Great Britain")
1833 ("gd" "Grenada")
1834 ("ge" "Georgia")
1835 ("gf" "Guyana (Fr.)")
1836 ("gp" "Guadeloupe (Fr.)")
154b3e39 1837 ("gr" "Greece" "The Hellenic Republic (%s)")
e1a34b58
RS
1838 ("gt" "Guatemala")
1839 ("gu" "Guam (U.S.)")
154b3e39 1840 ("hk" "Hong Kong")
e1a34b58
RS
1841 ("hn" "Honduras")
1842 ("hr" "Croatia")
1843 ("ht" "Haiti")
449c2cd8 1844 ("hu" "Hungary" "The Hungarian Republic") ;???
e1a34b58 1845 ("id" "Indonesia")
154b3e39
RS
1846 ("ie" "Ireland")
1847 ("il" "Israel" "The State of %s")
1848 ("in" "India" "The Republic of %s")
e1a34b58 1849 ("ir" "Iran")
154b3e39
RS
1850 ("is" "Iceland" "The Republic of %s")
1851 ("it" "Italy" "The Italian Republic")
1852 ("jm" "Jamaica")
1853 ("jp" "Japan")
e1a34b58
RS
1854 ("ke" "Kenya")
1855 ("kn" "St. Kitts, Nevis, and Anguilla")
1856 ("kp" "Korea (North)")
1857 ("kr" "Korea (South)")
1858 ("kw" "Kuwait")
1859 ("kz" "Kazachstan")
1860 ("lb" "Lebanon")
154b3e39 1861 ("lc" "St. Lucia")
e1a34b58 1862 ("li" "Liechtenstein")
154b3e39 1863 ("lk" "Sri Lanka" "The Democratic Socialist Republic of %s")
e1a34b58
RS
1864 ("ls" "Lesotho")
1865 ("lt" "Lithuania")
1866 ("lu" "Luxembourg")
1867 ("lv" "Latvia")
1868 ("ma" "Morocco")
1869 ("md" "Moldova")
1870 ("mg" "Madagascar")
1871 ("mk" "Macedonia")
1872 ("ml" "Mali")
1873 ("mo" "Macau")
1874 ("mt" "Malta")
1875 ("mu" "Mauritius")
1876 ("mw" "Malawi")
154b3e39
RS
1877 ("mx" "Mexico" "The United Mexican States")
1878 ("my" "Malaysia" "%s (changed to Myanmar?)") ;???
e1a34b58 1879 ("mz" "Mozambique")
154b3e39 1880 ("na" "Namibia")
e1a34b58
RS
1881 ("nc" "New Caledonia (Fr.)")
1882 ("ne" "Niger") ; In .fr domain
154b3e39
RS
1883 ("ni" "Nicaragua" "The Republic of %s")
1884 ("nl" "Netherlands" "The Kingdom of the %s")
1885 ("no" "Norway" "The Kingdom of %s")
e1a34b58 1886 ("np" "Nepal") ; Via .in domain
154b3e39 1887 ("nz" "New Zealand")
e1a34b58 1888 ("pa" "Panama")
154b3e39 1889 ("pe" "Peru")
e1a34b58 1890 ("pf" "Polynesia (Fr.)")
154b3e39
RS
1891 ("pg" "Papua New Guinea")
1892 ("ph" "Philippines" "The Republic of the %s")
e1a34b58 1893 ("pk" "Pakistan")
154b3e39 1894 ("pl" "Poland")
e1a34b58 1895 ("pr" "Puerto Rico (U.S.)")
154b3e39
RS
1896 ("pt" "Portugal" "The Portugese Republic")
1897 ("py" "Paraguay")
e1a34b58
RS
1898 ("re" "Reunion (Fr.)") ; In .fr domain
1899 ("ro" "Romania")
1900 ("ru" "Russian Federation")
1901 ("sa" "Saudi Arabia")
1902 ("sc" "Seychelles")
1903 ("sd" "Sudan")
154b3e39
RS
1904 ("se" "Sweden" "The Kingdom of %s")
1905 ("sg" "Singapore" "The Republic of %s")
e1a34b58
RS
1906 ("si" "Slovenia")
1907 ("sj" "Svalbard and Jan Mayen Is.") ; In .no domain
1908 ("sk" "Slovakia" "The Slovak Republic")
1909 ("sn" "Senegal")
154b3e39
RS
1910 ("sr" "Suriname")
1911 ("su" "Soviet Union")
e1a34b58
RS
1912 ("sz" "Swaziland")
1913 ("tg" "Togo")
154b3e39 1914 ("th" "Thailand" "The Kingdom of %s")
e1a34b58 1915 ("tm" "Turkmenistan") ; In .su domain
154b3e39
RS
1916 ("tn" "Tunisia")
1917 ("tr" "Turkey" "The Republic of %s")
1918 ("tt" "Trinidad and Tobago")
1919 ("tw" "Taiwan")
e1a34b58 1920 ("ua" "Ukraine")
44a1338a 1921 ("uk" "United Kingdom" "The %s of Great Britain and Northern Ireland")
e1a34b58 1922 ("us" "United States" "The %s of America")
154b3e39
RS
1923 ("uy" "Uruguay" "The Eastern Republic of %s")
1924 ("vc" "St. Vincent and the Grenadines")
1925 ("ve" "Venezuela" "The Republic of %s")
e1a34b58
RS
1926 ("vi" "Virgin Islands (U.S.)")
1927 ("vn" "Vietnam")
1928 ("vu" "Vanuatu")
154b3e39 1929 ("yu" "Yugoslavia" "The Socialist Federal Republic of %s")
154b3e39
RS
1930 ("za" "South Africa" "The Republic of %s (or Zambia? Zaire?)")
1931 ("zw" "Zimbabwe" "Republic of %s")
e1a34b58
RS
1932 ;; Special top-level domains:
1933 ("arpa" t "Advanced Research Projects Agency (U.S. DoD)")
1934 ("bitnet" t "Because It's Time NET")
1935 ("com" t "Commercial")
1936 ("edu" t "Educational")
1937 ("gov" t "Government (U.S.)")
1938 ("int" t "International (NATO)")
1939 ("mil" t "Military (U.S.)")
1940 ("nato" t "North Atlantic Treaty Organization")
1941 ("net" t "Network")
1942 ("org" t "Non-profit Organization")
1943 ;;("unter-dom" t "? (Ger.)")
1944 ("uucp" t "Unix to Unix CoPy")
1945 ;;("fipnet" nil "?")
154b3e39
RS
1946 ))
1947 ob))
1948
1949;;;###autoload
1950(defun what-domain (domain)
e1cd65fd 1951 "Convert mail domain DOMAIN to the country it corresponds to."
154b3e39
RS
1952 (interactive
1953 (let ((completion-ignore-case t))
1954 (list (completing-read "Domain: "
1955 mail-extr-all-top-level-domains nil t))))
1956 (or (setq domain (intern-soft (downcase domain)
1957 mail-extr-all-top-level-domains))
1958 (error "no such domain"))
1959 (message "%s: %s" (upcase (symbol-name domain)) (get domain 'domain-name)))
72c0ae01
ER
1960
1961\f
154b3e39
RS
1962;(let ((all nil))
1963; (mapatoms #'(lambda (x)
1964; (if (and (boundp x)
1965; (string-match "^mail-extr-" (symbol-name x)))
1966; (setq all (cons x all)))))
1967; (setq all (sort all #'string-lessp))
1968; (cons 'setq
1969; (apply 'nconc (mapcar #'(lambda (x)
1970; (list x (symbol-value x)))
1971; all))))
72c0ae01
ER
1972
1973\f
154b3e39 1974(provide 'mail-extr)
72c0ae01
ER
1975
1976;;; mail-extr.el ends here