Commit | Line | Data |
---|---|---|
b1ccc1ce | 1 | ;;; mail-extr.el --- extract full name and address from RFC 822 mail header -*- coding: utf-8 -*- |
72c0ae01 | 2 | |
e84b4b86 | 3 | ;; Copyright (C) 1991, 1992, 1993, 1994, 1997, 2001, 2002, 2003, 2004, |
2f043267 | 4 | ;; 2005, 2006, 2007, 2008 Free Software Foundation, Inc. |
3a801d0c | 5 | |
72c0ae01 | 6 | ;; Author: Joe Wells <jbw@cs.bu.edu> |
d1782bd8 | 7 | ;; Maintainer: FSF |
72c0ae01 ER |
8 | ;; Keywords: mail |
9 | ||
72c0ae01 ER |
10 | ;; This file is part of GNU Emacs. |
11 | ||
12 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
13 | ;; it under the terms of the GNU General Public License as published by | |
ceaeecb0 | 14 | ;; the Free Software Foundation; either version 3, or (at your option) |
72c0ae01 ER |
15 | ;; any later version. |
16 | ||
17 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
18 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | ;; GNU General Public License for more details. | |
21 | ||
22 | ;; You should have received a copy of the GNU General Public License | |
b578f267 | 23 | ;; along with GNU Emacs; see the file COPYING. If not, write to the |
3a35cf56 LK |
24 | ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
25 | ;; Boston, MA 02110-1301, USA. | |
72c0ae01 ER |
26 | |
27 | ;;; Commentary: | |
28 | ||
154b3e39 RS |
29 | ;; The entry point of this code is |
30 | ;; | |
d1782bd8 | 31 | ;; mail-extract-address-components: (address &optional all) |
6c83d99f | 32 | ;; |
154b3e39 RS |
33 | ;; Given an RFC-822 ADDRESS, extract full name and canonical address. |
34 | ;; Returns a list of the form (FULL-NAME CANONICAL-ADDRESS). | |
35 | ;; If no name can be extracted, FULL-NAME will be nil. | |
6c83d99f | 36 | ;; ADDRESS may be a string or a buffer. If it is a buffer, the visible |
154b3e39 RS |
37 | ;; (narrowed) portion of the buffer will be interpreted as the address. |
38 | ;; (This feature exists so that the clever caller might be able to avoid | |
39 | ;; consing a string.) | |
40 | ;; If ADDRESS contains more than one RFC-822 address, only the first is | |
41 | ;; returned. | |
42 | ;; | |
d1782bd8 KH |
43 | ;; If ALL is non-nil, that means return info about all the addresses |
44 | ;; that are found in ADDRESS. The value is a list of elements of | |
45 | ;; the form (FULL-NAME CANONICAL-ADDRESS), one per address. | |
46 | ;; | |
154b3e39 RS |
47 | ;; This code is more correct (and more heuristic) parser than the code in |
48 | ;; rfc822.el. And despite its size, it's fairly fast. | |
49 | ;; | |
72c0ae01 | 50 | ;; There are two main benefits: |
154b3e39 | 51 | ;; |
72c0ae01 | 52 | ;; 1. Higher probability of getting the correct full name for a human than |
154b3e39 | 53 | ;; any other package we know of. (On the other hand, it will cheerfully |
72c0ae01 ER |
54 | ;; mangle non-human names/comments.) |
55 | ;; 2. Address part is put in a canonical form. | |
154b3e39 RS |
56 | ;; |
57 | ;; The interface is not yet carved in stone; please give us suggestions. | |
58 | ;; | |
59 | ;; We have an extensive test-case collection of funny addresses if you want to | |
72c0ae01 | 60 | ;; work with the code. Developing this code requires frequent testing to |
154b3e39 RS |
61 | ;; make sure you're not breaking functionality. The test cases aren't included |
62 | ;; because they are over 100K. | |
63 | ;; | |
6c83d99f | 64 | ;; If you find an address that mail-extr fails on, please send it to the |
154b3e39 RS |
65 | ;; maintainer along with what you think the correct results should be. We do |
66 | ;; not consider it a bug if mail-extr mangles a comment that does not | |
6c83d99f | 67 | ;; correspond to a real human full name, although we would prefer that |
154b3e39 RS |
68 | ;; mail-extr would return the comment as-is. |
69 | ;; | |
72c0ae01 | 70 | ;; Features: |
154b3e39 | 71 | ;; |
72c0ae01 | 72 | ;; * Full name handling: |
154b3e39 | 73 | ;; |
72c0ae01 ER |
74 | ;; * knows where full names can be found in an address. |
75 | ;; * avoids using empty comments and quoted text. | |
76 | ;; * extracts full names from mailbox names. | |
77 | ;; * recognizes common formats for comments after a full name. | |
78 | ;; * puts a period and a space after each initial. | |
154b3e39 RS |
79 | ;; * understands & referring to the mailbox name, capitalized. |
80 | ;; * strips name prefixes like "Prof.", etc. | |
72c0ae01 ER |
81 | ;; * understands what characters can occur in names (not just letters). |
82 | ;; * figures out middle initial from mailbox name. | |
83 | ;; * removes funny nicknames. | |
84 | ;; * keeps suffixes such as Jr., Sr., III, etc. | |
85 | ;; * reorders "Last, First" type names. | |
154b3e39 | 86 | ;; |
72c0ae01 | 87 | ;; * Address handling: |
154b3e39 | 88 | ;; |
72c0ae01 ER |
89 | ;; * parses rfc822 quoted text, comments, and domain literals. |
90 | ;; * parses rfc822 multi-line headers. | |
91 | ;; * does something reasonable with rfc822 GROUP addresses. | |
92 | ;; * handles many rfc822 noncompliant and garbage addresses. | |
93 | ;; * canonicalizes addresses (after stripping comments/phrases outside <>). | |
94 | ;; * converts ! addresses into .UUCP and %-style addresses. | |
95 | ;; * converts rfc822 ROUTE addresses to %-style addresses. | |
96 | ;; * truncates %-style addresses at leftmost fully qualified domain name. | |
97 | ;; * handles local relative precedence of ! vs. % and @ (untested). | |
154b3e39 | 98 | ;; |
72c0ae01 ER |
99 | ;; It does almost no string creation. It primarily uses the built-in |
100 | ;; parsing routines with the appropriate syntax tables. This should | |
101 | ;; result in greater speed. | |
154b3e39 | 102 | ;; |
72c0ae01 | 103 | ;; TODO: |
154b3e39 | 104 | ;; |
72c0ae01 ER |
105 | ;; * handle all test cases. (This will take forever.) |
106 | ;; * software to pick the correct header to use (eg., "Senders-Name:"). | |
107 | ;; * multiple addresses in the "From:" header (almost all of the necessary | |
108 | ;; code is there). | |
109 | ;; * flag to not treat `,' as an address separator. (This is useful when | |
110 | ;; there is a "From:" header but no "Sender:" header, because then there | |
111 | ;; is only allowed to be one address.) | |
112 | ;; * mailbox name does not necessarily contain full name. | |
113 | ;; * fixing capitalization when it's all upper or lowercase. (Hard!) | |
114 | ;; * some of the domain literal handling is missing. (But I've never even | |
115 | ;; seen one of these in a mail address, so maybe no big deal.) | |
116 | ;; * arrange to have syntax tables byte-compiled. | |
117 | ;; * speed hacks. | |
118 | ;; * delete unused variables. | |
119 | ;; * arrange for testing with different relative precedences of ! vs. @ | |
120 | ;; and %. | |
72c0ae01 ER |
121 | ;; * insert documentation strings! |
122 | ;; * handle X.400-gatewayed addresses according to RFC 1148. | |
123 | ||
6c83d99f JB |
124 | ;;; Change Log: |
125 | ;; | |
154b3e39 RS |
126 | ;; Thu Feb 17 17:57:33 1994 Jamie Zawinski (jwz@lucid.com) |
127 | ;; | |
128 | ;; * merged with jbw's latest version | |
129 | ;; | |
130 | ;; Wed Feb 9 21:56:27 1994 Jamie Zawinski (jwz@lucid.com) | |
131 | ;; | |
132 | ;; * high-bit chars in comments weren't treated as word syntax | |
133 | ;; | |
134 | ;; Sat Feb 5 03:13:40 1994 Jamie Zawinski (jwz@lucid.com) | |
135 | ;; | |
136 | ;; * call replace-match with fixed-case arg | |
137 | ;; | |
138 | ;; Thu Dec 16 21:56:45 1993 Jamie Zawinski (jwz@lucid.com) | |
139 | ;; | |
140 | ;; * some more cleanup, doc, added provide | |
141 | ;; | |
142 | ;; Tue Mar 23 21:23:18 1993 Joe Wells (jbw at csd.bu.edu) | |
6c83d99f | 143 | ;; |
154b3e39 | 144 | ;; * Made mail-full-name-prefixes a user-customizable variable. |
79814626 | 145 | ;; Allow passing the address as a buffer as well as a string. |
154b3e39 | 146 | ;; Allow [ and ] as name characters (Finnish character set). |
6c83d99f | 147 | ;; |
154b3e39 | 148 | ;; Mon Mar 22 21:20:56 1993 Joe Wells (jbw at bigbird.bu.edu) |
6c83d99f | 149 | ;; |
154b3e39 RS |
150 | ;; * Handle "null" addresses. Handle = used for spacing in mailbox |
151 | ;; name. Fix bug in handling of ROUTE-ADDR-type addresses that are | |
152 | ;; missing their brackets. Handle uppercase "JR". Extract full | |
153 | ;; names from X.400 addresses encoded in RFC-822. Fix bug in | |
154 | ;; handling of multiple addresses where first has trailing comment. | |
155 | ;; Handle more kinds of telephone extension lead-ins. | |
6c83d99f | 156 | ;; |
154b3e39 | 157 | ;; Mon Mar 22 20:16:57 1993 Joe Wells (jbw at bigbird.bu.edu) |
6c83d99f | 158 | ;; |
154b3e39 | 159 | ;; * Handle HZ encoding for embedding GB encoded chinese characters. |
6c83d99f | 160 | ;; |
154b3e39 | 161 | ;; Mon Mar 22 00:46:12 1993 Joe Wells (jbw at bigbird.bu.edu) |
6c83d99f | 162 | ;; |
154b3e39 RS |
163 | ;; * Fixed too broad matching of ham radio call signs. Fixed bug in |
164 | ;; handling an unmatched ' in a name string. Enhanced recognition | |
165 | ;; of when . in the mailbox name terminates the name portion. | |
166 | ;; Narrowed conversion of . to space to only the necessary | |
167 | ;; situation. Deal with VMS's stupid date stamps. Handle a unique | |
168 | ;; way of introducing an alternate address. Fixed spacing bug I | |
169 | ;; introduced in switching last name order. Fixed bug in handling | |
170 | ;; address with ! and % but no @. Narrowed the cases in which | |
171 | ;; certain trailing words are discarded. | |
6c83d99f | 172 | ;; |
154b3e39 | 173 | ;; Sun Mar 21 21:41:06 1993 Joe Wells (jbw at bigbird.bu.edu) |
6c83d99f | 174 | ;; |
154b3e39 RS |
175 | ;; * Fixed bugs in handling GROUP addresses. Certain words in the |
176 | ;; middle of a name no longer terminate it. Handle LISTSERV list | |
177 | ;; names. Ignore comment field containing mailbox name. | |
6c83d99f | 178 | ;; |
154b3e39 | 179 | ;; Sun Mar 21 14:39:38 1993 Joe Wells (jbw at bigbird.bu.edu) |
6c83d99f | 180 | ;; |
154b3e39 RS |
181 | ;; * Moved variant-method code back into main function. Handle |
182 | ;; underscores as spaces in comments. Handle leading nickname. Add | |
183 | ;; flag to ignore single-word names. Other changes. | |
6c83d99f | 184 | ;; |
154b3e39 | 185 | ;; Mon Feb 1 22:23:31 1993 Joe Wells (jbw at bigbird.bu.edu) |
6c83d99f | 186 | ;; |
154b3e39 RS |
187 | ;; * Added in changes by Rod Whitby and Jamie Zawinski. This |
188 | ;; includes the flag mail-extr-guess-middle-initial and the fix for | |
6b07c06e RS |
189 | ;; handling multiple addresses correctly. (Whitby just changed |
190 | ;; a > to a <.) | |
6c83d99f | 191 | ;; |
72c0ae01 | 192 | ;; Mon Apr 6 23:59:09 1992 Joe Wells (jbw at bigbird.bu.edu) |
6c83d99f | 193 | ;; |
72c0ae01 | 194 | ;; * Cleaned up some more. Release version 1.0 to world. |
6c83d99f | 195 | ;; |
72c0ae01 | 196 | ;; Sun Apr 5 19:39:08 1992 Joe Wells (jbw at bigbird.bu.edu) |
6c83d99f | 197 | ;; |
72c0ae01 | 198 | ;; * Cleaned up full name extraction extensively. |
6c83d99f | 199 | ;; |
72c0ae01 | 200 | ;; Sun Feb 2 14:45:24 1992 Joe Wells (jbw at bigbird.bu.edu) |
6c83d99f | 201 | ;; |
72c0ae01 ER |
202 | ;; * Total rewrite. Integrated mail-canonicalize-address into |
203 | ;; mail-extract-address-components. Now handles GROUP addresses more | |
204 | ;; or less correctly. Better handling of lots of different cases. | |
6c83d99f | 205 | ;; |
72c0ae01 ER |
206 | ;; Fri Jun 14 19:39:50 1991 |
207 | ;; * Created. | |
208 | ||
209 | ;;; Code: | |
210 | \f | |
72c0ae01 | 211 | |
0b5bb3ec SE |
212 | (defgroup mail-extr nil |
213 | "Extract full name and address from RFC 822 mail header." | |
214 | :prefix "mail-extr-" | |
215 | :group 'mail) | |
216 | ||
154b3e39 RS |
217 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
218 | ;; | |
219 | ;; User configuration variable definitions. | |
220 | ;; | |
221 | ||
0b5bb3ec | 222 | (defcustom mail-extr-guess-middle-initial nil |
154b3e39 RS |
223 | "*Whether to try to guess middle initial from mail address. |
224 | If true, then when we see an address like \"John Smith <jqs@host.com>\" | |
0b5bb3ec SE |
225 | we will assume that \"John Q. Smith\" is the fellow's name." |
226 | :type 'boolean | |
227 | :group 'mail-extr) | |
154b3e39 | 228 | |
2a487c6c | 229 | (defcustom mail-extr-ignore-single-names nil |
154b3e39 RS |
230 | "*Whether to ignore a name that is just a single word. |
231 | If true, then when we see an address like \"Idiot <dumb@stupid.com>\" | |
0b5bb3ec SE |
232 | we will act as though we couldn't find a full name in the address." |
233 | :type 'boolean | |
bf247b6e | 234 | :version "22.1" |
0b5bb3ec | 235 | :group 'mail-extr) |
154b3e39 | 236 | |
67e61e16 EZ |
237 | (defcustom mail-extr-ignore-realname-equals-mailbox-name t |
238 | "*Whether to ignore a name that is equal to the mailbox name. | |
239 | If true, then when the address is like \"Single <single@address.com>\" | |
240 | we will act as though we couldn't find a full name in the address." | |
241 | :type 'boolean | |
242 | :group 'mail-extr) | |
243 | ||
154b3e39 RS |
244 | ;; Matches a leading title that is not part of the name (does not |
245 | ;; contribute to uniquely identifying the person). | |
0b5bb3ec | 246 | (defcustom mail-extr-full-name-prefixes |
154b3e39 RS |
247 | (purecopy |
248 | "\\(Prof\\|D[Rr]\\|Mrs?\\|Rev\\|Rabbi\\|SysOp\\|LCDR\\)\\.?[ \t\n]") | |
249 | "*Matches prefixes to the full name that identify a person's position. | |
250 | These are stripped from the full name because they do not contribute to | |
0b5bb3ec SE |
251 | uniquely identifying the person." |
252 | :type 'regexp | |
253 | :group 'mail-extr) | |
154b3e39 | 254 | |
0b5bb3ec SE |
255 | (defcustom mail-extr-@-binds-tighter-than-! nil |
256 | "*Whether the local mail transport agent looks at ! before @." | |
257 | :type 'boolean | |
258 | :group 'mail-extr) | |
154b3e39 | 259 | |
0b5bb3ec | 260 | (defcustom mail-extr-mangle-uucp nil |
154b3e39 | 261 | "*Whether to throw away information in UUCP addresses |
0b5bb3ec SE |
262 | by translating things like \"foo!bar!baz@host\" into \"baz@bar.UUCP\"." |
263 | :type 'boolean | |
264 | :group 'mail-extr) | |
72c0ae01 ER |
265 | |
266 | ;;---------------------------------------------------------------------- | |
267 | ;; what orderings are meaningful????? | |
268 | ;;(defvar mail-operator-precedence-list '(?! ?% ?@)) | |
269 | ;; Right operand of a % or a @ must be a domain name, period. No other | |
270 | ;; operators allowed. Left operand of a @ is an address relative to that | |
271 | ;; site. | |
272 | ||
273 | ;; Left operand of a ! must be a domain name. Right operand is an | |
274 | ;; arbitrary address. | |
275 | ;;---------------------------------------------------------------------- | |
276 | ||
154b3e39 | 277 | \f |
72c0ae01 | 278 | |
154b3e39 RS |
279 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
280 | ;; | |
281 | ;; Constant definitions. | |
282 | ;; | |
283 | ||
72c0ae01 | 284 | ;; Any character that can occur in a name, not counting characters that |
154b3e39 RS |
285 | ;; separate parts of a multipart name (hyphen and period). |
286 | ;; Yes, there are weird people with digits in their names. | |
287 | ;; You will also notice the consideration for the | |
288 | ;; Swedish/Finnish/Norwegian character set. | |
154b3e39 | 289 | (defconst mail-extr-all-letters-but-separators |
6e66e4c6 | 290 | (purecopy "][[:alnum:]{|}'~`")) |
154b3e39 RS |
291 | |
292 | ;; Any character that can occur in a name in an RFC822 address including | |
293 | ;; the separator (hyphen and possibly period) for multipart names. | |
294 | ;; #### should . be in here? | |
295 | (defconst mail-extr-all-letters | |
296 | (purecopy (concat mail-extr-all-letters-but-separators "---"))) | |
297 | ||
298 | ;; Any character that can start a name. | |
299 | ;; Keep this set as minimal as possible. | |
6e66e4c6 | 300 | (defconst mail-extr-first-letters (purecopy "[:alpha:]")) |
72c0ae01 ER |
301 | |
302 | ;; Any character that can end a name. | |
154b3e39 | 303 | ;; Keep this set as minimal as possible. |
6e66e4c6 | 304 | (defconst mail-extr-last-letters (purecopy "[:alpha:]`'.")) |
72c0ae01 | 305 | |
af604656 | 306 | (defconst mail-extr-leading-garbage "\\W+") |
72c0ae01 | 307 | |
6c83d99f | 308 | ;; (defconst mail-extr-non-name-chars |
154b3e39 RS |
309 | ;; (purecopy (concat "^" mail-extr-all-letters "."))) |
310 | ;; (defconst mail-extr-non-begin-name-chars | |
311 | ;; (purecopy (concat "^" mail-extr-first-letters))) | |
312 | ;; (defconst mail-extr-non-end-name-chars | |
313 | ;; (purecopy (concat "^" mail-extr-last-letters))) | |
72c0ae01 | 314 | |
6c83d99f | 315 | ;; Matches an initial not followed by both a period and a space. |
154b3e39 | 316 | ;; (defconst mail-extr-bad-initials-pattern |
6c83d99f | 317 | ;; (purecopy |
154b3e39 RS |
318 | ;; (format "\\(\\([^%s]\\|\\`\\)[%s]\\)\\(\\.\\([^ ]\\)\\| \\|\\([^%s .]\\)\\|\\'\\)" |
319 | ;; mail-extr-all-letters mail-extr-first-letters mail-extr-all-letters))) | |
72c0ae01 ER |
320 | |
321 | ;; Matches periods used instead of spaces. Must not match the period | |
322 | ;; following an initial. | |
154b3e39 RS |
323 | (defconst mail-extr-bad-dot-pattern |
324 | (purecopy | |
325 | (format "\\([%s][%s]\\)\\.+\\([%s]\\)" | |
326 | mail-extr-all-letters | |
327 | mail-extr-last-letters | |
328 | mail-extr-first-letters))) | |
72c0ae01 ER |
329 | |
330 | ;; Matches an embedded or leading nickname that should be removed. | |
154b3e39 RS |
331 | ;; (defconst mail-extr-nickname-pattern |
332 | ;; (purecopy | |
333 | ;; (format "\\([ .]\\|\\`\\)[\"'`\[\(]\\([ .%s]+\\)[\]\"'\)] " | |
334 | ;; mail-extr-all-letters))) | |
72c0ae01 ER |
335 | |
336 | ;; Matches the occurrence of a generational name suffix, and the last | |
154b3e39 RS |
337 | ;; character of the preceding name. This is important because we want to |
338 | ;; keep such suffixes: they help to uniquely identify the person. | |
339 | ;; *** Perhaps this should be a user-customizable variable. However, the | |
340 | ;; *** regular expression is fairly tricky to alter, so maybe not. | |
341 | (defconst mail-extr-full-name-suffix-pattern | |
342 | (purecopy | |
343 | (format | |
344 | "\\(,? ?\\([JjSs][Rr]\\.?\\|V?I+V?\\)\\)\\([^%s]\\([^%s]\\|\\'\\)\\|\\'\\)" | |
345 | mail-extr-all-letters mail-extr-all-letters))) | |
346 | ||
347 | (defconst mail-extr-roman-numeral-pattern (purecopy "V?I+V?\\b")) | |
72c0ae01 ER |
348 | |
349 | ;; Matches a trailing uppercase (with other characters possible) acronym. | |
350 | ;; Must not match a trailing uppercase last name or trailing initial | |
154b3e39 RS |
351 | (defconst mail-extr-weird-acronym-pattern |
352 | (purecopy "\\([A-Z]+[-_/]\\|[A-Z][A-Z][A-Z]?\\b\\)")) | |
6c83d99f | 353 | |
72c0ae01 | 354 | ;; Matches a mixed-case or lowercase name (not an initial). |
154b3e39 RS |
355 | ;; #### Match Latin1 lower case letters here too? |
356 | ;; (defconst mail-extr-mixed-case-name-pattern | |
357 | ;; (purecopy | |
358 | ;; (format | |
359 | ;; "\\b\\([a-z][%s]*[%s]\\|[%s][%s]*[a-z][%s]*[%s]\\|[%s][%s]*[a-z]\\)" | |
360 | ;; mail-extr-all-letters mail-extr-last-letters | |
361 | ;; mail-extr-first-letters mail-extr-all-letters mail-extr-all-letters | |
362 | ;; mail-extr-last-letters mail-extr-first-letters mail-extr-all-letters))) | |
72c0ae01 ER |
363 | |
364 | ;; Matches a trailing alternative address. | |
154b3e39 | 365 | ;; #### Match Latin1 letters here too? |
6c83d99f | 366 | ;; #### Match _ before @ here too? |
154b3e39 RS |
367 | (defconst mail-extr-alternative-address-pattern |
368 | (purecopy "\\(aka *\\)?[a-zA-Z.]+[!@][a-zA-Z.]")) | |
72c0ae01 ER |
369 | |
370 | ;; Matches a variety of trailing comments not including comma-delimited | |
371 | ;; comments. | |
154b3e39 RS |
372 | (defconst mail-extr-trailing-comment-start-pattern |
373 | (purecopy " [-{]\\|--\\|[+@#></\;]")) | |
72c0ae01 ER |
374 | |
375 | ;; Matches a name (not an initial). | |
376 | ;; This doesn't force a word boundary at the end because sometimes a | |
377 | ;; comment is separated by a `-' with no preceding space. | |
154b3e39 RS |
378 | (defconst mail-extr-name-pattern |
379 | (purecopy (format "\\b[%s][%s]*[%s]" | |
380 | mail-extr-first-letters | |
381 | mail-extr-all-letters | |
382 | mail-extr-last-letters))) | |
72c0ae01 | 383 | |
154b3e39 RS |
384 | (defconst mail-extr-initial-pattern |
385 | (purecopy (format "\\b[%s]\\([. ]\\|\\b\\)" mail-extr-first-letters))) | |
72c0ae01 ER |
386 | |
387 | ;; Matches a single name before a comma. | |
154b3e39 RS |
388 | ;; (defconst mail-extr-last-name-first-pattern |
389 | ;; (purecopy (concat "\\`" mail-extr-name-pattern ","))) | |
72c0ae01 ER |
390 | |
391 | ;; Matches telephone extensions. | |
154b3e39 RS |
392 | (defconst mail-extr-telephone-extension-pattern |
393 | (purecopy | |
394 | "\\(\\([Ee]xt\\|\\|[Tt]ph\\|[Tt]el\\|[Xx]\\).?\\)? *\\+?[0-9][- 0-9]+")) | |
72c0ae01 ER |
395 | |
396 | ;; Matches ham radio call signs. | |
154b3e39 RS |
397 | ;; Help from: Mat Maessen N2NJZ <maessm@rpi.edu>, Mark Feit |
398 | ;; <mark@era.com>, Michael Covington <mcovingt@ai.uga.edu>. | |
399 | ;; Examples: DX504 DX515 K5MRU K8DHK KA9WGN KA9WGN KD3FU KD6EUI KD6HBW | |
400 | ;; KE9TV KF0NV N1API N3FU N3GZE N3IGS N4KCC N7IKQ N9HHU W4YHF W6ANK WA2SUH | |
401 | ;; WB7VZI N2NJZ NR3G KJ4KK AB4UM AL7NI KH6OH WN3KBT N4TMI W1A N0NZO | |
402 | (defconst mail-extr-ham-call-sign-pattern | |
403 | (purecopy "\\b\\(DX[0-9]+\\|[AKNW][A-Z]?[0-9][A-Z][A-Z]?[A-Z]?\\)")) | |
404 | ||
405 | ;; Possible trailing suffixes: "\\(/\\(KT\\|A[AEG]\\|[R0-9]\\)\\)?" | |
406 | ;; /KT == Temporary Technician (has CSC but not "real" license) | |
407 | ;; /AA == Temporary Advanced | |
408 | ;; /AE == Temporary Extra | |
409 | ;; /AG == Temporary General | |
410 | ;; /R == repeater | |
411 | ;; /# == stations operating out of home district | |
412 | ;; I don't include these in the regexp above because I can't imagine | |
413 | ;; anyone putting them with their name in an e-mail address. | |
72c0ae01 ER |
414 | |
415 | ;; Matches normal single-part name | |
154b3e39 RS |
416 | (defconst mail-extr-normal-name-pattern |
417 | (purecopy (format "\\b[%s][%s]+[%s]" | |
418 | mail-extr-first-letters | |
419 | mail-extr-all-letters-but-separators | |
420 | mail-extr-last-letters))) | |
421 | ||
422 | ;; Matches a single word name. | |
423 | ;; (defconst mail-extr-one-name-pattern | |
424 | ;; (purecopy (concat "\\`" mail-extr-normal-name-pattern "\\'"))) | |
6c83d99f | 425 | |
72c0ae01 | 426 | ;; Matches normal two names with missing middle initial |
154b3e39 RS |
427 | ;; The first name is not allowed to have a hyphen because this can cause |
428 | ;; false matches where the "middle initial" is actually the first letter | |
429 | ;; of the second part of the first name. | |
430 | (defconst mail-extr-two-name-pattern | |
431 | (purecopy | |
432 | (concat "\\`\\(" mail-extr-normal-name-pattern | |
433 | "\\|" mail-extr-initial-pattern | |
434 | "\\) +\\(" mail-extr-name-pattern "\\)\\(,\\|\\'\\)"))) | |
435 | ||
436 | (defconst mail-extr-listserv-list-name-pattern | |
437 | (purecopy "Multiple recipients of list \\([-A-Z]+\\)")) | |
438 | ||
439 | (defconst mail-extr-stupid-vms-date-stamp-pattern | |
440 | (purecopy | |
441 | "[0-9][0-9]-[JFMASOND][aepuco][nbrylgptvc]-[0-9][0-9][0-9][0-9] [0-9]+ *")) | |
442 | ||
443 | ;;; HZ -- GB (PRC Chinese character encoding) in ASCII embedding protocol | |
444 | ;; | |
445 | ;; In ASCII mode, a byte is interpreted as an ASCII character, unless a '~' is | |
446 | ;; encountered. The character '~' is an escape character. By convention, it | |
447 | ;; must be immediately followed ONLY by '~', '{' or '\n' (<LF>), with the | |
448 | ;; following special meaning. | |
6c83d99f | 449 | ;; |
154b3e39 RS |
450 | ;; o The escape sequence '~~' is interpreted as a '~'. |
451 | ;; o The escape-to-GB sequence '~{' switches the mode from ASCII to GB. | |
452 | ;; o The escape sequence '~\n' is a line-continuation marker to be consumed | |
453 | ;; with no output produced. | |
6c83d99f | 454 | ;; |
154b3e39 RS |
455 | ;; In GB mode, characters are interpreted two bytes at a time as (pure) GB |
456 | ;; codes until the escape-from-GB code '~}' is read. This code switches the | |
457 | ;; mode from GB back to ASCII. (Note that the escape-from-GB code '~}' | |
458 | ;; ($7E7D) is outside the defined GB range.) | |
459 | (defconst mail-extr-hz-embedded-gb-encoded-chinese-pattern | |
460 | (purecopy "~{\\([^~].\\|~[^\}]\\)+~}")) | |
461 | ||
462 | ;; The leading optional lowercase letters are for a bastardized version of | |
463 | ;; the encoding, as is the optional nature of the final slash. | |
464 | (defconst mail-extr-x400-encoded-address-pattern | |
465 | (purecopy "[a-z]?[a-z]?\\(/[A-Za-z]+\\(\\.[A-Za-z]+\\)?=[^/]+\\)+/?\\'")) | |
466 | ||
467 | (defconst mail-extr-x400-encoded-address-field-pattern-format | |
468 | (purecopy "/%s=\\([^/]+\\)\\(/\\|\\'\\)")) | |
469 | ||
470 | (defconst mail-extr-x400-encoded-address-surname-pattern | |
471 | ;; S stands for Surname (family name). | |
472 | (purecopy | |
473 | (format mail-extr-x400-encoded-address-field-pattern-format "[Ss]"))) | |
474 | ||
475 | (defconst mail-extr-x400-encoded-address-given-name-pattern | |
476 | ;; G stands for Given name. | |
477 | (purecopy | |
478 | (format mail-extr-x400-encoded-address-field-pattern-format "[Gg]"))) | |
479 | ||
480 | (defconst mail-extr-x400-encoded-address-full-name-pattern | |
481 | ;; PN stands for Personal Name. When used it represents the combination | |
482 | ;; of the G and S fields. | |
483 | ;; "The one system I used having this field asked it with the prompt | |
484 | ;; `Personal Name'. But they mapped it into G and S on outgoing real | |
485 | ;; X.400 addresses. As they mapped G and S into PN on incoming..." | |
486 | (purecopy | |
487 | (format mail-extr-x400-encoded-address-field-pattern-format "[Pp][Nn]"))) | |
488 | ||
489 | \f | |
490 | ||
491 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
492 | ;; | |
493 | ;; Syntax tables used for quick parsing. | |
494 | ;; | |
495 | ||
496 | (defconst mail-extr-address-syntax-table (make-syntax-table)) | |
497 | (defconst mail-extr-address-comment-syntax-table (make-syntax-table)) | |
498 | (defconst mail-extr-address-domain-literal-syntax-table (make-syntax-table)) | |
499 | (defconst mail-extr-address-text-comment-syntax-table (make-syntax-table)) | |
500 | (defconst mail-extr-address-text-syntax-table (make-syntax-table)) | |
d980c402 SM |
501 | (mapc |
502 | (lambda (pair) | |
503 | (let ((syntax-table (symbol-value (car pair)))) | |
504 | (dolist (item (cdr pair)) | |
505 | (if (eq 2 (length item)) | |
506 | ;; modifying syntax of a single character | |
507 | (modify-syntax-entry (car item) (car (cdr item)) syntax-table) | |
508 | ;; modifying syntax of a range of characters | |
509 | (let ((char (nth 0 item)) | |
510 | (bound (nth 1 item)) | |
511 | (syntax (nth 2 item))) | |
512 | (while (<= char bound) | |
513 | (modify-syntax-entry char syntax syntax-table) | |
514 | (setq char (1+ char)))))))) | |
154b3e39 RS |
515 | '((mail-extr-address-syntax-table |
516 | (?\000 ?\037 "w") ;control characters | |
517 | (?\040 " ") ;SPC | |
518 | (?! ?~ "w") ;printable characters | |
519 | (?\177 "w") ;DEL | |
72c0ae01 ER |
520 | (?\t " ") |
521 | (?\r " ") | |
522 | (?\n " ") | |
523 | (?\( ".") | |
524 | (?\) ".") | |
525 | (?< ".") | |
526 | (?> ".") | |
527 | (?@ ".") | |
528 | (?, ".") | |
529 | (?\; ".") | |
530 | (?: ".") | |
531 | (?\\ "\\") | |
532 | (?\" "\"") | |
533 | (?. ".") | |
534 | (?\[ ".") | |
535 | (?\] ".") | |
536 | ;; % and ! aren't RFC822 characters, but it is convenient to pretend | |
537 | (?% ".") | |
154b3e39 | 538 | (?! ".") ;; this needs to be word-constituent when not in .UUCP mode |
72c0ae01 | 539 | ) |
154b3e39 RS |
540 | (mail-extr-address-comment-syntax-table |
541 | (?\000 ?\377 "w") | |
542 | (?\040 " ") | |
543 | (?\240 " ") | |
544 | (?\t " ") | |
545 | (?\r " ") | |
546 | (?\n " ") | |
72c0ae01 ER |
547 | (?\( "\(\)") |
548 | (?\) "\)\(") | |
549 | (?\\ "\\")) | |
154b3e39 RS |
550 | (mail-extr-address-domain-literal-syntax-table |
551 | (?\000 ?\377 "w") | |
552 | (?\040 " ") | |
553 | (?\240 " ") | |
554 | (?\t " ") | |
555 | (?\r " ") | |
556 | (?\n " ") | |
72c0ae01 ER |
557 | (?\[ "\(\]") ;?????? |
558 | (?\] "\)\[") ;?????? | |
559 | (?\\ "\\")) | |
154b3e39 RS |
560 | (mail-extr-address-text-comment-syntax-table |
561 | (?\000 ?\377 "w") | |
562 | (?\040 " ") | |
563 | (?\240 " ") | |
564 | (?\t " ") | |
565 | (?\r " ") | |
566 | (?\n " ") | |
72c0ae01 ER |
567 | (?\( "\(\)") |
568 | (?\) "\)\(") | |
569 | (?\[ "\(\]") | |
570 | (?\] "\)\[") | |
571 | (?\{ "\(\}") | |
572 | (?\} "\)\{") | |
573 | (?\\ "\\") | |
574 | (?\" "\"") | |
575 | ;; (?\' "\)\`") | |
576 | ;; (?\` "\(\'") | |
577 | ) | |
154b3e39 RS |
578 | (mail-extr-address-text-syntax-table |
579 | (?\000 ?\177 ".") | |
580 | (?\200 ?\377 "w") | |
581 | (?\040 " ") | |
582 | (?\t " ") | |
583 | (?\r " ") | |
584 | (?\n " ") | |
72c0ae01 ER |
585 | (?A ?Z "w") |
586 | (?a ?z "w") | |
587 | (?- "w") | |
588 | (?\} "w") | |
589 | (?\{ "w") | |
590 | (?| "w") | |
591 | (?\' "w") | |
592 | (?~ "w") | |
593 | (?0 ?9 "w")) | |
594 | )) | |
595 | ||
596 | \f | |
154b3e39 RS |
597 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
598 | ;; | |
72c0ae01 | 599 | ;; Utility functions and macros. |
154b3e39 RS |
600 | ;; |
601 | ||
ff9ed544 DL |
602 | ;; Fixme: There are Latin-1 nbsp below. If such characters should be |
603 | ;; included, this is the wrong thing to do -- it should use syntax (or | |
604 | ;; regexp char classes). | |
605 | ||
7a9ebd0b | 606 | (defsubst mail-extr-skip-whitespace-forward () |
154b3e39 | 607 | ;; v19 fn skip-syntax-forward is more tasteful, but not byte-coded. |
b1ccc1ce | 608 | (skip-chars-forward " \t\n\r ")) |
154b3e39 | 609 | |
7a9ebd0b | 610 | (defsubst mail-extr-skip-whitespace-backward () |
154b3e39 | 611 | ;; v19 fn skip-syntax-backward is more tasteful, but not byte-coded. |
b1ccc1ce | 612 | (skip-chars-backward " \t\n\r ")) |
7a9ebd0b | 613 | |
72c0ae01 | 614 | |
7a9ebd0b SM |
615 | (defsubst mail-extr-undo-backslash-quoting (beg end) |
616 | (save-excursion | |
617 | (save-restriction | |
618 | (narrow-to-region beg end) | |
619 | (goto-char (point-min)) | |
620 | ;; undo \ quoting | |
621 | (while (search-forward "\\" nil t) | |
d980c402 | 622 | (delete-char -1) |
7a9ebd0b SM |
623 | (or (eobp) |
624 | (forward-char 1)))))) | |
625 | ||
626 | (defsubst mail-extr-nuke-char-at (pos) | |
627 | (save-excursion | |
628 | (goto-char pos) | |
d980c402 | 629 | (delete-char 1) |
7a9ebd0b | 630 | (insert ?\ ))) |
154b3e39 RS |
631 | |
632 | (put 'mail-extr-nuke-outside-range | |
633 | 'edebug-form-spec '(symbolp &optional form form atom)) | |
634 | ||
635 | (defmacro mail-extr-nuke-outside-range (list-symbol | |
636 | beg-symbol end-symbol | |
637 | &optional no-replace) | |
d980c402 SM |
638 | "Delete all elements outside BEG..END in LIST. |
639 | LIST-SYMBOL names a variable holding a list of buffer positions | |
640 | BEG-SYMBOL and END-SYMBOL name variables delimiting a range | |
641 | Each element of LIST-SYMBOL which lies outside of the range is | |
642 | deleted from the list. | |
643 | Unless NO-REPLACE is true, at each of the positions in LIST-SYMBOL | |
644 | which lie outside of the range, one character at that position is | |
645 | replaced with a SPC." | |
154b3e39 | 646 | (or (memq no-replace '(t nil)) |
29565a87 | 647 | (error "no-replace must be t or nil, evaluable at macroexpand-time")) |
d980c402 | 648 | `(let ((temp ,list-symbol) |
154b3e39 | 649 | ch) |
72c0ae01 | 650 | (while temp |
154b3e39 | 651 | (setq ch (car temp)) |
d980c402 SM |
652 | (when (or (> ch ,end-symbol) |
653 | (< ch ,beg-symbol)) | |
654 | ,@(if no-replace | |
655 | nil | |
656 | `((mail-extr-nuke-char-at ch))) | |
657 | (setcar temp nil)) | |
72c0ae01 | 658 | (setq temp (cdr temp))) |
d980c402 | 659 | (setq ,list-symbol (delq nil ,list-symbol)))) |
72c0ae01 | 660 | |
154b3e39 RS |
661 | (defun mail-extr-demarkerize (marker) |
662 | ;; if arg is a marker, destroys the marker, then returns the old value. | |
663 | ;; otherwise returns the arg. | |
664 | (if (markerp marker) | |
665 | (let ((temp (marker-position marker))) | |
666 | (set-marker marker nil) | |
667 | temp) | |
668 | marker)) | |
669 | ||
670 | (defun mail-extr-markerize (pos) | |
671 | ;; coerces pos to a marker if non-nil. | |
672 | (if (or (markerp pos) (null pos)) | |
673 | pos | |
674 | (copy-marker pos))) | |
675 | ||
7a9ebd0b | 676 | (defsubst mail-extr-safe-move-sexp (arg) |
154b3e39 | 677 | ;; Safely skip over one balanced sexp, if there is one. Return t if success. |
7a9ebd0b SM |
678 | (condition-case error |
679 | (progn | |
680 | (goto-char (or (scan-sexps (point) arg) (point))) | |
681 | t) | |
682 | (error | |
683 | ;; #### kludge kludge kludge kludge kludge kludge kludge !!! | |
684 | (if (string-equal (nth 1 error) "Unbalanced parentheses") | |
685 | nil | |
686 | (while t | |
687 | (signal (car error) (cdr error))))))) | |
72c0ae01 | 688 | \f |
154b3e39 RS |
689 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
690 | ;; | |
72c0ae01 | 691 | ;; The main function to grind addresses |
154b3e39 RS |
692 | ;; |
693 | ||
694 | (defvar disable-initial-guessing-flag) ; dynamic assignment | |
695 | (defvar cbeg) ; dynamic assignment | |
696 | (defvar cend) ; dynamic assignment | |
4d96f7e7 | 697 | (defvar mail-extr-all-top-level-domains) ; Defined below. |
72c0ae01 | 698 | |
154b3e39 | 699 | ;;;###autoload |
d1782bd8 KH |
700 | (defun mail-extract-address-components (address &optional all) |
701 | "Given an RFC-822 address ADDRESS, extract full name and canonical address. | |
e8f4db18 RS |
702 | Returns a list of the form (FULL-NAME CANONICAL-ADDRESS). If no |
703 | name can be extracted, FULL-NAME will be nil. Also see | |
704 | `mail-extr-ignore-single-names' and | |
705 | `mail-extr-ignore-realname-equals-mailbox-name'. | |
d1782bd8 KH |
706 | |
707 | If the optional argument ALL is non-nil, then ADDRESS can contain zero | |
708 | or more recipients, separated by commas, and we return a list of | |
709 | the form ((FULL-NAME CANONICAL-ADDRESS) ...) with one element for | |
710 | each recipient. If ALL is nil, then if ADDRESS contains more than | |
711 | one recipients, all but the first is ignored. | |
712 | ||
7a9ebd0b | 713 | ADDRESS may be a string or a buffer. If it is a buffer, the visible |
2a487c6c SJ |
714 | \(narrowed) portion of the buffer will be interpreted as the address. |
715 | \(This feature exists so that the clever caller might be able to avoid | |
716 | consing a string.)" | |
154b3e39 RS |
717 | (let ((canonicalization-buffer (get-buffer-create " *canonical address*")) |
718 | (extraction-buffer (get-buffer-create " *extract address components*")) | |
d1782bd8 KH |
719 | value-list) |
720 | ||
7a9ebd0b | 721 | (with-current-buffer (get-buffer-create extraction-buffer) |
154b3e39 | 722 | (fundamental-mode) |
e8a57935 | 723 | (buffer-disable-undo extraction-buffer) |
154b3e39 | 724 | (set-syntax-table mail-extr-address-syntax-table) |
72c0ae01 ER |
725 | (widen) |
726 | (erase-buffer) | |
727 | (setq case-fold-search nil) | |
6c83d99f | 728 | |
72c0ae01 ER |
729 | ;; Insert extra space at beginning to allow later replacement with < |
730 | ;; without having to move markers. | |
154b3e39 RS |
731 | (insert ?\ ) |
732 | ||
733 | ;; Insert the address itself. | |
734 | (cond ((stringp address) | |
735 | (insert address)) | |
736 | ((bufferp address) | |
737 | (insert-buffer-substring address)) | |
738 | (t | |
29565a87 | 739 | (error "Invalid address: %s" address))) |
03007ccc RS |
740 | |
741 | (set-text-properties (point-min) (point-max) nil) | |
d1782bd8 | 742 | |
7a9ebd0b | 743 | (with-current-buffer (get-buffer-create canonicalization-buffer) |
d1782bd8 KH |
744 | (fundamental-mode) |
745 | (buffer-disable-undo canonicalization-buffer) | |
d1782bd8 KH |
746 | (setq case-fold-search nil)) |
747 | ||
6c83d99f | 748 | |
72c0ae01 ER |
749 | ;; Unfold multiple lines. |
750 | (goto-char (point-min)) | |
751 | (while (re-search-forward "\\([^\\]\\(\\\\\\\\\\)*\\)\n[ \t]" nil t) | |
752 | (replace-match "\\1 " t)) | |
6c83d99f | 753 | |
d1782bd8 KH |
754 | ;; Loop over addresses until we have as many as we want. |
755 | (while (and (or all (null value-list)) | |
756 | (progn (goto-char (point-min)) | |
757 | (skip-chars-forward " \t") | |
758 | (not (eobp)))) | |
759 | (let (char | |
760 | end-of-address | |
f98b752f | 761 | <-pos >-pos @-pos colon-pos comma-pos !-pos %-pos \;-pos |
d1782bd8 KH |
762 | group-:-pos group-\;-pos route-addr-:-pos |
763 | record-pos-symbol | |
764 | first-real-pos last-real-pos | |
765 | phrase-beg phrase-end | |
766 | cbeg cend ; dynamically set from -voodoo | |
767 | quote-beg quote-end | |
768 | atom-beg atom-end | |
769 | mbox-beg mbox-end | |
770 | \.-ends-name | |
771 | temp | |
772 | ;; name-suffix | |
773 | fi mi li ; first, middle, last initial | |
774 | saved-%-pos saved-!-pos saved-@-pos | |
775 | domain-pos \.-pos insert-point | |
776 | ;; mailbox-name-processed-flag | |
777 | disable-initial-guessing-flag) ; dynamically set from -voodoo | |
778 | ||
7a9ebd0b | 779 | (set-syntax-table mail-extr-address-syntax-table) |
d1782bd8 KH |
780 | (goto-char (point-min)) |
781 | ||
782 | ;; Insert extra space at beginning to allow later replacement with < | |
783 | ;; without having to move markers. | |
784 | (or (eq (following-char) ?\ ) | |
785 | (insert ?\ )) | |
786 | ||
787 | ;; First pass grabs useful information about address. | |
788 | (while (progn | |
789 | (mail-extr-skip-whitespace-forward) | |
790 | (not (eobp))) | |
791 | (setq char (char-after (point))) | |
792 | (or first-real-pos | |
793 | (if (not (eq char ?\()) | |
794 | (setq first-real-pos (point)))) | |
795 | (cond | |
796 | ;; comment | |
797 | ((eq char ?\() | |
798 | (set-syntax-table mail-extr-address-comment-syntax-table) | |
799 | ;; only record the first non-empty comment's position | |
800 | (if (and (not cbeg) | |
801 | (save-excursion | |
802 | (forward-char 1) | |
803 | (mail-extr-skip-whitespace-forward) | |
804 | (not (eq ?\) (char-after (point)))))) | |
805 | (setq cbeg (point))) | |
806 | ;; TODO: don't record if unbalanced | |
807 | (or (mail-extr-safe-move-sexp 1) | |
808 | (forward-char 1)) | |
809 | (set-syntax-table mail-extr-address-syntax-table) | |
810 | (if (and cbeg | |
811 | (not cend)) | |
812 | (setq cend (point)))) | |
813 | ;; quoted text | |
814 | ((eq char ?\") | |
815 | ;; only record the first non-empty quote's position | |
816 | (if (and (not quote-beg) | |
817 | (save-excursion | |
818 | (forward-char 1) | |
819 | (mail-extr-skip-whitespace-forward) | |
820 | (not (eq ?\" (char-after (point)))))) | |
821 | (setq quote-beg (point))) | |
822 | ;; TODO: don't record if unbalanced | |
823 | (or (mail-extr-safe-move-sexp 1) | |
824 | (forward-char 1)) | |
825 | (if (and quote-beg | |
826 | (not quote-end)) | |
827 | (setq quote-end (point)))) | |
828 | ;; domain literals | |
829 | ((eq char ?\[) | |
830 | (set-syntax-table mail-extr-address-domain-literal-syntax-table) | |
831 | (or (mail-extr-safe-move-sexp 1) | |
832 | (forward-char 1)) | |
833 | (set-syntax-table mail-extr-address-syntax-table)) | |
834 | ;; commas delimit addresses when outside < > pairs. | |
835 | ((and (eq char ?,) | |
836 | (or (and (null <-pos) | |
837 | ;; Handle ROUTE-ADDR address that is missing its <. | |
838 | (not (eq ?@ (char-after (1+ (point)))))) | |
839 | (and >-pos | |
840 | ;; handle weird munged addresses | |
841 | ;; BUG FIX: This test was reversed. Thanks to the | |
842 | ;; brilliant Rod Whitby <rwhitby@research.canon.oz.au> | |
843 | ;; for discovering this! | |
7a9ebd0b | 844 | (< (car (last <-pos)) (car >-pos))))) |
d1782bd8 KH |
845 | ;; The argument contains more than one address. |
846 | ;; Temporarily hide everything after this one. | |
7a9ebd0b | 847 | (setq end-of-address (copy-marker (1+ (point)) t)) |
d1782bd8 | 848 | (narrow-to-region (point-min) (1+ (point))) |
7a9ebd0b | 849 | (delete-char 1) |
d1782bd8 KH |
850 | (setq char ?\() ; HAVE I NO SHAME?? |
851 | ) | |
852 | ;; record the position of various interesting chars, determine | |
4f9c00e2 | 853 | ;; validity later. |
d1782bd8 KH |
854 | ((setq record-pos-symbol |
855 | (cdr (assq char | |
856 | '((?< . <-pos) (?> . >-pos) (?@ . @-pos) | |
f98b752f | 857 | (?: . colon-pos) (?, . comma-pos) (?! . !-pos) |
d1782bd8 KH |
858 | (?% . %-pos) (?\; . \;-pos))))) |
859 | (set record-pos-symbol | |
860 | (cons (point) (symbol-value record-pos-symbol))) | |
72c0ae01 | 861 | (forward-char 1)) |
d1782bd8 | 862 | ((eq char ?.) |
72c0ae01 | 863 | (forward-char 1)) |
d1782bd8 | 864 | ((memq char '( |
4f9c00e2 | 865 | ;; comment terminator invalid |
d1782bd8 | 866 | ?\) |
4f9c00e2 | 867 | ;; domain literal terminator invalid |
d1782bd8 KH |
868 | ?\] |
869 | ;; \ allowed only within quoted strings, | |
870 | ;; domain literals, and comments | |
871 | ?\\ | |
872 | )) | |
873 | (mail-extr-nuke-char-at (point)) | |
72c0ae01 | 874 | (forward-char 1)) |
d1782bd8 | 875 | (t |
0330231b CY |
876 | ;; Do `(forward-word 1)', recognizing non-ASCII characters |
877 | ;; except Latin-1 nbsp as words. | |
878 | (while (progn | |
879 | (skip-chars-forward "^\000-\177 ") | |
880 | (and (not (eobp)) | |
881 | (eq ?w (char-syntax (char-after))) | |
882 | (progn | |
883 | (forward-word 1) | |
884 | (and (not (eobp)) | |
885 | (> (char-after) ?\177) | |
886 | (not (eq (char-after) ? ))))))))) | |
d1782bd8 KH |
887 | (or (eq char ?\() |
888 | ;; At the end of first address of a multiple address header. | |
889 | (and (eq char ?,) | |
890 | (eobp)) | |
891 | (setq last-real-pos (point)))) | |
892 | ||
893 | ;; Use only the leftmost <, if any. Replace all others with spaces. | |
894 | (while (cdr <-pos) | |
895 | (mail-extr-nuke-char-at (car <-pos)) | |
896 | (setq <-pos (cdr <-pos))) | |
897 | ||
898 | ;; Use only the rightmost >, if any. Replace all others with spaces. | |
899 | (while (cdr >-pos) | |
900 | (mail-extr-nuke-char-at (nth 1 >-pos)) | |
901 | (setcdr >-pos (nthcdr 2 >-pos))) | |
902 | ||
903 | ;; If multiple @s and a :, but no < and >, insert around buffer. | |
904 | ;; Example: @foo.bar.dom,@xxx.yyy.zzz:mailbox@aaa.bbb.ccc | |
905 | ;; This commonly happens on the UUCP "From " line. Ugh. | |
d980c402 | 906 | (when (and (> (length @-pos) 1) |
f98b752f | 907 | (eq 1 (length colon-pos)) ;TODO: check if between last two @s |
d1782bd8 KH |
908 | (not \;-pos) |
909 | (not <-pos)) | |
d980c402 SM |
910 | (goto-char (point-min)) |
911 | (delete-char 1) | |
912 | (setq <-pos (list (point))) | |
913 | (insert ?<)) | |
d1782bd8 KH |
914 | |
915 | ;; If < but no >, insert > in rightmost possible position | |
d980c402 SM |
916 | (when (and <-pos (null >-pos)) |
917 | (goto-char (point-max)) | |
918 | (setq >-pos (list (point))) | |
919 | (insert ?>)) | |
d1782bd8 KH |
920 | |
921 | ;; If > but no <, replace > with space. | |
d980c402 SM |
922 | (when (and >-pos (null <-pos)) |
923 | (mail-extr-nuke-char-at (car >-pos)) | |
924 | (setq >-pos nil)) | |
d1782bd8 KH |
925 | |
926 | ;; Turn >-pos and <-pos into non-lists | |
927 | (setq >-pos (car >-pos) | |
928 | <-pos (car <-pos)) | |
929 | ||
930 | ;; Trim other punctuation lists of items outside < > pair to handle | |
931 | ;; stupid MTAs. | |
d980c402 SM |
932 | (when <-pos ; don't need to check >-pos also |
933 | ;; handle bozo software that violates RFC 822 by sticking | |
934 | ;; punctuation marks outside of a < > pair | |
935 | (mail-extr-nuke-outside-range @-pos <-pos >-pos t) | |
936 | ;; RFC 822 says nothing about these two outside < >, but | |
937 | ;; remove those positions from the lists to make things | |
938 | ;; easier. | |
939 | (mail-extr-nuke-outside-range !-pos <-pos >-pos t) | |
940 | (mail-extr-nuke-outside-range %-pos <-pos >-pos t)) | |
d1782bd8 KH |
941 | |
942 | ;; Check for : that indicates GROUP list and for : part of | |
943 | ;; ROUTE-ADDR spec. | |
944 | ;; Can't possibly be more than two :. Nuke any extra. | |
f98b752f KH |
945 | (while colon-pos |
946 | (setq temp (car colon-pos) | |
947 | colon-pos (cdr colon-pos)) | |
d1782bd8 KH |
948 | (cond ((and <-pos >-pos |
949 | (> temp <-pos) | |
950 | (< temp >-pos)) | |
951 | (if (or route-addr-:-pos | |
952 | (< (length @-pos) 2) | |
953 | (> temp (car @-pos)) | |
954 | (< temp (nth 1 @-pos))) | |
955 | (mail-extr-nuke-char-at temp) | |
956 | (setq route-addr-:-pos temp))) | |
957 | ((or (not <-pos) | |
958 | (and <-pos | |
959 | (< temp <-pos))) | |
960 | (setq group-:-pos temp)))) | |
961 | ||
962 | ;; Nuke any ; that is in or to the left of a < > pair or to the left | |
963 | ;; of a GROUP starting :. Also, there may only be one ;. | |
964 | (while \;-pos | |
965 | (setq temp (car \;-pos) | |
966 | \;-pos (cdr \;-pos)) | |
967 | (cond ((and <-pos >-pos | |
968 | (> temp <-pos) | |
969 | (< temp >-pos)) | |
970 | (mail-extr-nuke-char-at temp)) | |
971 | ((and (or (not group-:-pos) | |
972 | (> temp group-:-pos)) | |
973 | (not group-\;-pos)) | |
974 | (setq group-\;-pos temp)))) | |
975 | ||
976 | ;; Nuke unmatched GROUP syntax characters. | |
d980c402 SM |
977 | (when (and group-:-pos (not group-\;-pos)) |
978 | ;; *** Do I really need to erase it? | |
979 | (mail-extr-nuke-char-at group-:-pos) | |
980 | (setq group-:-pos nil)) | |
981 | (when (and group-\;-pos (not group-:-pos)) | |
982 | ;; *** Do I really need to erase it? | |
983 | (mail-extr-nuke-char-at group-\;-pos) | |
984 | (setq group-\;-pos nil)) | |
d1782bd8 KH |
985 | |
986 | ;; Handle junk like ";@host.company.dom" that sendmail adds. | |
987 | ;; **** should I remember comment positions? | |
d980c402 | 988 | (when group-\;-pos |
d1782bd8 KH |
989 | ;; this is fine for now |
990 | (mail-extr-nuke-outside-range !-pos group-:-pos group-\;-pos t) | |
991 | (mail-extr-nuke-outside-range @-pos group-:-pos group-\;-pos t) | |
992 | (mail-extr-nuke-outside-range %-pos group-:-pos group-\;-pos t) | |
993 | (mail-extr-nuke-outside-range comma-pos group-:-pos group-\;-pos t) | |
994 | (and last-real-pos | |
995 | (> last-real-pos (1+ group-\;-pos)) | |
996 | (setq last-real-pos (1+ group-\;-pos))) | |
997 | ;; *** This may be wrong: | |
998 | (and cend | |
999 | (> cend group-\;-pos) | |
1000 | (setq cend nil | |
1001 | cbeg nil)) | |
1002 | (and quote-end | |
1003 | (> quote-end group-\;-pos) | |
1004 | (setq quote-end nil | |
1005 | quote-beg nil)) | |
1006 | ;; This was both wrong and unnecessary: | |
1007 | ;;(narrow-to-region (point-min) group-\;-pos) | |
1008 | ||
1009 | ;; *** The entire handling of GROUP addresses seems rather lame. | |
1010 | ;; *** It deserves a complete rethink, except that these addresses | |
1011 | ;; *** are hardly ever seen. | |
d980c402 | 1012 | ) |
d1782bd8 KH |
1013 | |
1014 | ;; Any commas must be between < and : of ROUTE-ADDR. Nuke any | |
1015 | ;; others. | |
6c83d99f | 1016 | ;; Hell, go ahead and nuke all of the commas. |
d1782bd8 KH |
1017 | ;; **** This will cause problems when we start handling commas in |
1018 | ;; the PHRASE part .... no it won't ... yes it will ... ????? | |
1019 | (mail-extr-nuke-outside-range comma-pos 1 1) | |
1020 | ||
1021 | ;; can only have multiple @s inside < >. The fact that some MTAs | |
1022 | ;; put de-bracketed ROUTE-ADDRs in the UUCP-style "From " line is | |
1023 | ;; handled above. | |
1024 | ||
1025 | ;; Locate PHRASE part of ROUTE-ADDR. | |
d980c402 SM |
1026 | (when <-pos |
1027 | (goto-char <-pos) | |
1028 | (mail-extr-skip-whitespace-backward) | |
1029 | (setq phrase-end (point)) | |
1030 | (goto-char (or ;;group-:-pos | |
1031 | (point-min))) | |
1032 | (mail-extr-skip-whitespace-forward) | |
1033 | (if (< (point) phrase-end) | |
1034 | (setq phrase-beg (point)) | |
1035 | (setq phrase-end nil))) | |
d1782bd8 KH |
1036 | |
1037 | ;; handle ROUTE-ADDRS with real ROUTEs. | |
1038 | ;; If there are multiple @s, then we assume ROUTE-ADDR syntax, and | |
1039 | ;; any % or ! must be semantically meaningless. | |
1040 | ;; TODO: do this processing into canonicalization buffer | |
d980c402 SM |
1041 | (when route-addr-:-pos |
1042 | (setq !-pos nil | |
1043 | %-pos nil | |
1044 | >-pos (copy-marker >-pos) | |
1045 | route-addr-:-pos (copy-marker route-addr-:-pos)) | |
1046 | (goto-char >-pos) | |
1047 | (insert-before-markers ?X) | |
1048 | (goto-char (car @-pos)) | |
1049 | (while (setq @-pos (cdr @-pos)) | |
1050 | (delete-char 1) | |
1051 | (setq %-pos (cons (point-marker) %-pos)) | |
1052 | (insert "%") | |
1053 | (goto-char (1- >-pos)) | |
1054 | (save-excursion | |
1055 | (insert-buffer-substring extraction-buffer | |
1056 | (car @-pos) route-addr-:-pos) | |
1057 | (delete-region (car @-pos) route-addr-:-pos)) | |
1058 | (or (cdr @-pos) | |
1059 | (setq saved-@-pos (list (point))))) | |
1060 | (setq @-pos saved-@-pos) | |
1061 | (goto-char >-pos) | |
1062 | (delete-char -1) | |
1063 | (mail-extr-nuke-char-at route-addr-:-pos) | |
1064 | (mail-extr-demarkerize route-addr-:-pos) | |
1065 | (setq route-addr-:-pos nil | |
1066 | >-pos (mail-extr-demarkerize >-pos) | |
1067 | %-pos (mapcar 'mail-extr-demarkerize %-pos))) | |
d1782bd8 KH |
1068 | |
1069 | ;; de-listify @-pos | |
1070 | (setq @-pos (car @-pos)) | |
1071 | ||
1072 | ;; TODO: remove comments in the middle of an address | |
1073 | ||
d980c402 | 1074 | (with-current-buffer canonicalization-buffer |
d1782bd8 KH |
1075 | (widen) |
1076 | (erase-buffer) | |
1077 | (insert-buffer-substring extraction-buffer) | |
1078 | ||
1079 | (if <-pos | |
1080 | (narrow-to-region (progn | |
1081 | (goto-char (1+ <-pos)) | |
1082 | (mail-extr-skip-whitespace-forward) | |
1083 | (point)) | |
1084 | >-pos) | |
1085 | (if (and first-real-pos last-real-pos) | |
1086 | (narrow-to-region first-real-pos last-real-pos) | |
1087 | ;; ****** Oh no! What if the address is completely empty! | |
1088 | ;; *** Is this correct? | |
d980c402 | 1089 | (narrow-to-region (point-max) (point-max)))) |
d1782bd8 KH |
1090 | |
1091 | (and @-pos %-pos | |
1092 | (mail-extr-nuke-outside-range %-pos (point-min) @-pos)) | |
1093 | (and %-pos !-pos | |
1094 | (mail-extr-nuke-outside-range !-pos (point-min) (car %-pos))) | |
1095 | (and @-pos !-pos (not %-pos) | |
1096 | (mail-extr-nuke-outside-range !-pos (point-min) @-pos)) | |
1097 | ||
1098 | ;; Error condition:?? (and %-pos (not @-pos)) | |
1099 | ||
1100 | ;; WARNING: THIS CODE IS DUPLICATED BELOW. | |
d980c402 SM |
1101 | (when (and %-pos (not @-pos)) |
1102 | (goto-char (car %-pos)) | |
1103 | (delete-char 1) | |
1104 | (setq @-pos (point)) | |
1105 | (insert "@") | |
1106 | (setq %-pos (cdr %-pos))) | |
1107 | ||
1108 | (when (and mail-extr-mangle-uucp !-pos) | |
1109 | ;; **** I don't understand this save-restriction and the | |
1110 | ;; narrow-to-region inside it. Why did I do that? | |
1111 | (save-restriction | |
1112 | (cond ((and @-pos | |
1113 | mail-extr-@-binds-tighter-than-!) | |
1114 | (goto-char @-pos) | |
1115 | (setq %-pos (cons (point) %-pos) | |
1116 | @-pos nil) | |
1117 | (delete-char 1) | |
1118 | (insert "%") | |
1119 | (setq insert-point (point-max))) | |
1120 | (mail-extr-@-binds-tighter-than-! | |
1121 | (setq insert-point (point-max))) | |
1122 | (%-pos | |
1123 | (setq insert-point (car (last %-pos)) | |
1124 | saved-%-pos (mapcar 'mail-extr-markerize %-pos) | |
1125 | %-pos nil | |
1126 | @-pos (mail-extr-markerize @-pos))) | |
1127 | (@-pos | |
1128 | (setq insert-point @-pos) | |
1129 | (setq @-pos (mail-extr-markerize @-pos))) | |
1130 | (t | |
1131 | (setq insert-point (point-max)))) | |
1132 | (narrow-to-region (point-min) insert-point) | |
1133 | (setq saved-!-pos (car !-pos)) | |
1134 | (while !-pos | |
1135 | (goto-char (point-max)) | |
1136 | (cond ((and (not @-pos) | |
1137 | (not (cdr !-pos))) | |
1138 | (setq @-pos (point)) | |
1139 | (insert-before-markers "@ ")) | |
1140 | (t | |
1141 | (setq %-pos (cons (point) %-pos)) | |
1142 | (insert-before-markers "% "))) | |
1143 | (backward-char 1) | |
1144 | (insert-buffer-substring | |
1145 | (current-buffer) | |
1146 | (if (nth 1 !-pos) | |
1147 | (1+ (nth 1 !-pos)) | |
1148 | (point-min)) | |
1149 | (car !-pos)) | |
1150 | (delete-char 1) | |
1151 | (or (save-excursion | |
1152 | (mail-extr-safe-move-sexp -1) | |
1153 | (mail-extr-skip-whitespace-backward) | |
1154 | (eq ?. (preceding-char))) | |
1155 | (insert-before-markers | |
1156 | (if (save-excursion | |
1157 | (mail-extr-skip-whitespace-backward) | |
1158 | (eq ?. (preceding-char))) | |
1159 | "" | |
1160 | ".") | |
1161 | "uucp")) | |
1162 | (setq !-pos (cdr !-pos)))) | |
1163 | (and saved-%-pos | |
1164 | (setq %-pos (append (mapcar 'mail-extr-demarkerize | |
1165 | saved-%-pos) | |
1166 | %-pos))) | |
1167 | (setq @-pos (mail-extr-demarkerize @-pos)) | |
1168 | (narrow-to-region (1+ saved-!-pos) (point-max))) | |
d1782bd8 KH |
1169 | |
1170 | ;; WARNING: THIS CODE IS DUPLICATED ABOVE. | |
d980c402 SM |
1171 | (when (and %-pos (not @-pos)) |
1172 | (goto-char (car %-pos)) | |
1173 | (delete-char 1) | |
1174 | (setq @-pos (point)) | |
1175 | (insert "@") | |
1176 | (setq %-pos (cdr %-pos))) | |
1177 | ||
1178 | (when (setq %-pos (nreverse %-pos)) ; implies @-pos valid | |
1179 | (setq temp %-pos) | |
1180 | (catch 'truncated | |
1181 | (while temp | |
1182 | (goto-char (or (nth 1 temp) | |
1183 | @-pos)) | |
1184 | (mail-extr-skip-whitespace-backward) | |
1185 | (save-excursion | |
1186 | (mail-extr-safe-move-sexp -1) | |
1187 | (setq domain-pos (point)) | |
1188 | (mail-extr-skip-whitespace-backward) | |
1189 | (setq \.-pos (eq ?. (preceding-char)))) | |
1190 | (when (and \.-pos | |
1191 | ;; #### string consing | |
1192 | (let ((s (intern-soft | |
1193 | (buffer-substring domain-pos (point)) | |
1194 | mail-extr-all-top-level-domains))) | |
1195 | (and s (get s 'domain-name)))) | |
1196 | (narrow-to-region (point-min) (point)) | |
1197 | (goto-char (car temp)) | |
1198 | (delete-char 1) | |
1199 | (setq @-pos (point)) | |
1200 | (setcdr temp nil) | |
1201 | (setq %-pos (delq @-pos %-pos)) | |
1202 | (insert "@") | |
1203 | (throw 'truncated t)) | |
1204 | (setq temp (cdr temp))))) | |
d1782bd8 KH |
1205 | (setq mbox-beg (point-min) |
1206 | mbox-end (if %-pos (car %-pos) | |
1207 | (or @-pos | |
d980c402 SM |
1208 | (point-max)))) |
1209 | ||
1210 | (when @-pos | |
1211 | ;; Make the domain-name part lowercase since it's case | |
1212 | ;; insensitive anyway. | |
1213 | (downcase-region (1+ @-pos) (point-max)))) | |
d1782bd8 KH |
1214 | |
1215 | ;; Done canonicalizing address. | |
1216 | ;; We are now back in extraction-buffer. | |
1217 | ||
1218 | ;; Decide what part of the address to search to find the full name. | |
1219 | (cond ( | |
1220 | ;; Example: "First M. Last" <fml@foo.bar.dom> | |
1221 | (and phrase-beg | |
1222 | (eq quote-beg phrase-beg) | |
1223 | (<= quote-end phrase-end)) | |
1224 | (narrow-to-region (1+ quote-beg) (1- quote-end)) | |
1225 | (mail-extr-undo-backslash-quoting (point-min) (point-max))) | |
1226 | ||
1227 | ;; Example: First Last <fml@foo.bar.dom> | |
1228 | (phrase-beg | |
1229 | (narrow-to-region phrase-beg phrase-end)) | |
1230 | ||
1231 | ;; Example: fml@foo.bar.dom (First M. Last) | |
1232 | (cbeg | |
1233 | (narrow-to-region (1+ cbeg) (1- cend)) | |
1234 | (mail-extr-undo-backslash-quoting (point-min) (point-max)) | |
1235 | ||
1236 | ;; Deal with spacing problems | |
1237 | (goto-char (point-min)) | |
1238 | ;;; (cond ((not (search-forward " " nil t)) | |
1239 | ;;; (goto-char (point-min)) | |
1240 | ;;; (cond ((search-forward "_" nil t) | |
1241 | ;;; ;; Handle the *idiotic* use of underlines as spaces. | |
1242 | ;;; ;; Example: fml@foo.bar.dom (First_M._Last) | |
1243 | ;;; (goto-char (point-min)) | |
1244 | ;;; (while (search-forward "_" nil t) | |
1245 | ;;; (replace-match " " t))) | |
1246 | ;;; ((search-forward "." nil t) | |
1247 | ;;; ;; Fix . used as space | |
1248 | ;;; ;; Example: danj1@cb.att.com (daniel.jacobson) | |
1249 | ;;; (goto-char (point-min)) | |
1250 | ;;; (while (re-search-forward mail-extr-bad-dot-pattern nil t) | |
1251 | ;;; (replace-match "\\1 \\2" t)))))) | |
154b3e39 | 1252 | ) |
d1782bd8 KH |
1253 | |
1254 | ;; Otherwise we try to get the name from the mailbox portion | |
1255 | ;; of the address. | |
1256 | ;; Example: First_M_Last@foo.bar.dom | |
72c0ae01 | 1257 | (t |
d1782bd8 KH |
1258 | ;; *** Work in canon buffer instead? No, can't. Hmm. |
1259 | (goto-char (point-max)) | |
1260 | (narrow-to-region (point) (point)) | |
1261 | (insert-buffer-substring canonicalization-buffer | |
1262 | mbox-beg mbox-end) | |
1263 | (goto-char (point-min)) | |
1264 | ||
1265 | ;; Example: First_Last.XXX@foo.bar.dom | |
1266 | (setq \.-ends-name (re-search-forward "[_0-9]" nil t)) | |
1267 | ||
1268 | (goto-char (point-min)) | |
1269 | ||
1270 | (if (not mail-extr-mangle-uucp) | |
1271 | (modify-syntax-entry ?! "w" (syntax-table))) | |
1272 | ||
1273 | (while (progn | |
1274 | (mail-extr-skip-whitespace-forward) | |
1275 | (not (eobp))) | |
1276 | (setq char (char-after (point))) | |
154b3e39 | 1277 | (cond |
d1782bd8 KH |
1278 | ((eq char ?\") |
1279 | (setq quote-beg (point)) | |
1280 | (or (mail-extr-safe-move-sexp 1) | |
1281 | ;; TODO: handle this error condition!!!!! | |
1282 | (forward-char 1)) | |
1283 | ;; take into account deletions | |
1284 | (setq quote-end (- (point) 2)) | |
1285 | (save-excursion | |
1286 | (backward-char 1) | |
d980c402 | 1287 | (delete-char 1) |
d1782bd8 KH |
1288 | (goto-char quote-beg) |
1289 | (or (eobp) | |
d980c402 | 1290 | (delete-char 1))) |
d1782bd8 KH |
1291 | (mail-extr-undo-backslash-quoting quote-beg quote-end) |
1292 | (or (eq ?\ (char-after (point))) | |
1293 | (insert " ")) | |
1294 | ;; (setq mailbox-name-processed-flag t) | |
1295 | (setq \.-ends-name t)) | |
1296 | ((eq char ?.) | |
1297 | (if (memq (char-after (1+ (point))) '(?_ ?=)) | |
1298 | (progn | |
1299 | (forward-char 1) | |
d980c402 | 1300 | (delete-char 1) |
d1782bd8 KH |
1301 | (insert ?\ )) |
1302 | (if \.-ends-name | |
1303 | (narrow-to-region (point-min) (point)) | |
d980c402 | 1304 | (delete-char 1) |
d1782bd8 KH |
1305 | (insert " "))) |
1306 | ;; (setq mailbox-name-processed-flag t) | |
1307 | ) | |
1308 | ((memq (char-syntax char) '(?. ?\\)) | |
d980c402 | 1309 | (delete-char 1) |
d1782bd8 KH |
1310 | (insert " ") |
1311 | ;; (setq mailbox-name-processed-flag t) | |
154b3e39 | 1312 | ) |
154b3e39 | 1313 | (t |
d1782bd8 KH |
1314 | (setq atom-beg (point)) |
1315 | (forward-word 1) | |
1316 | (setq atom-end (point)) | |
1317 | (goto-char atom-beg) | |
1318 | (save-restriction | |
1319 | (narrow-to-region atom-beg atom-end) | |
1320 | (cond | |
1321 | ||
1322 | ;; Handle X.400 addresses encoded in RFC-822. | |
1323 | ;; *** Shit! This has to handle the case where it is | |
1324 | ;; *** embedded in a quote too! | |
1325 | ;; *** Shit! The input is being broken up into atoms | |
1326 | ;; *** by periods! | |
1327 | ((looking-at mail-extr-x400-encoded-address-pattern) | |
1328 | ||
1329 | ;; Copy the contents of the individual fields that | |
1330 | ;; might hold name data to the beginning. | |
d980c402 SM |
1331 | (mapc |
1332 | (lambda (field-pattern) | |
1333 | (when | |
1334 | (save-excursion | |
1335 | (re-search-forward field-pattern nil t)) | |
1336 | (insert-buffer-substring (current-buffer) | |
1337 | (match-beginning 1) | |
1338 | (match-end 1)) | |
1339 | (insert " "))) | |
d1782bd8 KH |
1340 | (list mail-extr-x400-encoded-address-given-name-pattern |
1341 | mail-extr-x400-encoded-address-surname-pattern | |
1342 | mail-extr-x400-encoded-address-full-name-pattern)) | |
1343 | ||
1344 | ;; Discard the rest, since it contains stuff like | |
1345 | ;; routing information, not part of a name. | |
1346 | (mail-extr-skip-whitespace-backward) | |
1347 | (delete-region (point) (point-max)) | |
1348 | ||
1349 | ;; Handle periods used for spacing. | |
1350 | (while (re-search-forward mail-extr-bad-dot-pattern nil t) | |
1351 | (replace-match "\\1 \\2" t)) | |
1352 | ||
1353 | ;; (setq mailbox-name-processed-flag t) | |
1354 | ) | |
1355 | ||
1356 | ;; Handle normal addresses. | |
1357 | (t | |
1358 | (goto-char (point-min)) | |
1359 | ;; Handle _ and = used for spacing. | |
1360 | (while (re-search-forward "\\([^_=]+\\)[_=]" nil t) | |
1361 | (replace-match "\\1 " t) | |
1362 | ;; (setq mailbox-name-processed-flag t) | |
1363 | ) | |
1364 | (goto-char (point-max)))))))) | |
1365 | ||
1366 | ;; undo the dirty deed | |
1367 | (if (not mail-extr-mangle-uucp) | |
1368 | (modify-syntax-entry ?! "." (syntax-table))) | |
1369 | ;; | |
1370 | ;; If we derived the name from the mailbox part of the address, | |
1371 | ;; and we only got one word out of it, don't treat that as a | |
1372 | ;; name. "foo@bar" --> (nil "foo@bar"), not ("foo" "foo@bar") | |
1373 | ;; (if (not mailbox-name-processed-flag) | |
1374 | ;; (delete-region (point-min) (point-max))) | |
1375 | )) | |
1376 | ||
1377 | (set-syntax-table mail-extr-address-text-syntax-table) | |
1378 | ||
1379 | (mail-extr-voodoo mbox-beg mbox-end canonicalization-buffer) | |
1380 | (goto-char (point-min)) | |
1381 | ||
1382 | ;; If name is "First Last" and userid is "F?L", then assume | |
1383 | ;; the middle initial is the second letter in the userid. | |
1384 | ;; Initial code by Jamie Zawinski <jwz@lucid.com> | |
1385 | ;; *** Make it work when there's a suffix as well. | |
1386 | (goto-char (point-min)) | |
d980c402 SM |
1387 | (when (and mail-extr-guess-middle-initial |
1388 | (not disable-initial-guessing-flag) | |
1389 | (eq 3 (- mbox-end mbox-beg)) | |
1390 | (progn | |
1391 | (goto-char (point-min)) | |
1392 | (looking-at mail-extr-two-name-pattern))) | |
1393 | (setq fi (char-after (match-beginning 0)) | |
1394 | li (char-after (match-beginning 3))) | |
1395 | (with-current-buffer canonicalization-buffer | |
1396 | ;; char-equal is ignoring case here, so no need to upcase | |
1397 | ;; or downcase. | |
1398 | (let ((case-fold-search t)) | |
1399 | (and (char-equal fi (char-after mbox-beg)) | |
1400 | (char-equal li (char-after (1- mbox-end))) | |
1401 | (setq mi (char-after (1+ mbox-beg)))))) | |
1402 | (when (and mi | |
1403 | ;; TODO: use better table than syntax table | |
1404 | (eq ?w (char-syntax mi))) | |
1405 | (goto-char (match-beginning 3)) | |
1406 | (insert (upcase mi) ". "))) | |
d1782bd8 KH |
1407 | |
1408 | ;; Nuke name if it is the same as mailbox name. | |
1409 | (let ((buffer-length (- (point-max) (point-min))) | |
1410 | (i 0) | |
1411 | (names-match-flag t)) | |
d980c402 SM |
1412 | (when (and (> buffer-length 0) |
1413 | (eq buffer-length (- mbox-end mbox-beg))) | |
1414 | (goto-char (point-max)) | |
1415 | (insert-buffer-substring canonicalization-buffer | |
1416 | mbox-beg mbox-end) | |
1417 | (while (and names-match-flag | |
1418 | (< i buffer-length)) | |
1419 | (or (eq (downcase (char-after (+ i (point-min)))) | |
1420 | (downcase | |
1421 | (char-after (+ i buffer-length (point-min))))) | |
1422 | (setq names-match-flag nil)) | |
1423 | (setq i (1+ i))) | |
1424 | (delete-region (+ (point-min) buffer-length) (point-max)) | |
bf247b6e | 1425 | (and names-match-flag |
67e61e16 EZ |
1426 | mail-extr-ignore-realname-equals-mailbox-name |
1427 | (narrow-to-region (point) (point))))) | |
d1782bd8 KH |
1428 | |
1429 | ;; Nuke name if it's just one word. | |
1430 | (goto-char (point-min)) | |
1431 | (and mail-extr-ignore-single-names | |
1432 | (not (re-search-forward "[- ]" nil t)) | |
1433 | (narrow-to-region (point) (point))) | |
1434 | ||
1435 | ;; Record the result | |
1436 | (setq value-list | |
1437 | (cons (list (if (not (= (point-min) (point-max))) | |
1438 | (buffer-string)) | |
d980c402 | 1439 | (with-current-buffer canonicalization-buffer |
d1782bd8 KH |
1440 | (if (not (= (point-min) (point-max))) |
1441 | (buffer-string)))) | |
1442 | value-list)) | |
1443 | ||
1444 | ;; Unless one address is all we wanted, | |
1445 | ;; delete this one from extraction-buffer | |
1446 | ;; and get ready to extract the next address. | |
1447 | (when all | |
1448 | (if end-of-address | |
1449 | (narrow-to-region 1 end-of-address) | |
1450 | (widen)) | |
1451 | (delete-region (point-min) (point-max)) | |
1452 | (widen)) | |
1453 | ))) | |
1454 | (if all (nreverse value-list) (car value-list)) | |
1455 | )) | |
72c0ae01 | 1456 | |
2d8a5449 KH |
1457 | (defcustom mail-extr-disable-voodoo "\\cj" |
1458 | "*If it is a regexp, names matching it will never be modified. | |
1459 | If it is neither nil nor a string, modifying of names will never take | |
1460 | place. It affects how `mail-extract-address-components' works." | |
1461 | :type '(choice (regexp :size 0) | |
1462 | (const :tag "Always enabled" nil) | |
1463 | (const :tag "Always disabled" t)) | |
1464 | :group 'mail-extr) | |
1465 | ||
154b3e39 | 1466 | (defun mail-extr-voodoo (mbox-beg mbox-end canonicalization-buffer) |
2d8a5449 KH |
1467 | (unless (and mail-extr-disable-voodoo |
1468 | (or (not (stringp mail-extr-disable-voodoo)) | |
1469 | (progn | |
1470 | (goto-char (point-min)) | |
1471 | (re-search-forward mail-extr-disable-voodoo nil t)))) | |
1472 | (let ((word-count 0) | |
1473 | (case-fold-search nil) | |
1474 | mixed-case-flag lower-case-flag ;;upper-case-flag | |
1475 | suffix-flag last-name-comma-flag | |
1476 | ;;cbeg cend | |
1477 | initial | |
1478 | begin-again-flag | |
1479 | drop-this-word-if-trailing-flag | |
1480 | drop-last-word-if-trailing-flag | |
1481 | word-found-flag | |
1482 | this-word-beg last-word-beg | |
1483 | name-beg name-end | |
1484 | name-done-flag | |
1485 | ) | |
1486 | (save-excursion | |
1487 | (set-syntax-table mail-extr-address-text-syntax-table) | |
1488 | ||
1489 | ;; Get rid of comments. | |
d980c402 | 1490 | (goto-char (point-min)) |
2d8a5449 KH |
1491 | (while (not (eobp)) |
1492 | ;; Initialize for this iteration of the loop. | |
1493 | (skip-chars-forward "^({[\"'`") | |
1494 | (let ((cbeg (point))) | |
1495 | (set-syntax-table mail-extr-address-text-comment-syntax-table) | |
1496 | (if (memq (following-char) '(?\' ?\`)) | |
1497 | (search-forward "'" nil 'move | |
1498 | (if (eq ?\' (following-char)) 2 1)) | |
1499 | (or (mail-extr-safe-move-sexp 1) | |
1500 | (goto-char (point-max)))) | |
1501 | (set-syntax-table mail-extr-address-text-syntax-table) | |
1502 | (when (eq (char-after cbeg) ?\() | |
1503 | ;; Delete the comment itself. | |
1504 | (delete-region cbeg (point)) | |
1505 | ;; Canonicalize whitespace where the comment was. | |
1506 | (skip-chars-backward " \t") | |
1507 | (if (looking-at "\\([ \t]+$\\|[ \t]+,\\)") | |
1508 | (replace-match "") | |
1509 | (setq cbeg (point)) | |
1510 | (skip-chars-forward " \t") | |
1511 | (if (bobp) | |
1512 | (delete-region (point) cbeg) | |
1513 | (just-one-space)))))) | |
1514 | ||
1515 | ;; This was moved above. | |
1516 | ;; Fix . used as space | |
1517 | ;; But it belongs here because it occurs not only as | |
1518 | ;; rypens@reks.uia.ac.be (Piet.Rypens) | |
1519 | ;; but also as | |
1520 | ;; "Piet.Rypens" <rypens@reks.uia.ac.be> | |
1521 | ;;(goto-char (point-min)) | |
1522 | ;;(while (re-search-forward mail-extr-bad-dot-pattern nil t) | |
1523 | ;; (replace-match "\\1 \\2" t)) | |
1524 | ||
1525 | (unless (search-forward " " nil t) | |
1526 | (goto-char (point-min)) | |
1527 | (cond ((search-forward "_" nil t) | |
1528 | ;; Handle the *idiotic* use of underlines as spaces. | |
1529 | ;; Example: fml@foo.bar.dom (First_M._Last) | |
1530 | (goto-char (point-min)) | |
1531 | (while (search-forward "_" nil t) | |
1532 | (replace-match " " t))) | |
1533 | ((search-forward "." nil t) | |
1534 | ;; Fix . used as space | |
1535 | ;; Example: danj1@cb.att.com (daniel.jacobson) | |
1536 | (goto-char (point-min)) | |
1537 | (while (re-search-forward mail-extr-bad-dot-pattern nil t) | |
1538 | (replace-match "\\1 \\2" t))))) | |
6c83d99f | 1539 | |
2d8a5449 KH |
1540 | ;; Loop over the words (and other junk) in the name. |
1541 | (goto-char (point-min)) | |
1542 | (while (not name-done-flag) | |
1543 | ||
1544 | (when word-found-flag | |
1545 | ;; Last time through this loop we skipped over a word. | |
1546 | (setq last-word-beg this-word-beg) | |
1547 | (setq drop-last-word-if-trailing-flag | |
1548 | drop-this-word-if-trailing-flag) | |
1549 | (setq word-found-flag nil)) | |
1550 | ||
1551 | (when begin-again-flag | |
1552 | ;; Last time through the loop we found something that | |
1553 | ;; indicates we should pretend we are beginning again from | |
1554 | ;; the start. | |
1555 | (setq word-count 0) | |
1556 | (setq last-word-beg nil) | |
1557 | (setq drop-last-word-if-trailing-flag nil) | |
1558 | (setq mixed-case-flag nil) | |
1559 | (setq lower-case-flag nil) | |
1560 | ;; (setq upper-case-flag nil) | |
1561 | (setq begin-again-flag nil)) | |
1562 | ||
1563 | ;; Initialize for this iteration of the loop. | |
154b3e39 | 1564 | (mail-extr-skip-whitespace-forward) |
2d8a5449 KH |
1565 | (if (eq word-count 0) (narrow-to-region (point) (point-max))) |
1566 | (setq this-word-beg (point)) | |
1567 | (setq drop-this-word-if-trailing-flag nil) | |
1568 | ||
1569 | ;; Decide what to do based on what we are looking at. | |
72c0ae01 | 1570 | (cond |
2d8a5449 KH |
1571 | |
1572 | ;; Delete title | |
72c0ae01 | 1573 | ((and (eq word-count 0) |
2d8a5449 KH |
1574 | (looking-at mail-extr-full-name-prefixes)) |
1575 | (goto-char (match-end 0)) | |
1576 | (narrow-to-region (point) (point-max))) | |
6c83d99f | 1577 | |
2d8a5449 KH |
1578 | ;; Stop after name suffix |
1579 | ((and (>= word-count 2) | |
1580 | (looking-at mail-extr-full-name-suffix-pattern)) | |
1581 | (mail-extr-skip-whitespace-backward) | |
1582 | (setq suffix-flag (point)) | |
1583 | (if (eq ?, (following-char)) | |
1584 | (forward-char 1) | |
1585 | (insert ?,)) | |
1586 | ;; Enforce at least one space after comma | |
1587 | (or (eq ?\ (following-char)) | |
1588 | (insert ?\ )) | |
1589 | (mail-extr-skip-whitespace-forward) | |
1590 | (cond ((memq (following-char) '(?j ?J ?s ?S)) | |
1591 | (capitalize-word 1) | |
1592 | (if (eq (following-char) ?.) | |
1593 | (forward-char 1) | |
1594 | (insert ?.))) | |
1595 | (t | |
1596 | (upcase-word 1))) | |
1597 | (setq word-found-flag t) | |
1598 | (setq name-done-flag t)) | |
1599 | ||
1600 | ;; Handle SCA names | |
1601 | ((looking-at "MKA \\(.+\\)") ; "Mundanely Known As" | |
1602 | (goto-char (match-beginning 1)) | |
1603 | (narrow-to-region (point) (point-max)) | |
1604 | (setq begin-again-flag t)) | |
1605 | ||
1606 | ;; Check for initial last name followed by comma | |
1607 | ((and (eq ?, (following-char)) | |
1608 | (eq word-count 1)) | |
1609 | (forward-char 1) | |
1610 | (setq last-name-comma-flag t) | |
1611 | (or (eq ?\ (following-char)) | |
1612 | (insert ?\ ))) | |
1613 | ||
1614 | ;; Stop before trailing comma-separated comment | |
1615 | ;; THIS CASE MUST BE AFTER THE PRECEDING CASES. | |
1616 | ;; *** This case is redundant??? | |
1617 | ;;((eq ?, (following-char)) | |
1618 | ;; (setq name-done-flag t)) | |
1619 | ||
1620 | ;; Delete parenthesized/quoted comment/nickname | |
1621 | ((memq (following-char) '(?\( ?\{ ?\[ ?\" ?\' ?\`)) | |
1622 | (setq cbeg (point)) | |
1623 | (set-syntax-table mail-extr-address-text-comment-syntax-table) | |
1624 | (cond ((memq (following-char) '(?\' ?\`)) | |
1625 | (or (search-forward "'" nil t | |
1626 | (if (eq ?\' (following-char)) 2 1)) | |
1627 | (delete-char 1))) | |
1628 | (t | |
1629 | (or (mail-extr-safe-move-sexp 1) | |
1630 | (goto-char (point-max))))) | |
1631 | (set-syntax-table mail-extr-address-text-syntax-table) | |
1632 | (setq cend (point)) | |
1633 | (cond | |
1634 | ;; Handle case of entire name being quoted | |
1635 | ((and (eq word-count 0) | |
1636 | (looking-at " *\\'") | |
1637 | (>= (- cend cbeg) 2)) | |
1638 | (narrow-to-region (1+ cbeg) (1- cend)) | |
1639 | (goto-char (point-min))) | |
1640 | (t | |
1641 | ;; Handle case of quoted initial | |
1642 | (if (and (or (= 3 (- cend cbeg)) | |
1643 | (and (= 4 (- cend cbeg)) | |
1644 | (eq ?. (char-after (+ 2 cbeg))))) | |
1645 | (not (looking-at " *\\'"))) | |
1646 | (setq initial (char-after (1+ cbeg))) | |
1647 | (setq initial nil)) | |
1648 | (delete-region cbeg cend) | |
1649 | (if initial | |
1650 | (insert initial ". "))))) | |
1651 | ||
1652 | ;; Handle *Stupid* VMS date stamps | |
1653 | ((looking-at mail-extr-stupid-vms-date-stamp-pattern) | |
1654 | (replace-match "" t)) | |
1655 | ||
1656 | ;; Handle Chinese characters. | |
1657 | ((looking-at mail-extr-hz-embedded-gb-encoded-chinese-pattern) | |
1658 | (goto-char (match-end 0)) | |
1659 | (setq word-found-flag t)) | |
1660 | ||
1661 | ;; Skip initial garbage characters. | |
1662 | ;; THIS CASE MUST BE AFTER THE PRECEDING CASES. | |
1663 | ((and (eq word-count 0) | |
1664 | (looking-at mail-extr-leading-garbage)) | |
1665 | (goto-char (match-end 0)) | |
1666 | ;; *** Skip backward over these??? | |
1667 | ;; (skip-chars-backward "& \"") | |
1668 | (narrow-to-region (point) (point-max))) | |
1669 | ||
1670 | ;; Various stopping points | |
1671 | ((or | |
1672 | ||
1673 | ;; Stop before ALL CAPS acronyms, if preceded by mixed-case | |
1674 | ;; words. Example: XT-DEM. | |
1675 | (and (>= word-count 2) | |
1676 | mixed-case-flag | |
1677 | (looking-at mail-extr-weird-acronym-pattern) | |
1678 | (not (looking-at mail-extr-roman-numeral-pattern))) | |
1679 | ||
1680 | ;; Stop before trailing alternative address | |
1681 | (looking-at mail-extr-alternative-address-pattern) | |
1682 | ||
1683 | ;; Stop before trailing comment not introduced by comma | |
1684 | ;; THIS CASE MUST BE AFTER AN EARLIER CASE. | |
1685 | (looking-at mail-extr-trailing-comment-start-pattern) | |
1686 | ||
1687 | ;; Stop before telephone numbers | |
1688 | (and (>= word-count 1) | |
1689 | (looking-at mail-extr-telephone-extension-pattern))) | |
1690 | (setq name-done-flag t)) | |
1691 | ||
1692 | ;; Delete ham radio call signs | |
1693 | ((looking-at mail-extr-ham-call-sign-pattern) | |
1694 | (delete-region (match-beginning 0) (match-end 0))) | |
1695 | ||
1696 | ;; Fixup initials | |
1697 | ((looking-at mail-extr-initial-pattern) | |
1698 | (or (eq (following-char) (upcase (following-char))) | |
72c0ae01 | 1699 | (setq lower-case-flag t)) |
2d8a5449 KH |
1700 | (forward-char 1) |
1701 | (if (eq ?. (following-char)) | |
1702 | (forward-char 1) | |
1703 | (insert ?.)) | |
1704 | (or (eq ?\ (following-char)) | |
1705 | (insert ?\ )) | |
1706 | (setq word-found-flag t)) | |
1707 | ||
1708 | ;; Handle BITNET LISTSERV list names. | |
1709 | ((and (eq word-count 0) | |
1710 | (looking-at mail-extr-listserv-list-name-pattern)) | |
1711 | (narrow-to-region (match-beginning 1) (match-end 1)) | |
1712 | (setq word-found-flag t) | |
1713 | (setq name-done-flag t)) | |
1714 | ||
1715 | ;; Handle & substitution, when & is last and is not first. | |
1716 | ((and (> word-count 0) | |
1717 | (eq ?\ (preceding-char)) | |
1718 | (eq (following-char) ?&) | |
1719 | (eq (1+ (point)) (point-max))) | |
1720 | (delete-char 1) | |
1721 | (capitalize-region | |
1722 | (point) | |
1723 | (progn | |
1724 | (insert-buffer-substring canonicalization-buffer | |
1725 | mbox-beg mbox-end) | |
1726 | (point))) | |
1727 | (setq disable-initial-guessing-flag t) | |
1728 | (setq word-found-flag t)) | |
1729 | ||
1730 | ;; Handle & between names, as in "Bob & Susie". | |
1731 | ((and (> word-count 0) (eq (following-char) ?\&)) | |
1732 | (setq name-beg (point)) | |
1733 | (setq name-end (1+ name-beg)) | |
1734 | (setq word-found-flag t) | |
1735 | (goto-char name-end)) | |
1736 | ||
1737 | ;; Regular name words | |
1738 | ((looking-at mail-extr-name-pattern) | |
1739 | (setq name-beg (point)) | |
1740 | (setq name-end (match-end 0)) | |
1741 | ||
1742 | ;; Certain words will be dropped if they are at the end. | |
1743 | (and (>= word-count 2) | |
1744 | (not lower-case-flag) | |
1745 | (or | |
1746 | ;; Trailing 4-or-more letter lowercase words preceded by | |
1747 | ;; mixed case or uppercase words will be dropped. | |
1748 | (looking-at "[[:lower:]]\\{4,\\}[ \t]*\\'") | |
1749 | ;; Drop a trailing word which is terminated with a period. | |
1750 | (eq ?. (char-after (1- name-end)))) | |
1751 | (setq drop-this-word-if-trailing-flag t)) | |
1752 | ||
1753 | ;; Set the flags that indicate whether we have seen a lowercase | |
1754 | ;; word, a mixed case word, and an uppercase word. | |
1755 | (if (re-search-forward "[[:lower:]]" name-end t) | |
1756 | (if (progn | |
1757 | (goto-char name-beg) | |
1758 | (re-search-forward "[[:upper:]]" name-end t)) | |
1759 | (setq mixed-case-flag t) | |
1760 | (setq lower-case-flag t)) | |
1761 | ;; (setq upper-case-flag t) | |
1762 | ) | |
6c83d99f | 1763 | |
2d8a5449 KH |
1764 | (goto-char name-end) |
1765 | (setq word-found-flag t)) | |
72c0ae01 | 1766 | |
2d8a5449 KH |
1767 | ;; Allow a number as a word, if it doesn't mean anything else. |
1768 | ((looking-at "[0-9]+\\>") | |
1769 | (setq name-beg (point)) | |
1770 | (setq name-end (match-end 0)) | |
1771 | (goto-char name-end) | |
1772 | (setq word-found-flag t)) | |
1773 | ||
1774 | (t | |
1775 | (setq name-done-flag t) | |
1776 | )) | |
1777 | ||
1778 | ;; Count any word that we skipped over. | |
1779 | (if word-found-flag | |
1780 | (setq word-count (1+ word-count)))) | |
1781 | ||
1782 | ;; If the last thing in the name is 2 or more periods, or one or more | |
1783 | ;; other sentence terminators (but not a single period) then keep them | |
1784 | ;; and the preceding word. This is for the benefit of whole sentences | |
1785 | ;; in the name field: it's better behavior than dropping the last word | |
1786 | ;; of the sentence... | |
1787 | (if (and (not suffix-flag) | |
1788 | (looking-at "\\(\\.+\\|[?!;:.][?!;:.]+\\|[?!;:][?!;:.]*\\)\\'")) | |
1789 | (goto-char (setq suffix-flag (point-max)))) | |
1790 | ||
1791 | ;; Drop everything after point and certain trailing words. | |
1792 | (narrow-to-region (point-min) | |
1793 | (or (and drop-last-word-if-trailing-flag | |
1794 | last-word-beg) | |
1795 | (point))) | |
1796 | ||
1797 | ;; Xerox's mailers SUCK!!!!!! | |
1798 | ;; We simply refuse to believe that any last name is PARC or ADOC. | |
1799 | ;; If it looks like that is the last name, that there is no meaningful | |
1800 | ;; here at all. Actually I guess it would be best to map patterns | |
1801 | ;; like foo.hoser@xerox.com into foo@hoser.xerox.com, but I don't | |
1802 | ;; actually know that that is what's going on. | |
1803 | (unless suffix-flag | |
1804 | (goto-char (point-min)) | |
1805 | (let ((case-fold-search t)) | |
1806 | (if (looking-at "[-A-Za-z_]+[. ]\\(PARC\\|ADOC\\)\\'") | |
1807 | (erase-buffer)))) | |
1808 | ||
1809 | ;; If last name first put it at end (but before suffix) | |
1810 | (when last-name-comma-flag | |
1811 | (goto-char (point-min)) | |
1812 | (search-forward ",") | |
1813 | (setq name-end (1- (point))) | |
1814 | (goto-char (or suffix-flag (point-max))) | |
1815 | (or (eq ?\ (preceding-char)) | |
1816 | (insert ?\ )) | |
1817 | (insert-buffer-substring (current-buffer) (point-min) name-end) | |
af604656 | 1818 | (goto-char name-end) |
2d8a5449 KH |
1819 | (skip-chars-forward "\t ,") |
1820 | (narrow-to-region (point) (point-max))) | |
154b3e39 | 1821 | |
2d8a5449 KH |
1822 | ;; Delete leading and trailing junk characters. |
1823 | ;; *** This is probably completely unneeded now. | |
1824 | ;;(goto-char (point-max)) | |
1825 | ;;(skip-chars-backward mail-extr-non-end-name-chars) | |
1826 | ;;(if (eq ?. (following-char)) | |
1827 | ;; (forward-char 1)) | |
1828 | ;;(narrow-to-region (point) | |
1829 | ;; (progn | |
1830 | ;; (goto-char (point-min)) | |
1831 | ;; (skip-chars-forward mail-extr-non-begin-name-chars) | |
1832 | ;; (point))) | |
1833 | ||
1834 | ;; Compress whitespace | |
d980c402 | 1835 | (goto-char (point-min)) |
2d8a5449 KH |
1836 | (while (re-search-forward "[ \t\n]+" nil t) |
1837 | (replace-match (if (eobp) "" " ") t)) | |
1838 | )))) | |
72c0ae01 | 1839 | |
154b3e39 | 1840 | \f |
72c0ae01 | 1841 | |
154b3e39 RS |
1842 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
1843 | ;; | |
1844 | ;; Table of top-level domain names. | |
1845 | ;; | |
1846 | ;; This is used during address canonicalization; be careful of format changes. | |
72c0ae01 ER |
1847 | ;; Keep in mind that the country abbreviations follow ISO-3166. There is |
1848 | ;; a U.S. FIPS that specifies a different set of two-letter country | |
1849 | ;; abbreviations. | |
97f83272 SJ |
1850 | ;; |
1851 | ;; Updated by the RIPE Network Coordination Centre. | |
1852 | ;; | |
1853 | ;; Source: ISO 3166 Maintenance Agency | |
1854 | ;; http://www.iso.org/iso/en/prods-services/iso3166ma/02iso-3166-code-lists/list-en1-semic.txt | |
1855 | ;; http://www.iana.org/domain-names.htm | |
1856 | ;; http://www.iana.org/cctld/cctld-whois.htm | |
98ede5d4 | 1857 | ;; Latest change: 2007/11/15 |
72c0ae01 | 1858 | |
154b3e39 | 1859 | (defconst mail-extr-all-top-level-domains |
0b725d8c | 1860 | (let ((ob (make-vector 739 0))) |
d980c402 SM |
1861 | (mapc |
1862 | (lambda (x) | |
1863 | (put (intern (downcase (car x)) ob) | |
1864 | 'domain-name | |
1865 | (if (nth 2 x) | |
1866 | (format (nth 2 x) (nth 1 x)) | |
1867 | (nth 1 x)))) | |
e1a34b58 RS |
1868 | '( |
1869 | ;; ISO 3166 codes: | |
98ede5d4 | 1870 | ("ac" "Ascension Island") |
44248b7f | 1871 | ("ad" "Andorra") |
e1a34b58 | 1872 | ("ae" "United Arab Emirates") |
0b725d8c | 1873 | ("af" "Afghanistan") |
e1a34b58 | 1874 | ("ag" "Antigua and Barbuda") |
0b725d8c | 1875 | ("ai" "Anguilla") |
e1a34b58 | 1876 | ("al" "Albania") |
44248b7f | 1877 | ("am" "Armenia") |
0b725d8c | 1878 | ("an" "Netherlands Antilles") |
e1a34b58 RS |
1879 | ("ao" "Angola") |
1880 | ("aq" "Antarctica") ; continent | |
154b3e39 | 1881 | ("ar" "Argentina" "Argentine Republic") |
0b725d8c | 1882 | ("as" "American Samoa") |
154b3e39 RS |
1883 | ("at" "Austria" "The Republic of %s") |
1884 | ("au" "Australia") | |
0b725d8c | 1885 | ("aw" "Aruba") |
98ede5d4 | 1886 | ("ax" "Aland Islands") |
e1a34b58 | 1887 | ("az" "Azerbaijan") |
44248b7f | 1888 | ("ba" "Bosnia-Herzegovina") |
154b3e39 | 1889 | ("bb" "Barbados") |
e1a34b58 | 1890 | ("bd" "Bangladesh") |
154b3e39 | 1891 | ("be" "Belgium" "The Kingdom of %s") |
e1a34b58 | 1892 | ("bf" "Burkina Faso") |
154b3e39 | 1893 | ("bg" "Bulgaria") |
e1a34b58 | 1894 | ("bh" "Bahrain") |
0b725d8c GM |
1895 | ("bi" "Burundi") |
1896 | ("bj" "Benin") | |
98ede5d4 | 1897 | ("bl" "Saint Barthelemy") |
e1a34b58 | 1898 | ("bm" "Bermuda") |
0b725d8c | 1899 | ("bn" "Brunei Darussalam") |
154b3e39 RS |
1900 | ("bo" "Bolivia" "Republic of %s") |
1901 | ("br" "Brazil" "The Federative Republic of %s") | |
1902 | ("bs" "Bahamas") | |
0b725d8c GM |
1903 | ("bt" "Bhutan") |
1904 | ("bv" "Bouvet Island") | |
e1a34b58 RS |
1905 | ("bw" "Botswana") |
1906 | ("by" "Belarus") | |
154b3e39 RS |
1907 | ("bz" "Belize") |
1908 | ("ca" "Canada") | |
0b725d8c | 1909 | ("cc" "Cocos (Keeling) Islands") |
97f83272 | 1910 | ("cd" "Congo" "The Democratic Republic of the %s") |
0b725d8c | 1911 | ("cf" "Central African Republic") |
e1a34b58 | 1912 | ("cg" "Congo") |
154b3e39 | 1913 | ("ch" "Switzerland" "The Swiss Confederation") |
0b725d8c GM |
1914 | ("ci" "Ivory Coast") ; Cote D'ivoire |
1915 | ("ck" "Cook Islands") | |
154b3e39 | 1916 | ("cl" "Chile" "The Republic of %s") |
e1a34b58 | 1917 | ("cm" "Cameroon") ; In .fr domain |
154b3e39 | 1918 | ("cn" "China" "The People's Republic of %s") |
e1a34b58 | 1919 | ("co" "Colombia") |
154b3e39 | 1920 | ("cr" "Costa Rica" "The Republic of %s") |
e1a34b58 | 1921 | ("cu" "Cuba") |
0b725d8c GM |
1922 | ("cv" "Cape Verde") |
1923 | ("cx" "Christmas Island") | |
e1a34b58 RS |
1924 | ("cy" "Cyprus") |
1925 | ("cz" "Czech Republic") | |
154b3e39 | 1926 | ("de" "Germany") |
0b725d8c | 1927 | ("dj" "Djibouti") |
154b3e39 RS |
1928 | ("dk" "Denmark") |
1929 | ("dm" "Dominica") | |
1930 | ("do" "Dominican Republic" "The %s") | |
e1a34b58 | 1931 | ("dz" "Algeria") |
154b3e39 | 1932 | ("ec" "Ecuador" "The Republic of %s") |
e1a34b58 | 1933 | ("ee" "Estonia") |
154b3e39 | 1934 | ("eg" "Egypt" "The Arab Republic of %s") |
0b725d8c | 1935 | ("eh" "Western Sahara") |
e1a34b58 | 1936 | ("er" "Eritrea") |
154b3e39 | 1937 | ("es" "Spain" "The Kingdom of %s") |
44248b7f | 1938 | ("et" "Ethiopia") |
98ede5d4 | 1939 | ("eu" "European Union") |
154b3e39 | 1940 | ("fi" "Finland" "The Republic of %s") |
0b725d8c GM |
1941 | ("fj" "Fiji") |
1942 | ("fk" "Falkland Islands (Malvinas)") | |
1943 | ("fm" "Micronesia" "Federated States of %s") | |
e1a34b58 | 1944 | ("fo" "Faroe Islands") |
154b3e39 | 1945 | ("fr" "France") |
44248b7f KH |
1946 | ("ga" "Gabon") |
1947 | ("gb" "United Kingdom") | |
e1a34b58 RS |
1948 | ("gd" "Grenada") |
1949 | ("ge" "Georgia") | |
0b725d8c | 1950 | ("gf" "French Guiana") |
98ede5d4 | 1951 | ("gg" "Guernsey") |
0b725d8c GM |
1952 | ("gh" "Ghana") |
1953 | ("gi" "Gibraltar") | |
44248b7f KH |
1954 | ("gl" "Greenland") |
1955 | ("gm" "Gambia") | |
0b725d8c | 1956 | ("gn" "Guinea") |
e1a34b58 | 1957 | ("gp" "Guadeloupe (Fr.)") |
0b725d8c | 1958 | ("gq" "Equatorial Guinea") |
154b3e39 | 1959 | ("gr" "Greece" "The Hellenic Republic (%s)") |
97f83272 | 1960 | ("gs" "South Georgia and The South Sandwich Islands") |
e1a34b58 RS |
1961 | ("gt" "Guatemala") |
1962 | ("gu" "Guam (U.S.)") | |
0b725d8c GM |
1963 | ("gw" "Guinea-Bissau") |
1964 | ("gy" "Guyana") | |
154b3e39 | 1965 | ("hk" "Hong Kong") |
97f83272 | 1966 | ("hm" "Heard Island and Mcdonald Islands") |
e1a34b58 | 1967 | ("hn" "Honduras") |
44248b7f | 1968 | ("hr" "Croatia" "Croatia (Hrvatska)") |
e1a34b58 | 1969 | ("ht" "Haiti") |
44248b7f | 1970 | ("hu" "Hungary" "The Hungarian Republic") |
e1a34b58 | 1971 | ("id" "Indonesia") |
154b3e39 RS |
1972 | ("ie" "Ireland") |
1973 | ("il" "Israel" "The State of %s") | |
0b725d8c | 1974 | ("im" "Isle of Man" "The %s") ; NOT in ISO 3166-1 of 2001-02-26 |
154b3e39 | 1975 | ("in" "India" "The Republic of %s") |
0b725d8c GM |
1976 | ("io" "British Indian Ocean Territory") |
1977 | ("iq" "Iraq") | |
1978 | ("ir" "Iran" "Islamic Republic of %s") | |
154b3e39 RS |
1979 | ("is" "Iceland" "The Republic of %s") |
1980 | ("it" "Italy" "The Italian Republic") | |
98ede5d4 | 1981 | ("je" "Jersey") |
154b3e39 | 1982 | ("jm" "Jamaica") |
44248b7f | 1983 | ("jo" "Jordan") |
154b3e39 | 1984 | ("jp" "Japan") |
e1a34b58 | 1985 | ("ke" "Kenya") |
0b725d8c GM |
1986 | ("kg" "Kyrgyzstan") |
1987 | ("kh" "Cambodia") | |
1988 | ("ki" "Kiribati") | |
1989 | ("km" "Comoros") | |
1990 | ("kn" "Saint Kitts and Nevis") | |
1991 | ("kp" "Korea (North)" "Democratic People's Republic of Korea") | |
1992 | ("kr" "Korea (South)" "Republic of Korea") | |
e1a34b58 | 1993 | ("kw" "Kuwait") |
0b725d8c | 1994 | ("ky" "Cayman Islands") |
97f83272 | 1995 | ("kz" "Kazakhstan") |
0b725d8c | 1996 | ("la" "Lao People's Democratic Republic") |
e1a34b58 | 1997 | ("lb" "Lebanon") |
0b725d8c | 1998 | ("lc" "Saint Lucia") |
e1a34b58 | 1999 | ("li" "Liechtenstein") |
154b3e39 | 2000 | ("lk" "Sri Lanka" "The Democratic Socialist Republic of %s") |
0b725d8c | 2001 | ("lr" "Liberia") |
e1a34b58 RS |
2002 | ("ls" "Lesotho") |
2003 | ("lt" "Lithuania") | |
2004 | ("lu" "Luxembourg") | |
2005 | ("lv" "Latvia") | |
0b725d8c | 2006 | ("ly" "Libyan Arab Jamahiriya") |
e1a34b58 | 2007 | ("ma" "Morocco") |
44248b7f KH |
2008 | ("mc" "Monaco") |
2009 | ("md" "Moldova" "The Republic of %s") | |
98ede5d4 GM |
2010 | ("me" "Montenegro") |
2011 | ("mf" "Saint Martin (French part)") | |
e1a34b58 | 2012 | ("mg" "Madagascar") |
0b725d8c GM |
2013 | ("mh" "Marshall Islands") |
2014 | ("mk" "Macedonia" "The Former Yugoslav Republic of %s") | |
e1a34b58 | 2015 | ("ml" "Mali") |
0b725d8c GM |
2016 | ("mm" "Myanmar") |
2017 | ("mn" "Mongolia") | |
97f83272 | 2018 | ("mo" "Macao") |
0b725d8c GM |
2019 | ("mp" "Northern Mariana Islands") |
2020 | ("mq" "Martinique") | |
2021 | ("mr" "Mauritania") | |
2022 | ("ms" "Montserrat") | |
e1a34b58 RS |
2023 | ("mt" "Malta") |
2024 | ("mu" "Mauritius") | |
44248b7f | 2025 | ("mv" "Maldives") |
e1a34b58 | 2026 | ("mw" "Malawi") |
154b3e39 | 2027 | ("mx" "Mexico" "The United Mexican States") |
97f83272 | 2028 | ("my" "Malaysia") |
e1a34b58 | 2029 | ("mz" "Mozambique") |
154b3e39 | 2030 | ("na" "Namibia") |
e1a34b58 RS |
2031 | ("nc" "New Caledonia (Fr.)") |
2032 | ("ne" "Niger") ; In .fr domain | |
0b725d8c GM |
2033 | ("nf" "Norfolk Island") |
2034 | ("ng" "Nigeria") | |
154b3e39 RS |
2035 | ("ni" "Nicaragua" "The Republic of %s") |
2036 | ("nl" "Netherlands" "The Kingdom of the %s") | |
2037 | ("no" "Norway" "The Kingdom of %s") | |
e1a34b58 | 2038 | ("np" "Nepal") ; Via .in domain |
0b725d8c | 2039 | ("nr" "Nauru") |
44248b7f | 2040 | ("nu" "Niue") |
154b3e39 | 2041 | ("nz" "New Zealand") |
0b725d8c | 2042 | ("om" "Oman") |
e1a34b58 | 2043 | ("pa" "Panama") |
154b3e39 | 2044 | ("pe" "Peru") |
97f83272 | 2045 | ("pf" "French Polynesia") |
154b3e39 RS |
2046 | ("pg" "Papua New Guinea") |
2047 | ("ph" "Philippines" "The Republic of the %s") | |
e1a34b58 | 2048 | ("pk" "Pakistan") |
154b3e39 | 2049 | ("pl" "Poland") |
0b725d8c GM |
2050 | ("pm" "Saint Pierre and Miquelon") |
2051 | ("pn" "Pitcairn") | |
e1a34b58 | 2052 | ("pr" "Puerto Rico (U.S.)") |
0b725d8c | 2053 | ("ps" "Palestinian Territory, Occupied") |
4802af76 | 2054 | ("pt" "Portugal" "The Portuguese Republic") |
0b725d8c | 2055 | ("pw" "Palau") |
154b3e39 | 2056 | ("py" "Paraguay") |
44248b7f | 2057 | ("qa" "Qatar") |
e1a34b58 RS |
2058 | ("re" "Reunion (Fr.)") ; In .fr domain |
2059 | ("ro" "Romania") | |
98ede5d4 | 2060 | ("rs" "Serbia") |
0b725d8c GM |
2061 | ("ru" "Russia" "Russian Federation") |
2062 | ("rw" "Rwanda") | |
e1a34b58 | 2063 | ("sa" "Saudi Arabia") |
0b725d8c | 2064 | ("sb" "Solomon Islands") |
e1a34b58 RS |
2065 | ("sc" "Seychelles") |
2066 | ("sd" "Sudan") | |
154b3e39 RS |
2067 | ("se" "Sweden" "The Kingdom of %s") |
2068 | ("sg" "Singapore" "The Republic of %s") | |
0b725d8c | 2069 | ("sh" "Saint Helena") |
e1a34b58 | 2070 | ("si" "Slovenia") |
0b725d8c | 2071 | ("sj" "Svalbard and Jan Mayen") ; In .no domain |
e1a34b58 | 2072 | ("sk" "Slovakia" "The Slovak Republic") |
0b725d8c | 2073 | ("sl" "Sierra Leone") |
44248b7f | 2074 | ("sm" "San Marino") |
e1a34b58 | 2075 | ("sn" "Senegal") |
0b725d8c | 2076 | ("so" "Somalia") |
154b3e39 | 2077 | ("sr" "Suriname") |
97f83272 | 2078 | ("st" "Sao Tome and Principe") |
0b725d8c GM |
2079 | ("su" "U.S.S.R." "The Union of Soviet Socialist Republics") |
2080 | ("sv" "El Salvador") | |
2081 | ("sy" "Syrian Arab Republic") | |
e1a34b58 | 2082 | ("sz" "Swaziland") |
97f83272 | 2083 | ("tc" "Turks and Caicos Islands") |
0b725d8c GM |
2084 | ("td" "Chad") |
2085 | ("tf" "French Southern Territories") | |
e1a34b58 | 2086 | ("tg" "Togo") |
154b3e39 | 2087 | ("th" "Thailand" "The Kingdom of %s") |
0b725d8c GM |
2088 | ("tj" "Tajikistan") |
2089 | ("tk" "Tokelau") | |
97f83272 | 2090 | ("tl" "East Timor") |
0b725d8c | 2091 | ("tm" "Turkmenistan") |
154b3e39 | 2092 | ("tn" "Tunisia") |
44248b7f | 2093 | ("to" "Tonga") |
0b725d8c | 2094 | ("tp" "East Timor") |
154b3e39 RS |
2095 | ("tr" "Turkey" "The Republic of %s") |
2096 | ("tt" "Trinidad and Tobago") | |
0b725d8c GM |
2097 | ("tv" "Tuvalu") |
2098 | ("tw" "Taiwan" "%s, Province of China") | |
2099 | ("tz" "Tanzania" "United Republic of %s") | |
e1a34b58 | 2100 | ("ua" "Ukraine") |
0b725d8c | 2101 | ("ug" "Uganda") |
44a1338a | 2102 | ("uk" "United Kingdom" "The %s of Great Britain and Northern Ireland") |
0b725d8c | 2103 | ("um" "United States Minor Outlying Islands") |
e1a34b58 | 2104 | ("us" "United States" "The %s of America") |
154b3e39 | 2105 | ("uy" "Uruguay" "The Eastern Republic of %s") |
0b725d8c GM |
2106 | ("uz" "Uzbekistan") |
2107 | ("va" "Holy See (Vatican City State)") | |
97f83272 | 2108 | ("vc" "Saint Vincent and the Grenadines") |
154b3e39 | 2109 | ("ve" "Venezuela" "The Republic of %s") |
0b725d8c GM |
2110 | ("vg" "Virgin Islands, British") |
2111 | ("vi" "Virgin Islands, U.S.") | |
e1a34b58 RS |
2112 | ("vn" "Vietnam") |
2113 | ("vu" "Vanuatu") | |
0b725d8c GM |
2114 | ("wf" "Wallis and Futuna") |
2115 | ("ws" "Samoa") | |
2116 | ("ye" "Yemen") | |
2117 | ("yt" "Mayotte") | |
44248b7f KH |
2118 | ("yu" "Yugoslavia" "Yugoslavia, AKA Serbia-Montenegro") |
2119 | ("za" "South Africa" "The Republic of %s") | |
0b725d8c | 2120 | ("zm" "Zambia") |
154b3e39 | 2121 | ("zw" "Zimbabwe" "Republic of %s") |
97f83272 SJ |
2122 | ;; Generic Domains: |
2123 | ("aero" t "Air Transport Industry") | |
98ede5d4 | 2124 | ("asia" t "Pan-Asia and Asia Pacific community") |
97f83272 | 2125 | ("biz" t "Businesses") |
98ede5d4 | 2126 | ("cat" t "Catalan language and culture") |
e1a34b58 | 2127 | ("com" t "Commercial") |
97f83272 SJ |
2128 | ("coop" t "Cooperative Associations") |
2129 | ("info" t "Info") | |
98ede5d4 GM |
2130 | ("jobs" t "Employment") |
2131 | ("mobi" t "Mobile products") | |
97f83272 SJ |
2132 | ("museum" t "Museums") |
2133 | ("name" t "Individuals") | |
e1a34b58 RS |
2134 | ("net" t "Network") |
2135 | ("org" t "Non-profit Organization") | |
98ede5d4 GM |
2136 | ("pro" t "Credentialed professionals") |
2137 | ("tel" t "Contact data") | |
2138 | ("travel" t "Travel industry") | |
97f83272 SJ |
2139 | ;;("bitnet" t "Because It's Time NET") |
2140 | ("gov" t "United States Government") | |
2141 | ("edu" t "Educational") | |
2142 | ("mil" t "United States Military") | |
2143 | ("int" t "International Treaties") | |
2144 | ;;("nato" t "North Atlantic Treaty Organization") | |
e1a34b58 | 2145 | ("uucp" t "Unix to Unix CoPy") |
97f83272 SJ |
2146 | ;; Infrastructure Domains: |
2147 | ("arpa" t "Advanced Research Projects Agency (U.S. DoD)") | |
154b3e39 RS |
2148 | )) |
2149 | ob)) | |
2150 | ||
2151 | ;;;###autoload | |
2152 | (defun what-domain (domain) | |
e1cd65fd | 2153 | "Convert mail domain DOMAIN to the country it corresponds to." |
154b3e39 RS |
2154 | (interactive |
2155 | (let ((completion-ignore-case t)) | |
2156 | (list (completing-read "Domain: " | |
2157 | mail-extr-all-top-level-domains nil t)))) | |
2158 | (or (setq domain (intern-soft (downcase domain) | |
2159 | mail-extr-all-top-level-domains)) | |
29565a87 | 2160 | (error "No such domain")) |
154b3e39 | 2161 | (message "%s: %s" (upcase (symbol-name domain)) (get domain 'domain-name))) |
72c0ae01 ER |
2162 | |
2163 | \f | |
154b3e39 RS |
2164 | ;(let ((all nil)) |
2165 | ; (mapatoms #'(lambda (x) | |
6c83d99f | 2166 | ; (if (and (boundp x) |
154b3e39 RS |
2167 | ; (string-match "^mail-extr-" (symbol-name x))) |
2168 | ; (setq all (cons x all))))) | |
2169 | ; (setq all (sort all #'string-lessp)) | |
2170 | ; (cons 'setq | |
2171 | ; (apply 'nconc (mapcar #'(lambda (x) | |
2172 | ; (list x (symbol-value x))) | |
2173 | ; all)))) | |
72c0ae01 ER |
2174 | |
2175 | \f | |
154b3e39 | 2176 | (provide 'mail-extr) |
72c0ae01 | 2177 | |
ab5796a9 | 2178 | ;;; arch-tag: 7785fade-1073-4ed6-b4f6-28db34a7982d |
72c0ae01 | 2179 | ;;; mail-extr.el ends here |