| 1 | ;;; mail-extr.el --- extract full name and address from RFC 822 mail header -*- coding: utf-8 -*- |
| 2 | |
| 3 | ;; Copyright (C) 1991-1994, 1997, 2001-2011 Free Software Foundation, Inc. |
| 4 | |
| 5 | ;; Author: Joe Wells <jbw@cs.bu.edu> |
| 6 | ;; Maintainer: FSF |
| 7 | ;; Keywords: mail |
| 8 | ;; Package: mail-utils |
| 9 | |
| 10 | ;; This file is part of GNU Emacs. |
| 11 | |
| 12 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
| 13 | ;; it under the terms of the GNU General Public License as published by |
| 14 | ;; the Free Software Foundation, either version 3 of the License, or |
| 15 | ;; (at your option) any later version. |
| 16 | |
| 17 | ;; GNU Emacs is distributed in the hope that it will be useful, |
| 18 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 20 | ;; GNU General Public License for more details. |
| 21 | |
| 22 | ;; You should have received a copy of the GNU General Public License |
| 23 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
| 24 | |
| 25 | ;;; Commentary: |
| 26 | |
| 27 | ;; The entry point of this code is |
| 28 | ;; |
| 29 | ;; mail-extract-address-components: (address &optional all) |
| 30 | ;; |
| 31 | ;; Given an RFC-822 ADDRESS, extract full name and canonical address. |
| 32 | ;; Returns a list of the form (FULL-NAME CANONICAL-ADDRESS). |
| 33 | ;; If no name can be extracted, FULL-NAME will be nil. |
| 34 | ;; ADDRESS may be a string or a buffer. If it is a buffer, the visible |
| 35 | ;; (narrowed) portion of the buffer will be interpreted as the address. |
| 36 | ;; (This feature exists so that the clever caller might be able to avoid |
| 37 | ;; consing a string.) |
| 38 | ;; If ADDRESS contains more than one RFC-822 address, only the first is |
| 39 | ;; returned. |
| 40 | ;; |
| 41 | ;; If ALL is non-nil, that means return info about all the addresses |
| 42 | ;; that are found in ADDRESS. The value is a list of elements of |
| 43 | ;; the form (FULL-NAME CANONICAL-ADDRESS), one per address. |
| 44 | ;; |
| 45 | ;; This code is more correct (and more heuristic) parser than the code in |
| 46 | ;; rfc822.el. And despite its size, it's fairly fast. |
| 47 | ;; |
| 48 | ;; There are two main benefits: |
| 49 | ;; |
| 50 | ;; 1. Higher probability of getting the correct full name for a human than |
| 51 | ;; any other package we know of. (On the other hand, it will cheerfully |
| 52 | ;; mangle non-human names/comments.) |
| 53 | ;; 2. Address part is put in a canonical form. |
| 54 | ;; |
| 55 | ;; The interface is not yet carved in stone; please give us suggestions. |
| 56 | ;; |
| 57 | ;; We have an extensive test-case collection of funny addresses if you want to |
| 58 | ;; work with the code. Developing this code requires frequent testing to |
| 59 | ;; make sure you're not breaking functionality. The test cases aren't included |
| 60 | ;; because they are over 100K. |
| 61 | ;; |
| 62 | ;; If you find an address that mail-extr fails on, please send it to the |
| 63 | ;; maintainer along with what you think the correct results should be. We do |
| 64 | ;; not consider it a bug if mail-extr mangles a comment that does not |
| 65 | ;; correspond to a real human full name, although we would prefer that |
| 66 | ;; mail-extr would return the comment as-is. |
| 67 | ;; |
| 68 | ;; Features: |
| 69 | ;; |
| 70 | ;; * Full name handling: |
| 71 | ;; |
| 72 | ;; * knows where full names can be found in an address. |
| 73 | ;; * avoids using empty comments and quoted text. |
| 74 | ;; * extracts full names from mailbox names. |
| 75 | ;; * recognizes common formats for comments after a full name. |
| 76 | ;; * puts a period and a space after each initial. |
| 77 | ;; * understands & referring to the mailbox name, capitalized. |
| 78 | ;; * strips name prefixes like "Prof.", etc. |
| 79 | ;; * understands what characters can occur in names (not just letters). |
| 80 | ;; * figures out middle initial from mailbox name. |
| 81 | ;; * removes funny nicknames. |
| 82 | ;; * keeps suffixes such as Jr., Sr., III, etc. |
| 83 | ;; * reorders "Last, First" type names. |
| 84 | ;; |
| 85 | ;; * Address handling: |
| 86 | ;; |
| 87 | ;; * parses rfc822 quoted text, comments, and domain literals. |
| 88 | ;; * parses rfc822 multi-line headers. |
| 89 | ;; * does something reasonable with rfc822 GROUP addresses. |
| 90 | ;; * handles many rfc822 noncompliant and garbage addresses. |
| 91 | ;; * canonicalizes addresses (after stripping comments/phrases outside <>). |
| 92 | ;; * converts ! addresses into .UUCP and %-style addresses. |
| 93 | ;; * converts rfc822 ROUTE addresses to %-style addresses. |
| 94 | ;; * truncates %-style addresses at leftmost fully qualified domain name. |
| 95 | ;; * handles local relative precedence of ! vs. % and @ (untested). |
| 96 | ;; |
| 97 | ;; It does almost no string creation. It primarily uses the built-in |
| 98 | ;; parsing routines with the appropriate syntax tables. This should |
| 99 | ;; result in greater speed. |
| 100 | ;; |
| 101 | ;; TODO: |
| 102 | ;; |
| 103 | ;; * handle all test cases. (This will take forever.) |
| 104 | ;; * software to pick the correct header to use (eg., "Senders-Name:"). |
| 105 | ;; * multiple addresses in the "From:" header (almost all of the necessary |
| 106 | ;; code is there). |
| 107 | ;; * flag to not treat `,' as an address separator. (This is useful when |
| 108 | ;; there is a "From:" header but no "Sender:" header, because then there |
| 109 | ;; is only allowed to be one address.) |
| 110 | ;; * mailbox name does not necessarily contain full name. |
| 111 | ;; * fixing capitalization when it's all upper or lowercase. (Hard!) |
| 112 | ;; * some of the domain literal handling is missing. (But I've never even |
| 113 | ;; seen one of these in a mail address, so maybe no big deal.) |
| 114 | ;; * arrange to have syntax tables byte-compiled. |
| 115 | ;; * speed hacks. |
| 116 | ;; * delete unused variables. |
| 117 | ;; * arrange for testing with different relative precedences of ! vs. @ |
| 118 | ;; and %. |
| 119 | ;; * insert documentation strings! |
| 120 | ;; * handle X.400-gatewayed addresses according to RFC 1148. |
| 121 | |
| 122 | ;;; Change Log: |
| 123 | ;; |
| 124 | ;; Thu Feb 17 17:57:33 1994 Jamie Zawinski (jwz@lucid.com) |
| 125 | ;; |
| 126 | ;; * merged with jbw's latest version |
| 127 | ;; |
| 128 | ;; Wed Feb 9 21:56:27 1994 Jamie Zawinski (jwz@lucid.com) |
| 129 | ;; |
| 130 | ;; * high-bit chars in comments weren't treated as word syntax |
| 131 | ;; |
| 132 | ;; Sat Feb 5 03:13:40 1994 Jamie Zawinski (jwz@lucid.com) |
| 133 | ;; |
| 134 | ;; * call replace-match with fixed-case arg |
| 135 | ;; |
| 136 | ;; Thu Dec 16 21:56:45 1993 Jamie Zawinski (jwz@lucid.com) |
| 137 | ;; |
| 138 | ;; * some more cleanup, doc, added provide |
| 139 | ;; |
| 140 | ;; Tue Mar 23 21:23:18 1993 Joe Wells (jbw at csd.bu.edu) |
| 141 | ;; |
| 142 | ;; * Made mail-full-name-prefixes a user-customizable variable. |
| 143 | ;; Allow passing the address as a buffer as well as a string. |
| 144 | ;; Allow [ and ] as name characters (Finnish character set). |
| 145 | ;; |
| 146 | ;; Mon Mar 22 21:20:56 1993 Joe Wells (jbw at bigbird.bu.edu) |
| 147 | ;; |
| 148 | ;; * Handle "null" addresses. Handle = used for spacing in mailbox |
| 149 | ;; name. Fix bug in handling of ROUTE-ADDR-type addresses that are |
| 150 | ;; missing their brackets. Handle uppercase "JR". Extract full |
| 151 | ;; names from X.400 addresses encoded in RFC-822. Fix bug in |
| 152 | ;; handling of multiple addresses where first has trailing comment. |
| 153 | ;; Handle more kinds of telephone extension lead-ins. |
| 154 | ;; |
| 155 | ;; Mon Mar 22 20:16:57 1993 Joe Wells (jbw at bigbird.bu.edu) |
| 156 | ;; |
| 157 | ;; * Handle HZ encoding for embedding GB encoded chinese characters. |
| 158 | ;; |
| 159 | ;; Mon Mar 22 00:46:12 1993 Joe Wells (jbw at bigbird.bu.edu) |
| 160 | ;; |
| 161 | ;; * Fixed too broad matching of ham radio call signs. Fixed bug in |
| 162 | ;; handling an unmatched ' in a name string. Enhanced recognition |
| 163 | ;; of when . in the mailbox name terminates the name portion. |
| 164 | ;; Narrowed conversion of . to space to only the necessary |
| 165 | ;; situation. Deal with VMS's stupid date stamps. Handle a unique |
| 166 | ;; way of introducing an alternate address. Fixed spacing bug I |
| 167 | ;; introduced in switching last name order. Fixed bug in handling |
| 168 | ;; address with ! and % but no @. Narrowed the cases in which |
| 169 | ;; certain trailing words are discarded. |
| 170 | ;; |
| 171 | ;; Sun Mar 21 21:41:06 1993 Joe Wells (jbw at bigbird.bu.edu) |
| 172 | ;; |
| 173 | ;; * Fixed bugs in handling GROUP addresses. Certain words in the |
| 174 | ;; middle of a name no longer terminate it. Handle LISTSERV list |
| 175 | ;; names. Ignore comment field containing mailbox name. |
| 176 | ;; |
| 177 | ;; Sun Mar 21 14:39:38 1993 Joe Wells (jbw at bigbird.bu.edu) |
| 178 | ;; |
| 179 | ;; * Moved variant-method code back into main function. Handle |
| 180 | ;; underscores as spaces in comments. Handle leading nickname. Add |
| 181 | ;; flag to ignore single-word names. Other changes. |
| 182 | ;; |
| 183 | ;; Mon Feb 1 22:23:31 1993 Joe Wells (jbw at bigbird.bu.edu) |
| 184 | ;; |
| 185 | ;; * Added in changes by Rod Whitby and Jamie Zawinski. This |
| 186 | ;; includes the flag mail-extr-guess-middle-initial and the fix for |
| 187 | ;; handling multiple addresses correctly. (Whitby just changed |
| 188 | ;; a > to a <.) |
| 189 | ;; |
| 190 | ;; Mon Apr 6 23:59:09 1992 Joe Wells (jbw at bigbird.bu.edu) |
| 191 | ;; |
| 192 | ;; * Cleaned up some more. Release version 1.0 to world. |
| 193 | ;; |
| 194 | ;; Sun Apr 5 19:39:08 1992 Joe Wells (jbw at bigbird.bu.edu) |
| 195 | ;; |
| 196 | ;; * Cleaned up full name extraction extensively. |
| 197 | ;; |
| 198 | ;; Sun Feb 2 14:45:24 1992 Joe Wells (jbw at bigbird.bu.edu) |
| 199 | ;; |
| 200 | ;; * Total rewrite. Integrated mail-canonicalize-address into |
| 201 | ;; mail-extract-address-components. Now handles GROUP addresses more |
| 202 | ;; or less correctly. Better handling of lots of different cases. |
| 203 | ;; |
| 204 | ;; Fri Jun 14 19:39:50 1991 |
| 205 | ;; * Created. |
| 206 | |
| 207 | ;;; Code: |
| 208 | \f |
| 209 | |
| 210 | (defgroup mail-extr nil |
| 211 | "Extract full name and address from RFC 822 mail header." |
| 212 | :prefix "mail-extr-" |
| 213 | :group 'mail) |
| 214 | |
| 215 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| 216 | ;; |
| 217 | ;; User configuration variable definitions. |
| 218 | ;; |
| 219 | |
| 220 | (defcustom mail-extr-guess-middle-initial nil |
| 221 | "*Whether to try to guess middle initial from mail address. |
| 222 | If true, then when we see an address like \"John Smith <jqs@host.com>\" |
| 223 | we will assume that \"John Q. Smith\" is the fellow's name." |
| 224 | :type 'boolean |
| 225 | :group 'mail-extr) |
| 226 | |
| 227 | (defcustom mail-extr-ignore-single-names nil |
| 228 | "*Whether to ignore a name that is just a single word. |
| 229 | If true, then when we see an address like \"Idiot <dumb@stupid.com>\" |
| 230 | we will act as though we couldn't find a full name in the address." |
| 231 | :type 'boolean |
| 232 | :version "22.1" |
| 233 | :group 'mail-extr) |
| 234 | |
| 235 | (defcustom mail-extr-ignore-realname-equals-mailbox-name t |
| 236 | "*Whether to ignore a name that is equal to the mailbox name. |
| 237 | If true, then when the address is like \"Single <single@address.com>\" |
| 238 | we will act as though we couldn't find a full name in the address." |
| 239 | :type 'boolean |
| 240 | :group 'mail-extr) |
| 241 | |
| 242 | ;; Matches a leading title that is not part of the name (does not |
| 243 | ;; contribute to uniquely identifying the person). |
| 244 | (defcustom mail-extr-full-name-prefixes |
| 245 | (purecopy |
| 246 | "\\(Prof\\|D[Rr]\\|Mrs?\\|Rev\\|Rabbi\\|SysOp\\|LCDR\\)\\.?[ \t\n]") |
| 247 | "*Matches prefixes to the full name that identify a person's position. |
| 248 | These are stripped from the full name because they do not contribute to |
| 249 | uniquely identifying the person." |
| 250 | :type 'regexp |
| 251 | :group 'mail-extr) |
| 252 | |
| 253 | (defcustom mail-extr-@-binds-tighter-than-! nil |
| 254 | "*Whether the local mail transport agent looks at ! before @." |
| 255 | :type 'boolean |
| 256 | :group 'mail-extr) |
| 257 | |
| 258 | (defcustom mail-extr-mangle-uucp nil |
| 259 | "*Whether to throw away information in UUCP addresses |
| 260 | by translating things like \"foo!bar!baz@host\" into \"baz@bar.UUCP\"." |
| 261 | :type 'boolean |
| 262 | :group 'mail-extr) |
| 263 | |
| 264 | ;;---------------------------------------------------------------------- |
| 265 | ;; what orderings are meaningful????? |
| 266 | ;;(defvar mail-operator-precedence-list '(?! ?% ?@)) |
| 267 | ;; Right operand of a % or a @ must be a domain name, period. No other |
| 268 | ;; operators allowed. Left operand of a @ is an address relative to that |
| 269 | ;; site. |
| 270 | |
| 271 | ;; Left operand of a ! must be a domain name. Right operand is an |
| 272 | ;; arbitrary address. |
| 273 | ;;---------------------------------------------------------------------- |
| 274 | |
| 275 | \f |
| 276 | |
| 277 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| 278 | ;; |
| 279 | ;; Constant definitions. |
| 280 | ;; |
| 281 | |
| 282 | ;; Any character that can occur in a name, not counting characters that |
| 283 | ;; separate parts of a multipart name (hyphen and period). |
| 284 | ;; Yes, there are weird people with digits in their names. |
| 285 | ;; You will also notice the consideration for the |
| 286 | ;; Swedish/Finnish/Norwegian character set. |
| 287 | (defconst mail-extr-all-letters-but-separators |
| 288 | (purecopy "][[:alnum:]{|}'~`")) |
| 289 | |
| 290 | ;; Any character that can occur in a name in an RFC822 address including |
| 291 | ;; the separator (hyphen and possibly period) for multipart names. |
| 292 | ;; #### should . be in here? |
| 293 | (defconst mail-extr-all-letters |
| 294 | (purecopy (concat mail-extr-all-letters-but-separators "---"))) |
| 295 | |
| 296 | ;; Any character that can start a name. |
| 297 | ;; Keep this set as minimal as possible. |
| 298 | (defconst mail-extr-first-letters (purecopy "[:alpha:]")) |
| 299 | |
| 300 | ;; Any character that can end a name. |
| 301 | ;; Keep this set as minimal as possible. |
| 302 | (defconst mail-extr-last-letters (purecopy "[:alpha:]`'.")) |
| 303 | |
| 304 | (defconst mail-extr-leading-garbage "\\W+") |
| 305 | |
| 306 | ;; (defconst mail-extr-non-name-chars |
| 307 | ;; (purecopy (concat "^" mail-extr-all-letters "."))) |
| 308 | ;; (defconst mail-extr-non-begin-name-chars |
| 309 | ;; (purecopy (concat "^" mail-extr-first-letters))) |
| 310 | ;; (defconst mail-extr-non-end-name-chars |
| 311 | ;; (purecopy (concat "^" mail-extr-last-letters))) |
| 312 | |
| 313 | ;; Matches an initial not followed by both a period and a space. |
| 314 | ;; (defconst mail-extr-bad-initials-pattern |
| 315 | ;; (purecopy |
| 316 | ;; (format "\\(\\([^%s]\\|\\`\\)[%s]\\)\\(\\.\\([^ ]\\)\\| \\|\\([^%s .]\\)\\|\\'\\)" |
| 317 | ;; mail-extr-all-letters mail-extr-first-letters mail-extr-all-letters))) |
| 318 | |
| 319 | ;; Matches periods used instead of spaces. Must not match the period |
| 320 | ;; following an initial. |
| 321 | (defconst mail-extr-bad-dot-pattern |
| 322 | (purecopy |
| 323 | (format "\\([%s][%s]\\)\\.+\\([%s]\\)" |
| 324 | mail-extr-all-letters |
| 325 | mail-extr-last-letters |
| 326 | mail-extr-first-letters))) |
| 327 | |
| 328 | ;; Matches an embedded or leading nickname that should be removed. |
| 329 | ;; (defconst mail-extr-nickname-pattern |
| 330 | ;; (purecopy |
| 331 | ;; (format "\\([ .]\\|\\`\\)[\"'`\[\(]\\([ .%s]+\\)[\]\"'\)] " |
| 332 | ;; mail-extr-all-letters))) |
| 333 | |
| 334 | ;; Matches the occurrence of a generational name suffix, and the last |
| 335 | ;; character of the preceding name. This is important because we want to |
| 336 | ;; keep such suffixes: they help to uniquely identify the person. |
| 337 | ;; *** Perhaps this should be a user-customizable variable. However, the |
| 338 | ;; *** regular expression is fairly tricky to alter, so maybe not. |
| 339 | (defconst mail-extr-full-name-suffix-pattern |
| 340 | (purecopy |
| 341 | (format |
| 342 | "\\(,? ?\\([JjSs][Rr]\\.?\\|V?I+V?\\)\\)\\([^%s]\\([^%s]\\|\\'\\)\\|\\'\\)" |
| 343 | mail-extr-all-letters mail-extr-all-letters))) |
| 344 | |
| 345 | (defconst mail-extr-roman-numeral-pattern (purecopy "V?I+V?\\b")) |
| 346 | |
| 347 | ;; Matches a trailing uppercase (with other characters possible) acronym. |
| 348 | ;; Must not match a trailing uppercase last name or trailing initial |
| 349 | (defconst mail-extr-weird-acronym-pattern |
| 350 | (purecopy "\\([A-Z]+[-_/]\\|[A-Z][A-Z][A-Z]?\\b\\)")) |
| 351 | |
| 352 | ;; Matches a mixed-case or lowercase name (not an initial). |
| 353 | ;; #### Match Latin1 lower case letters here too? |
| 354 | ;; (defconst mail-extr-mixed-case-name-pattern |
| 355 | ;; (purecopy |
| 356 | ;; (format |
| 357 | ;; "\\b\\([a-z][%s]*[%s]\\|[%s][%s]*[a-z][%s]*[%s]\\|[%s][%s]*[a-z]\\)" |
| 358 | ;; mail-extr-all-letters mail-extr-last-letters |
| 359 | ;; mail-extr-first-letters mail-extr-all-letters mail-extr-all-letters |
| 360 | ;; mail-extr-last-letters mail-extr-first-letters mail-extr-all-letters))) |
| 361 | |
| 362 | ;; Matches a trailing alternative address. |
| 363 | ;; #### Match Latin1 letters here too? |
| 364 | ;; #### Match _ before @ here too? |
| 365 | (defconst mail-extr-alternative-address-pattern |
| 366 | (purecopy "\\(aka *\\)?[a-zA-Z.]+[!@][a-zA-Z.]")) |
| 367 | |
| 368 | ;; Matches a variety of trailing comments not including comma-delimited |
| 369 | ;; comments. |
| 370 | (defconst mail-extr-trailing-comment-start-pattern |
| 371 | (purecopy " [-{]\\|--\\|[+@#></\;]")) |
| 372 | |
| 373 | ;; Matches a name (not an initial). |
| 374 | ;; This doesn't force a word boundary at the end because sometimes a |
| 375 | ;; comment is separated by a `-' with no preceding space. |
| 376 | (defconst mail-extr-name-pattern |
| 377 | (purecopy (format "\\b[%s][%s]*[%s]" |
| 378 | mail-extr-first-letters |
| 379 | mail-extr-all-letters |
| 380 | mail-extr-last-letters))) |
| 381 | |
| 382 | (defconst mail-extr-initial-pattern |
| 383 | (purecopy (format "\\b[%s]\\([. ]\\|\\b\\)" mail-extr-first-letters))) |
| 384 | |
| 385 | ;; Matches a single name before a comma. |
| 386 | ;; (defconst mail-extr-last-name-first-pattern |
| 387 | ;; (purecopy (concat "\\`" mail-extr-name-pattern ","))) |
| 388 | |
| 389 | ;; Matches telephone extensions. |
| 390 | (defconst mail-extr-telephone-extension-pattern |
| 391 | (purecopy |
| 392 | "\\(\\([Ee]xt\\|\\|[Tt]ph\\|[Tt]el\\|[Xx]\\).?\\)? *\\+?[0-9][- 0-9]+")) |
| 393 | |
| 394 | ;; Matches ham radio call signs. |
| 395 | ;; Help from: Mat Maessen N2NJZ <maessm@rpi.edu>, Mark Feit |
| 396 | ;; <mark@era.com>, Michael Covington <mcovingt@ai.uga.edu>. |
| 397 | ;; Examples: DX504 DX515 K5MRU K8DHK KA9WGN KA9WGN KD3FU KD6EUI KD6HBW |
| 398 | ;; KE9TV KF0NV N1API N3FU N3GZE N3IGS N4KCC N7IKQ N9HHU W4YHF W6ANK WA2SUH |
| 399 | ;; WB7VZI N2NJZ NR3G KJ4KK AB4UM AL7NI KH6OH WN3KBT N4TMI W1A N0NZO |
| 400 | (defconst mail-extr-ham-call-sign-pattern |
| 401 | (purecopy "\\b\\(DX[0-9]+\\|[AKNW][A-Z]?[0-9][A-Z][A-Z]?[A-Z]?\\)")) |
| 402 | |
| 403 | ;; Possible trailing suffixes: "\\(/\\(KT\\|A[AEG]\\|[R0-9]\\)\\)?" |
| 404 | ;; /KT == Temporary Technician (has CSC but not "real" license) |
| 405 | ;; /AA == Temporary Advanced |
| 406 | ;; /AE == Temporary Extra |
| 407 | ;; /AG == Temporary General |
| 408 | ;; /R == repeater |
| 409 | ;; /# == stations operating out of home district |
| 410 | ;; I don't include these in the regexp above because I can't imagine |
| 411 | ;; anyone putting them with their name in an e-mail address. |
| 412 | |
| 413 | ;; Matches normal single-part name |
| 414 | (defconst mail-extr-normal-name-pattern |
| 415 | (purecopy (format "\\b[%s][%s]+[%s]" |
| 416 | mail-extr-first-letters |
| 417 | mail-extr-all-letters-but-separators |
| 418 | mail-extr-last-letters))) |
| 419 | |
| 420 | ;; Matches a single word name. |
| 421 | ;; (defconst mail-extr-one-name-pattern |
| 422 | ;; (purecopy (concat "\\`" mail-extr-normal-name-pattern "\\'"))) |
| 423 | |
| 424 | ;; Matches normal two names with missing middle initial |
| 425 | ;; The first name is not allowed to have a hyphen because this can cause |
| 426 | ;; false matches where the "middle initial" is actually the first letter |
| 427 | ;; of the second part of the first name. |
| 428 | (defconst mail-extr-two-name-pattern |
| 429 | (purecopy |
| 430 | (concat "\\`\\(" mail-extr-normal-name-pattern |
| 431 | "\\|" mail-extr-initial-pattern |
| 432 | "\\) +\\(" mail-extr-name-pattern "\\)\\(,\\|\\'\\)"))) |
| 433 | |
| 434 | (defconst mail-extr-listserv-list-name-pattern |
| 435 | (purecopy "Multiple recipients of list \\([-A-Z]+\\)")) |
| 436 | |
| 437 | (defconst mail-extr-stupid-vms-date-stamp-pattern |
| 438 | (purecopy |
| 439 | "[0-9][0-9]-[JFMASOND][aepuco][nbrylgptvc]-[0-9][0-9][0-9][0-9] [0-9]+ *")) |
| 440 | |
| 441 | ;;; HZ -- GB (PRC Chinese character encoding) in ASCII embedding protocol |
| 442 | ;; |
| 443 | ;; In ASCII mode, a byte is interpreted as an ASCII character, unless a '~' is |
| 444 | ;; encountered. The character '~' is an escape character. By convention, it |
| 445 | ;; must be immediately followed ONLY by '~', '{' or '\n' (<LF>), with the |
| 446 | ;; following special meaning. |
| 447 | ;; |
| 448 | ;; o The escape sequence '~~' is interpreted as a '~'. |
| 449 | ;; o The escape-to-GB sequence '~{' switches the mode from ASCII to GB. |
| 450 | ;; o The escape sequence '~\n' is a line-continuation marker to be consumed |
| 451 | ;; with no output produced. |
| 452 | ;; |
| 453 | ;; In GB mode, characters are interpreted two bytes at a time as (pure) GB |
| 454 | ;; codes until the escape-from-GB code '~}' is read. This code switches the |
| 455 | ;; mode from GB back to ASCII. (Note that the escape-from-GB code '~}' |
| 456 | ;; ($7E7D) is outside the defined GB range.) |
| 457 | (defconst mail-extr-hz-embedded-gb-encoded-chinese-pattern |
| 458 | (purecopy "~{\\([^~].\\|~[^\}]\\)+~}")) |
| 459 | |
| 460 | ;; The leading optional lowercase letters are for a bastardized version of |
| 461 | ;; the encoding, as is the optional nature of the final slash. |
| 462 | (defconst mail-extr-x400-encoded-address-pattern |
| 463 | (purecopy "[a-z]?[a-z]?\\(/[A-Za-z]+\\(\\.[A-Za-z]+\\)?=[^/]+\\)+/?\\'")) |
| 464 | |
| 465 | (defconst mail-extr-x400-encoded-address-field-pattern-format |
| 466 | (purecopy "/%s=\\([^/]+\\)\\(/\\|\\'\\)")) |
| 467 | |
| 468 | (defconst mail-extr-x400-encoded-address-surname-pattern |
| 469 | ;; S stands for Surname (family name). |
| 470 | (purecopy |
| 471 | (format mail-extr-x400-encoded-address-field-pattern-format "[Ss]"))) |
| 472 | |
| 473 | (defconst mail-extr-x400-encoded-address-given-name-pattern |
| 474 | ;; G stands for Given name. |
| 475 | (purecopy |
| 476 | (format mail-extr-x400-encoded-address-field-pattern-format "[Gg]"))) |
| 477 | |
| 478 | (defconst mail-extr-x400-encoded-address-full-name-pattern |
| 479 | ;; PN stands for Personal Name. When used it represents the combination |
| 480 | ;; of the G and S fields. |
| 481 | ;; "The one system I used having this field asked it with the prompt |
| 482 | ;; `Personal Name'. But they mapped it into G and S on outgoing real |
| 483 | ;; X.400 addresses. As they mapped G and S into PN on incoming..." |
| 484 | (purecopy |
| 485 | (format mail-extr-x400-encoded-address-field-pattern-format "[Pp][Nn]"))) |
| 486 | |
| 487 | \f |
| 488 | |
| 489 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| 490 | ;; |
| 491 | ;; Syntax tables used for quick parsing. |
| 492 | ;; |
| 493 | |
| 494 | (defconst mail-extr-address-syntax-table (make-syntax-table)) |
| 495 | (defconst mail-extr-address-comment-syntax-table (make-syntax-table)) |
| 496 | (defconst mail-extr-address-domain-literal-syntax-table (make-syntax-table)) |
| 497 | (defconst mail-extr-address-text-comment-syntax-table (make-syntax-table)) |
| 498 | (defconst mail-extr-address-text-syntax-table (make-syntax-table)) |
| 499 | (mapc |
| 500 | (lambda (pair) |
| 501 | (let ((syntax-table (symbol-value (car pair)))) |
| 502 | (dolist (item (cdr pair)) |
| 503 | (if (eq 2 (length item)) |
| 504 | ;; modifying syntax of a single character |
| 505 | (modify-syntax-entry (car item) (car (cdr item)) syntax-table) |
| 506 | ;; modifying syntax of a range of characters |
| 507 | (let ((char (nth 0 item)) |
| 508 | (bound (nth 1 item)) |
| 509 | (syntax (nth 2 item))) |
| 510 | (while (<= char bound) |
| 511 | (modify-syntax-entry char syntax syntax-table) |
| 512 | (setq char (1+ char)))))))) |
| 513 | '((mail-extr-address-syntax-table |
| 514 | (?\000 ?\037 "w") ;control characters |
| 515 | (?\040 " ") ;SPC |
| 516 | (?! ?~ "w") ;printable characters |
| 517 | (?\177 "w") ;DEL |
| 518 | (?\t " ") |
| 519 | (?\r " ") |
| 520 | (?\n " ") |
| 521 | (?\( ".") |
| 522 | (?\) ".") |
| 523 | (?< ".") |
| 524 | (?> ".") |
| 525 | (?@ ".") |
| 526 | (?, ".") |
| 527 | (?\; ".") |
| 528 | (?: ".") |
| 529 | (?\\ "\\") |
| 530 | (?\" "\"") |
| 531 | (?. ".") |
| 532 | (?\[ ".") |
| 533 | (?\] ".") |
| 534 | ;; % and ! aren't RFC822 characters, but it is convenient to pretend |
| 535 | (?% ".") |
| 536 | (?! ".") ;; this needs to be word-constituent when not in .UUCP mode |
| 537 | ) |
| 538 | (mail-extr-address-comment-syntax-table |
| 539 | (?\000 ?\377 "w") |
| 540 | (?\040 " ") |
| 541 | (?\240 " ") |
| 542 | (?\t " ") |
| 543 | (?\r " ") |
| 544 | (?\n " ") |
| 545 | (?\( "\(\)") |
| 546 | (?\) "\)\(") |
| 547 | (?\\ "\\")) |
| 548 | (mail-extr-address-domain-literal-syntax-table |
| 549 | (?\000 ?\377 "w") |
| 550 | (?\040 " ") |
| 551 | (?\240 " ") |
| 552 | (?\t " ") |
| 553 | (?\r " ") |
| 554 | (?\n " ") |
| 555 | (?\[ "\(\]") ;?????? |
| 556 | (?\] "\)\[") ;?????? |
| 557 | (?\\ "\\")) |
| 558 | (mail-extr-address-text-comment-syntax-table |
| 559 | (?\000 ?\377 "w") |
| 560 | (?\040 " ") |
| 561 | (?\240 " ") |
| 562 | (?\t " ") |
| 563 | (?\r " ") |
| 564 | (?\n " ") |
| 565 | (?\( "\(\)") |
| 566 | (?\) "\)\(") |
| 567 | (?\[ "\(\]") |
| 568 | (?\] "\)\[") |
| 569 | (?\{ "\(\}") |
| 570 | (?\} "\)\{") |
| 571 | (?\\ "\\") |
| 572 | (?\" "\"") |
| 573 | ;; (?\' "\)\`") |
| 574 | ;; (?\` "\(\'") |
| 575 | ) |
| 576 | (mail-extr-address-text-syntax-table |
| 577 | (?\000 ?\177 ".") |
| 578 | (?\200 ?\377 "w") |
| 579 | (?\040 " ") |
| 580 | (?\t " ") |
| 581 | (?\r " ") |
| 582 | (?\n " ") |
| 583 | (?A ?Z "w") |
| 584 | (?a ?z "w") |
| 585 | (?- "w") |
| 586 | (?\} "w") |
| 587 | (?\{ "w") |
| 588 | (?| "w") |
| 589 | (?\' "w") |
| 590 | (?~ "w") |
| 591 | (?0 ?9 "w")) |
| 592 | )) |
| 593 | |
| 594 | \f |
| 595 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| 596 | ;; |
| 597 | ;; Utility functions and macros. |
| 598 | ;; |
| 599 | |
| 600 | ;; Fixme: There are Latin-1 nbsp below. If such characters should be |
| 601 | ;; included, this is the wrong thing to do -- it should use syntax (or |
| 602 | ;; regexp char classes). |
| 603 | |
| 604 | (defsubst mail-extr-skip-whitespace-forward () |
| 605 | ;; v19 fn skip-syntax-forward is more tasteful, but not byte-coded. |
| 606 | (skip-chars-forward " \t\n\r ")) |
| 607 | |
| 608 | (defsubst mail-extr-skip-whitespace-backward () |
| 609 | ;; v19 fn skip-syntax-backward is more tasteful, but not byte-coded. |
| 610 | (skip-chars-backward " \t\n\r ")) |
| 611 | |
| 612 | |
| 613 | (defsubst mail-extr-undo-backslash-quoting (beg end) |
| 614 | (save-excursion |
| 615 | (save-restriction |
| 616 | (narrow-to-region beg end) |
| 617 | (goto-char (point-min)) |
| 618 | ;; undo \ quoting |
| 619 | (while (search-forward "\\" nil t) |
| 620 | (delete-char -1) |
| 621 | (or (eobp) |
| 622 | (forward-char 1)))))) |
| 623 | |
| 624 | (defsubst mail-extr-nuke-char-at (pos) |
| 625 | (save-excursion |
| 626 | (goto-char pos) |
| 627 | (delete-char 1) |
| 628 | (insert ?\ ))) |
| 629 | |
| 630 | (put 'mail-extr-nuke-outside-range |
| 631 | 'edebug-form-spec '(symbolp &optional form form atom)) |
| 632 | |
| 633 | (defmacro mail-extr-nuke-outside-range (list-symbol |
| 634 | beg-symbol end-symbol |
| 635 | &optional no-replace) |
| 636 | "Delete all elements outside BEG..END in LIST. |
| 637 | LIST-SYMBOL names a variable holding a list of buffer positions |
| 638 | BEG-SYMBOL and END-SYMBOL name variables delimiting a range |
| 639 | Each element of LIST-SYMBOL which lies outside of the range is |
| 640 | deleted from the list. |
| 641 | Unless NO-REPLACE is true, at each of the positions in LIST-SYMBOL |
| 642 | which lie outside of the range, one character at that position is |
| 643 | replaced with a SPC." |
| 644 | (or (memq no-replace '(t nil)) |
| 645 | (error "no-replace must be t or nil, evaluable at macroexpand-time")) |
| 646 | `(let ((temp ,list-symbol) |
| 647 | ch) |
| 648 | (while temp |
| 649 | (setq ch (car temp)) |
| 650 | (when (or (> ch ,end-symbol) |
| 651 | (< ch ,beg-symbol)) |
| 652 | ,@(if no-replace |
| 653 | nil |
| 654 | `((mail-extr-nuke-char-at ch))) |
| 655 | (setcar temp nil)) |
| 656 | (setq temp (cdr temp))) |
| 657 | (setq ,list-symbol (delq nil ,list-symbol)))) |
| 658 | |
| 659 | (defun mail-extr-demarkerize (marker) |
| 660 | ;; if arg is a marker, destroys the marker, then returns the old value. |
| 661 | ;; otherwise returns the arg. |
| 662 | (if (markerp marker) |
| 663 | (let ((temp (marker-position marker))) |
| 664 | (set-marker marker nil) |
| 665 | temp) |
| 666 | marker)) |
| 667 | |
| 668 | (defun mail-extr-markerize (pos) |
| 669 | ;; coerces pos to a marker if non-nil. |
| 670 | (if (or (markerp pos) (null pos)) |
| 671 | pos |
| 672 | (copy-marker pos))) |
| 673 | |
| 674 | (defsubst mail-extr-safe-move-sexp (arg) |
| 675 | ;; Safely skip over one balanced sexp, if there is one. Return t if success. |
| 676 | (condition-case error |
| 677 | (progn |
| 678 | (goto-char (or (scan-sexps (point) arg) (point))) |
| 679 | t) |
| 680 | (error |
| 681 | ;; #### kludge kludge kludge kludge kludge kludge kludge !!! |
| 682 | (if (string-equal (nth 1 error) "Unbalanced parentheses") |
| 683 | nil |
| 684 | (while t |
| 685 | (signal (car error) (cdr error))))))) |
| 686 | \f |
| 687 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| 688 | ;; |
| 689 | ;; The main function to grind addresses |
| 690 | ;; |
| 691 | |
| 692 | (defvar disable-initial-guessing-flag) ; dynamic assignment |
| 693 | (defvar mailextr-cbeg) ; dynamic assignment |
| 694 | (defvar mailextr-cend) ; dynamic assignment |
| 695 | (defvar mail-extr-all-top-level-domains) ; Defined below. |
| 696 | |
| 697 | ;;;###autoload |
| 698 | (defun mail-extract-address-components (address &optional all) |
| 699 | "Given an RFC-822 address ADDRESS, extract full name and canonical address. |
| 700 | Returns a list of the form (FULL-NAME CANONICAL-ADDRESS). If no |
| 701 | name can be extracted, FULL-NAME will be nil. Also see |
| 702 | `mail-extr-ignore-single-names' and |
| 703 | `mail-extr-ignore-realname-equals-mailbox-name'. |
| 704 | |
| 705 | If the optional argument ALL is non-nil, then ADDRESS can contain zero |
| 706 | or more recipients, separated by commas, and we return a list of |
| 707 | the form ((FULL-NAME CANONICAL-ADDRESS) ...) with one element for |
| 708 | each recipient. If ALL is nil, then if ADDRESS contains more than |
| 709 | one recipients, all but the first is ignored. |
| 710 | |
| 711 | ADDRESS may be a string or a buffer. If it is a buffer, the visible |
| 712 | \(narrowed) portion of the buffer will be interpreted as the address. |
| 713 | \(This feature exists so that the clever caller might be able to avoid |
| 714 | consing a string.)" |
| 715 | (let ((canonicalization-buffer (get-buffer-create " *canonical address*")) |
| 716 | (extraction-buffer (get-buffer-create " *extract address components*")) |
| 717 | value-list) |
| 718 | |
| 719 | (with-current-buffer (get-buffer-create extraction-buffer) |
| 720 | (fundamental-mode) |
| 721 | (buffer-disable-undo extraction-buffer) |
| 722 | (set-syntax-table mail-extr-address-syntax-table) |
| 723 | (widen) |
| 724 | (erase-buffer) |
| 725 | (setq case-fold-search nil) |
| 726 | |
| 727 | ;; Insert extra space at beginning to allow later replacement with < |
| 728 | ;; without having to move markers. |
| 729 | (insert ?\ ) |
| 730 | |
| 731 | ;; Insert the address itself. |
| 732 | (cond ((stringp address) |
| 733 | (insert address)) |
| 734 | ((bufferp address) |
| 735 | (insert-buffer-substring address)) |
| 736 | (t |
| 737 | (error "Invalid address: %s" address))) |
| 738 | |
| 739 | (set-text-properties (point-min) (point-max) nil) |
| 740 | |
| 741 | (with-current-buffer (get-buffer-create canonicalization-buffer) |
| 742 | (fundamental-mode) |
| 743 | (buffer-disable-undo canonicalization-buffer) |
| 744 | (setq case-fold-search nil)) |
| 745 | |
| 746 | |
| 747 | ;; Unfold multiple lines. |
| 748 | (goto-char (point-min)) |
| 749 | (while (re-search-forward "\\([^\\]\\(\\\\\\\\\\)*\\)\n[ \t]" nil t) |
| 750 | (replace-match "\\1 " t)) |
| 751 | |
| 752 | ;; Loop over addresses until we have as many as we want. |
| 753 | (while (and (or all (null value-list)) |
| 754 | (progn (goto-char (point-min)) |
| 755 | (skip-chars-forward " \t") |
| 756 | (not (eobp)))) |
| 757 | (let (char |
| 758 | end-of-address |
| 759 | <-pos >-pos @-pos colon-pos comma-pos !-pos %-pos \;-pos |
| 760 | group-:-pos group-\;-pos route-addr-:-pos |
| 761 | record-pos-symbol |
| 762 | first-real-pos last-real-pos |
| 763 | phrase-beg phrase-end |
| 764 | ;; Dynamically set in mail-extr-voodoo. |
| 765 | mailextr-cbeg mailextr-cend |
| 766 | quote-beg quote-end |
| 767 | atom-beg atom-end |
| 768 | mbox-beg mbox-end |
| 769 | \.-ends-name |
| 770 | temp |
| 771 | ;; name-suffix |
| 772 | fi mi li ; first, middle, last initial |
| 773 | saved-%-pos saved-!-pos saved-@-pos |
| 774 | domain-pos \.-pos insert-point |
| 775 | ;; mailbox-name-processed-flag |
| 776 | disable-initial-guessing-flag) ; dynamically set from -voodoo |
| 777 | |
| 778 | (set-syntax-table mail-extr-address-syntax-table) |
| 779 | (goto-char (point-min)) |
| 780 | |
| 781 | ;; Insert extra space at beginning to allow later replacement with < |
| 782 | ;; without having to move markers. |
| 783 | (or (eq (following-char) ?\ ) |
| 784 | (insert ?\ )) |
| 785 | |
| 786 | ;; First pass grabs useful information about address. |
| 787 | (while (progn |
| 788 | (mail-extr-skip-whitespace-forward) |
| 789 | (not (eobp))) |
| 790 | (setq char (char-after (point))) |
| 791 | (or first-real-pos |
| 792 | (if (not (eq char ?\()) |
| 793 | (setq first-real-pos (point)))) |
| 794 | (cond |
| 795 | ;; comment |
| 796 | ((eq char ?\() |
| 797 | (set-syntax-table mail-extr-address-comment-syntax-table) |
| 798 | ;; only record the first non-empty comment's position |
| 799 | (if (and (not mailextr-cbeg) |
| 800 | (save-excursion |
| 801 | (forward-char 1) |
| 802 | (mail-extr-skip-whitespace-forward) |
| 803 | (not (eq ?\) (char-after (point)))))) |
| 804 | (setq mailextr-cbeg (point))) |
| 805 | ;; TODO: don't record if unbalanced |
| 806 | (or (mail-extr-safe-move-sexp 1) |
| 807 | (forward-char 1)) |
| 808 | (set-syntax-table mail-extr-address-syntax-table) |
| 809 | (if (and mailextr-cbeg |
| 810 | (not mailextr-cend)) |
| 811 | (setq mailextr-cend (point)))) |
| 812 | ;; quoted text |
| 813 | ((eq char ?\") |
| 814 | ;; only record the first non-empty quote's position |
| 815 | (if (and (not quote-beg) |
| 816 | (save-excursion |
| 817 | (forward-char 1) |
| 818 | (mail-extr-skip-whitespace-forward) |
| 819 | (not (eq ?\" (char-after (point)))))) |
| 820 | (setq quote-beg (point))) |
| 821 | ;; TODO: don't record if unbalanced |
| 822 | (or (mail-extr-safe-move-sexp 1) |
| 823 | (forward-char 1)) |
| 824 | (if (and quote-beg |
| 825 | (not quote-end)) |
| 826 | (setq quote-end (point)))) |
| 827 | ;; domain literals |
| 828 | ((eq char ?\[) |
| 829 | (set-syntax-table mail-extr-address-domain-literal-syntax-table) |
| 830 | (or (mail-extr-safe-move-sexp 1) |
| 831 | (forward-char 1)) |
| 832 | (set-syntax-table mail-extr-address-syntax-table)) |
| 833 | ;; commas delimit addresses when outside < > pairs. |
| 834 | ((and (eq char ?,) |
| 835 | (or (and (null <-pos) |
| 836 | ;; Handle ROUTE-ADDR address that is missing its <. |
| 837 | (not (eq ?@ (char-after (1+ (point)))))) |
| 838 | (and >-pos |
| 839 | ;; handle weird munged addresses |
| 840 | ;; BUG FIX: This test was reversed. Thanks to the |
| 841 | ;; brilliant Rod Whitby <rwhitby@research.canon.oz.au> |
| 842 | ;; for discovering this! |
| 843 | (< (car (last <-pos)) (car >-pos))))) |
| 844 | ;; The argument contains more than one address. |
| 845 | ;; Temporarily hide everything after this one. |
| 846 | (setq end-of-address (copy-marker (1+ (point)) t)) |
| 847 | (narrow-to-region (point-min) (1+ (point))) |
| 848 | (delete-char 1) |
| 849 | (setq char ?\() ; HAVE I NO SHAME?? |
| 850 | ) |
| 851 | ;; record the position of various interesting chars, determine |
| 852 | ;; validity later. |
| 853 | ((setq record-pos-symbol |
| 854 | (cdr (assq char |
| 855 | '((?< . <-pos) (?> . >-pos) (?@ . @-pos) |
| 856 | (?: . colon-pos) (?, . comma-pos) (?! . !-pos) |
| 857 | (?% . %-pos) (?\; . \;-pos))))) |
| 858 | (set record-pos-symbol |
| 859 | (cons (point) (symbol-value record-pos-symbol))) |
| 860 | (forward-char 1)) |
| 861 | ((eq char ?.) |
| 862 | (forward-char 1)) |
| 863 | ((memq char '( |
| 864 | ;; comment terminator invalid |
| 865 | ?\) |
| 866 | ;; domain literal terminator invalid |
| 867 | ?\] |
| 868 | ;; \ allowed only within quoted strings, |
| 869 | ;; domain literals, and comments |
| 870 | ?\\ |
| 871 | )) |
| 872 | (mail-extr-nuke-char-at (point)) |
| 873 | (forward-char 1)) |
| 874 | (t |
| 875 | ;; Do `(forward-word 1)', recognizing non-ASCII characters |
| 876 | ;; except Latin-1 nbsp as words. |
| 877 | (while (progn |
| 878 | (skip-chars-forward "^\000-\177Â ") |
| 879 | (and (not (eobp)) |
| 880 | (eq ?w (char-syntax (char-after))) |
| 881 | (progn |
| 882 | (forward-word 1) |
| 883 | (and (not (eobp)) |
| 884 | (> (char-after) ?\177) |
| 885 | (not (eq (char-after) ? ))))))))) |
| 886 | (or (eq char ?\() |
| 887 | ;; At the end of first address of a multiple address header. |
| 888 | (and (eq char ?,) |
| 889 | (eobp)) |
| 890 | (setq last-real-pos (point)))) |
| 891 | |
| 892 | ;; Use only the leftmost <, if any. Replace all others with spaces. |
| 893 | (while (cdr <-pos) |
| 894 | (mail-extr-nuke-char-at (car <-pos)) |
| 895 | (setq <-pos (cdr <-pos))) |
| 896 | |
| 897 | ;; Use only the rightmost >, if any. Replace all others with spaces. |
| 898 | (while (cdr >-pos) |
| 899 | (mail-extr-nuke-char-at (nth 1 >-pos)) |
| 900 | (setcdr >-pos (nthcdr 2 >-pos))) |
| 901 | |
| 902 | ;; If multiple @s and a :, but no < and >, insert around buffer. |
| 903 | ;; Example: @foo.bar.dom,@xxx.yyy.zzz:mailbox@aaa.bbb.ccc |
| 904 | ;; This commonly happens on the UUCP "From " line. Ugh. |
| 905 | (when (and (> (length @-pos) 1) |
| 906 | (eq 1 (length colon-pos)) ;TODO: check if between last two @s |
| 907 | (not \;-pos) |
| 908 | (not <-pos)) |
| 909 | (goto-char (point-min)) |
| 910 | (delete-char 1) |
| 911 | (setq <-pos (list (point))) |
| 912 | (insert ?<)) |
| 913 | |
| 914 | ;; If < but no >, insert > in rightmost possible position |
| 915 | (when (and <-pos (null >-pos)) |
| 916 | (goto-char (point-max)) |
| 917 | (setq >-pos (list (point))) |
| 918 | (insert ?>)) |
| 919 | |
| 920 | ;; If > but no <, replace > with space. |
| 921 | (when (and >-pos (null <-pos)) |
| 922 | (mail-extr-nuke-char-at (car >-pos)) |
| 923 | (setq >-pos nil)) |
| 924 | |
| 925 | ;; Turn >-pos and <-pos into non-lists |
| 926 | (setq >-pos (car >-pos) |
| 927 | <-pos (car <-pos)) |
| 928 | |
| 929 | ;; Trim other punctuation lists of items outside < > pair to handle |
| 930 | ;; stupid MTAs. |
| 931 | (when <-pos ; don't need to check >-pos also |
| 932 | ;; handle bozo software that violates RFC 822 by sticking |
| 933 | ;; punctuation marks outside of a < > pair |
| 934 | (mail-extr-nuke-outside-range @-pos <-pos >-pos t) |
| 935 | ;; RFC 822 says nothing about these two outside < >, but |
| 936 | ;; remove those positions from the lists to make things |
| 937 | ;; easier. |
| 938 | (mail-extr-nuke-outside-range !-pos <-pos >-pos t) |
| 939 | (mail-extr-nuke-outside-range %-pos <-pos >-pos t)) |
| 940 | |
| 941 | ;; Check for : that indicates GROUP list and for : part of |
| 942 | ;; ROUTE-ADDR spec. |
| 943 | ;; Can't possibly be more than two :. Nuke any extra. |
| 944 | (while colon-pos |
| 945 | (setq temp (car colon-pos) |
| 946 | colon-pos (cdr colon-pos)) |
| 947 | (cond ((and <-pos >-pos |
| 948 | (> temp <-pos) |
| 949 | (< temp >-pos)) |
| 950 | (if (or route-addr-:-pos |
| 951 | (< (length @-pos) 2) |
| 952 | (> temp (car @-pos)) |
| 953 | (< temp (nth 1 @-pos))) |
| 954 | (mail-extr-nuke-char-at temp) |
| 955 | (setq route-addr-:-pos temp))) |
| 956 | ((or (not <-pos) |
| 957 | (and <-pos |
| 958 | (< temp <-pos))) |
| 959 | (setq group-:-pos temp)))) |
| 960 | |
| 961 | ;; Nuke any ; that is in or to the left of a < > pair or to the left |
| 962 | ;; of a GROUP starting :. Also, there may only be one ;. |
| 963 | (while \;-pos |
| 964 | (setq temp (car \;-pos) |
| 965 | \;-pos (cdr \;-pos)) |
| 966 | (cond ((and <-pos >-pos |
| 967 | (> temp <-pos) |
| 968 | (< temp >-pos)) |
| 969 | (mail-extr-nuke-char-at temp)) |
| 970 | ((and (or (not group-:-pos) |
| 971 | (> temp group-:-pos)) |
| 972 | (not group-\;-pos)) |
| 973 | (setq group-\;-pos temp)))) |
| 974 | |
| 975 | ;; Nuke unmatched GROUP syntax characters. |
| 976 | (when (and group-:-pos (not group-\;-pos)) |
| 977 | ;; *** Do I really need to erase it? |
| 978 | (mail-extr-nuke-char-at group-:-pos) |
| 979 | (setq group-:-pos nil)) |
| 980 | (when (and group-\;-pos (not group-:-pos)) |
| 981 | ;; *** Do I really need to erase it? |
| 982 | (mail-extr-nuke-char-at group-\;-pos) |
| 983 | (setq group-\;-pos nil)) |
| 984 | |
| 985 | ;; Handle junk like ";@host.company.dom" that sendmail adds. |
| 986 | ;; **** should I remember comment positions? |
| 987 | (when group-\;-pos |
| 988 | ;; this is fine for now |
| 989 | (mail-extr-nuke-outside-range !-pos group-:-pos group-\;-pos t) |
| 990 | (mail-extr-nuke-outside-range @-pos group-:-pos group-\;-pos t) |
| 991 | (mail-extr-nuke-outside-range %-pos group-:-pos group-\;-pos t) |
| 992 | (mail-extr-nuke-outside-range comma-pos group-:-pos group-\;-pos t) |
| 993 | (and last-real-pos |
| 994 | (> last-real-pos (1+ group-\;-pos)) |
| 995 | (setq last-real-pos (1+ group-\;-pos))) |
| 996 | ;; *** This may be wrong: |
| 997 | (and mailextr-cend |
| 998 | (> mailextr-cend group-\;-pos) |
| 999 | (setq mailextr-cend nil |
| 1000 | mailextr-cbeg nil)) |
| 1001 | (and quote-end |
| 1002 | (> quote-end group-\;-pos) |
| 1003 | (setq quote-end nil |
| 1004 | quote-beg nil)) |
| 1005 | ;; This was both wrong and unnecessary: |
| 1006 | ;;(narrow-to-region (point-min) group-\;-pos) |
| 1007 | |
| 1008 | ;; *** The entire handling of GROUP addresses seems rather lame. |
| 1009 | ;; *** It deserves a complete rethink, except that these addresses |
| 1010 | ;; *** are hardly ever seen. |
| 1011 | ) |
| 1012 | |
| 1013 | ;; Any commas must be between < and : of ROUTE-ADDR. Nuke any |
| 1014 | ;; others. |
| 1015 | ;; Hell, go ahead and nuke all of the commas. |
| 1016 | ;; **** This will cause problems when we start handling commas in |
| 1017 | ;; the PHRASE part .... no it won't ... yes it will ... ????? |
| 1018 | (mail-extr-nuke-outside-range comma-pos 1 1) |
| 1019 | |
| 1020 | ;; can only have multiple @s inside < >. The fact that some MTAs |
| 1021 | ;; put de-bracketed ROUTE-ADDRs in the UUCP-style "From " line is |
| 1022 | ;; handled above. |
| 1023 | |
| 1024 | ;; Locate PHRASE part of ROUTE-ADDR. |
| 1025 | (when <-pos |
| 1026 | (goto-char <-pos) |
| 1027 | (mail-extr-skip-whitespace-backward) |
| 1028 | (setq phrase-end (point)) |
| 1029 | (goto-char (or ;;group-:-pos |
| 1030 | (point-min))) |
| 1031 | (mail-extr-skip-whitespace-forward) |
| 1032 | (if (< (point) phrase-end) |
| 1033 | (setq phrase-beg (point)) |
| 1034 | (setq phrase-end nil))) |
| 1035 | |
| 1036 | ;; handle ROUTE-ADDRS with real ROUTEs. |
| 1037 | ;; If there are multiple @s, then we assume ROUTE-ADDR syntax, and |
| 1038 | ;; any % or ! must be semantically meaningless. |
| 1039 | ;; TODO: do this processing into canonicalization buffer |
| 1040 | (when route-addr-:-pos |
| 1041 | (setq !-pos nil |
| 1042 | %-pos nil |
| 1043 | >-pos (copy-marker >-pos) |
| 1044 | route-addr-:-pos (copy-marker route-addr-:-pos)) |
| 1045 | (goto-char >-pos) |
| 1046 | (insert-before-markers ?X) |
| 1047 | (goto-char (car @-pos)) |
| 1048 | (while (setq @-pos (cdr @-pos)) |
| 1049 | (delete-char 1) |
| 1050 | (setq %-pos (cons (point-marker) %-pos)) |
| 1051 | (insert "%") |
| 1052 | (goto-char (1- >-pos)) |
| 1053 | (save-excursion |
| 1054 | (insert-buffer-substring extraction-buffer |
| 1055 | (car @-pos) route-addr-:-pos) |
| 1056 | (delete-region (car @-pos) route-addr-:-pos)) |
| 1057 | (or (cdr @-pos) |
| 1058 | (setq saved-@-pos (list (point))))) |
| 1059 | (setq @-pos saved-@-pos) |
| 1060 | (goto-char >-pos) |
| 1061 | (delete-char -1) |
| 1062 | (mail-extr-nuke-char-at route-addr-:-pos) |
| 1063 | (mail-extr-demarkerize route-addr-:-pos) |
| 1064 | (setq route-addr-:-pos nil |
| 1065 | >-pos (mail-extr-demarkerize >-pos) |
| 1066 | %-pos (mapcar 'mail-extr-demarkerize %-pos))) |
| 1067 | |
| 1068 | ;; de-listify @-pos |
| 1069 | (setq @-pos (car @-pos)) |
| 1070 | |
| 1071 | ;; TODO: remove comments in the middle of an address |
| 1072 | |
| 1073 | (with-current-buffer canonicalization-buffer |
| 1074 | (widen) |
| 1075 | (erase-buffer) |
| 1076 | (insert-buffer-substring extraction-buffer) |
| 1077 | |
| 1078 | (if <-pos |
| 1079 | (narrow-to-region (progn |
| 1080 | (goto-char (1+ <-pos)) |
| 1081 | (mail-extr-skip-whitespace-forward) |
| 1082 | (point)) |
| 1083 | >-pos) |
| 1084 | (if (and first-real-pos last-real-pos) |
| 1085 | (narrow-to-region first-real-pos last-real-pos) |
| 1086 | ;; ****** Oh no! What if the address is completely empty! |
| 1087 | ;; *** Is this correct? |
| 1088 | (narrow-to-region (point-max) (point-max)))) |
| 1089 | |
| 1090 | (and @-pos %-pos |
| 1091 | (mail-extr-nuke-outside-range %-pos (point-min) @-pos)) |
| 1092 | (and %-pos !-pos |
| 1093 | (mail-extr-nuke-outside-range !-pos (point-min) (car %-pos))) |
| 1094 | (and @-pos !-pos (not %-pos) |
| 1095 | (mail-extr-nuke-outside-range !-pos (point-min) @-pos)) |
| 1096 | |
| 1097 | ;; Error condition:?? (and %-pos (not @-pos)) |
| 1098 | |
| 1099 | ;; WARNING: THIS CODE IS DUPLICATED BELOW. |
| 1100 | (when (and %-pos (not @-pos)) |
| 1101 | (goto-char (car %-pos)) |
| 1102 | (delete-char 1) |
| 1103 | (setq @-pos (point)) |
| 1104 | (insert "@") |
| 1105 | (setq %-pos (cdr %-pos))) |
| 1106 | |
| 1107 | (when (and mail-extr-mangle-uucp !-pos) |
| 1108 | ;; **** I don't understand this save-restriction and the |
| 1109 | ;; narrow-to-region inside it. Why did I do that? |
| 1110 | (save-restriction |
| 1111 | (cond ((and @-pos |
| 1112 | mail-extr-@-binds-tighter-than-!) |
| 1113 | (goto-char @-pos) |
| 1114 | (setq %-pos (cons (point) %-pos) |
| 1115 | @-pos nil) |
| 1116 | (delete-char 1) |
| 1117 | (insert "%") |
| 1118 | (setq insert-point (point-max))) |
| 1119 | (mail-extr-@-binds-tighter-than-! |
| 1120 | (setq insert-point (point-max))) |
| 1121 | (%-pos |
| 1122 | (setq insert-point (car (last %-pos)) |
| 1123 | saved-%-pos (mapcar 'mail-extr-markerize %-pos) |
| 1124 | %-pos nil |
| 1125 | @-pos (mail-extr-markerize @-pos))) |
| 1126 | (@-pos |
| 1127 | (setq insert-point @-pos) |
| 1128 | (setq @-pos (mail-extr-markerize @-pos))) |
| 1129 | (t |
| 1130 | (setq insert-point (point-max)))) |
| 1131 | (narrow-to-region (point-min) insert-point) |
| 1132 | (setq saved-!-pos (car !-pos)) |
| 1133 | (while !-pos |
| 1134 | (goto-char (point-max)) |
| 1135 | (cond ((and (not @-pos) |
| 1136 | (not (cdr !-pos))) |
| 1137 | (setq @-pos (point)) |
| 1138 | (insert-before-markers "@ ")) |
| 1139 | (t |
| 1140 | (setq %-pos (cons (point) %-pos)) |
| 1141 | (insert-before-markers "% "))) |
| 1142 | (backward-char 1) |
| 1143 | (insert-buffer-substring |
| 1144 | (current-buffer) |
| 1145 | (if (nth 1 !-pos) |
| 1146 | (1+ (nth 1 !-pos)) |
| 1147 | (point-min)) |
| 1148 | (car !-pos)) |
| 1149 | (delete-char 1) |
| 1150 | (or (save-excursion |
| 1151 | (mail-extr-safe-move-sexp -1) |
| 1152 | (mail-extr-skip-whitespace-backward) |
| 1153 | (eq ?. (preceding-char))) |
| 1154 | (insert-before-markers |
| 1155 | (if (save-excursion |
| 1156 | (mail-extr-skip-whitespace-backward) |
| 1157 | (eq ?. (preceding-char))) |
| 1158 | "" |
| 1159 | ".") |
| 1160 | "uucp")) |
| 1161 | (setq !-pos (cdr !-pos)))) |
| 1162 | (and saved-%-pos |
| 1163 | (setq %-pos (append (mapcar 'mail-extr-demarkerize |
| 1164 | saved-%-pos) |
| 1165 | %-pos))) |
| 1166 | (setq @-pos (mail-extr-demarkerize @-pos)) |
| 1167 | (narrow-to-region (1+ saved-!-pos) (point-max))) |
| 1168 | |
| 1169 | ;; WARNING: THIS CODE IS DUPLICATED ABOVE. |
| 1170 | (when (and %-pos (not @-pos)) |
| 1171 | (goto-char (car %-pos)) |
| 1172 | (delete-char 1) |
| 1173 | (setq @-pos (point)) |
| 1174 | (insert "@") |
| 1175 | (setq %-pos (cdr %-pos))) |
| 1176 | |
| 1177 | (when (setq %-pos (nreverse %-pos)) ; implies @-pos valid |
| 1178 | (setq temp %-pos) |
| 1179 | (catch 'truncated |
| 1180 | (while temp |
| 1181 | (goto-char (or (nth 1 temp) |
| 1182 | @-pos)) |
| 1183 | (mail-extr-skip-whitespace-backward) |
| 1184 | (save-excursion |
| 1185 | (mail-extr-safe-move-sexp -1) |
| 1186 | (setq domain-pos (point)) |
| 1187 | (mail-extr-skip-whitespace-backward) |
| 1188 | (setq \.-pos (eq ?. (preceding-char)))) |
| 1189 | (when (and \.-pos |
| 1190 | ;; #### string consing |
| 1191 | (let ((s (intern-soft |
| 1192 | (buffer-substring domain-pos (point)) |
| 1193 | mail-extr-all-top-level-domains))) |
| 1194 | (and s (get s 'domain-name)))) |
| 1195 | (narrow-to-region (point-min) (point)) |
| 1196 | (goto-char (car temp)) |
| 1197 | (delete-char 1) |
| 1198 | (setq @-pos (point)) |
| 1199 | (setcdr temp nil) |
| 1200 | (setq %-pos (delq @-pos %-pos)) |
| 1201 | (insert "@") |
| 1202 | (throw 'truncated t)) |
| 1203 | (setq temp (cdr temp))))) |
| 1204 | (setq mbox-beg (point-min) |
| 1205 | mbox-end (if %-pos (car %-pos) |
| 1206 | (or @-pos |
| 1207 | (point-max)))) |
| 1208 | |
| 1209 | (when @-pos |
| 1210 | ;; Make the domain-name part lowercase since it's case |
| 1211 | ;; insensitive anyway. |
| 1212 | (downcase-region (1+ @-pos) (point-max)))) |
| 1213 | |
| 1214 | ;; Done canonicalizing address. |
| 1215 | ;; We are now back in extraction-buffer. |
| 1216 | |
| 1217 | ;; Decide what part of the address to search to find the full name. |
| 1218 | (cond ( |
| 1219 | ;; Example: "First M. Last" <fml@foo.bar.dom> |
| 1220 | (and phrase-beg |
| 1221 | (eq quote-beg phrase-beg) |
| 1222 | (<= quote-end phrase-end)) |
| 1223 | (narrow-to-region (1+ quote-beg) (1- quote-end)) |
| 1224 | (mail-extr-undo-backslash-quoting (point-min) (point-max))) |
| 1225 | |
| 1226 | ;; Example: First Last <fml@foo.bar.dom> |
| 1227 | (phrase-beg |
| 1228 | (narrow-to-region phrase-beg phrase-end)) |
| 1229 | |
| 1230 | ;; Example: fml@foo.bar.dom (First M. Last) |
| 1231 | (mailextr-cbeg |
| 1232 | (narrow-to-region (1+ mailextr-cbeg) (1- mailextr-cend)) |
| 1233 | (mail-extr-undo-backslash-quoting (point-min) (point-max)) |
| 1234 | |
| 1235 | ;; Deal with spacing problems |
| 1236 | (goto-char (point-min)) |
| 1237 | ;;; (cond ((not (search-forward " " nil t)) |
| 1238 | ;;; (goto-char (point-min)) |
| 1239 | ;;; (cond ((search-forward "_" nil t) |
| 1240 | ;;; ;; Handle the *idiotic* use of underlines as spaces. |
| 1241 | ;;; ;; Example: fml@foo.bar.dom (First_M._Last) |
| 1242 | ;;; (goto-char (point-min)) |
| 1243 | ;;; (while (search-forward "_" nil t) |
| 1244 | ;;; (replace-match " " t))) |
| 1245 | ;;; ((search-forward "." nil t) |
| 1246 | ;;; ;; Fix . used as space |
| 1247 | ;;; ;; Example: danj1@cb.att.com (daniel.jacobson) |
| 1248 | ;;; (goto-char (point-min)) |
| 1249 | ;;; (while (re-search-forward mail-extr-bad-dot-pattern nil t) |
| 1250 | ;;; (replace-match "\\1 \\2" t)))))) |
| 1251 | ) |
| 1252 | |
| 1253 | ;; Otherwise we try to get the name from the mailbox portion |
| 1254 | ;; of the address. |
| 1255 | ;; Example: First_M_Last@foo.bar.dom |
| 1256 | (t |
| 1257 | ;; *** Work in canon buffer instead? No, can't. Hmm. |
| 1258 | (goto-char (point-max)) |
| 1259 | (narrow-to-region (point) (point)) |
| 1260 | (insert-buffer-substring canonicalization-buffer |
| 1261 | mbox-beg mbox-end) |
| 1262 | (goto-char (point-min)) |
| 1263 | |
| 1264 | ;; Example: First_Last.XXX@foo.bar.dom |
| 1265 | (setq \.-ends-name (re-search-forward "[_0-9]" nil t)) |
| 1266 | |
| 1267 | (goto-char (point-min)) |
| 1268 | |
| 1269 | (if (not mail-extr-mangle-uucp) |
| 1270 | (modify-syntax-entry ?! "w" (syntax-table))) |
| 1271 | |
| 1272 | (while (progn |
| 1273 | (mail-extr-skip-whitespace-forward) |
| 1274 | (not (eobp))) |
| 1275 | (setq char (char-after (point))) |
| 1276 | (cond |
| 1277 | ((eq char ?\") |
| 1278 | (setq quote-beg (point)) |
| 1279 | (or (mail-extr-safe-move-sexp 1) |
| 1280 | ;; TODO: handle this error condition!!!!! |
| 1281 | (forward-char 1)) |
| 1282 | ;; take into account deletions |
| 1283 | (setq quote-end (- (point) 2)) |
| 1284 | (save-excursion |
| 1285 | (backward-char 1) |
| 1286 | (delete-char 1) |
| 1287 | (goto-char quote-beg) |
| 1288 | (or (eobp) |
| 1289 | (delete-char 1))) |
| 1290 | (mail-extr-undo-backslash-quoting quote-beg quote-end) |
| 1291 | (or (eq ?\ (char-after (point))) |
| 1292 | (insert " ")) |
| 1293 | ;; (setq mailbox-name-processed-flag t) |
| 1294 | (setq \.-ends-name t)) |
| 1295 | ((eq char ?.) |
| 1296 | (if (memq (char-after (1+ (point))) '(?_ ?=)) |
| 1297 | (progn |
| 1298 | (forward-char 1) |
| 1299 | (delete-char 1) |
| 1300 | (insert ?\ )) |
| 1301 | (if \.-ends-name |
| 1302 | (narrow-to-region (point-min) (point)) |
| 1303 | (delete-char 1) |
| 1304 | (insert " "))) |
| 1305 | ;; (setq mailbox-name-processed-flag t) |
| 1306 | ) |
| 1307 | ((memq (char-syntax char) '(?. ?\\)) |
| 1308 | (delete-char 1) |
| 1309 | (insert " ") |
| 1310 | ;; (setq mailbox-name-processed-flag t) |
| 1311 | ) |
| 1312 | (t |
| 1313 | (setq atom-beg (point)) |
| 1314 | (forward-word 1) |
| 1315 | (setq atom-end (point)) |
| 1316 | (goto-char atom-beg) |
| 1317 | (save-restriction |
| 1318 | (narrow-to-region atom-beg atom-end) |
| 1319 | (cond |
| 1320 | |
| 1321 | ;; Handle X.400 addresses encoded in RFC-822. |
| 1322 | ;; *** Shit! This has to handle the case where it is |
| 1323 | ;; *** embedded in a quote too! |
| 1324 | ;; *** Shit! The input is being broken up into atoms |
| 1325 | ;; *** by periods! |
| 1326 | ((looking-at mail-extr-x400-encoded-address-pattern) |
| 1327 | |
| 1328 | ;; Copy the contents of the individual fields that |
| 1329 | ;; might hold name data to the beginning. |
| 1330 | (mapc |
| 1331 | (lambda (field-pattern) |
| 1332 | (when |
| 1333 | (save-excursion |
| 1334 | (re-search-forward field-pattern nil t)) |
| 1335 | (insert-buffer-substring (current-buffer) |
| 1336 | (match-beginning 1) |
| 1337 | (match-end 1)) |
| 1338 | (insert " "))) |
| 1339 | (list mail-extr-x400-encoded-address-given-name-pattern |
| 1340 | mail-extr-x400-encoded-address-surname-pattern |
| 1341 | mail-extr-x400-encoded-address-full-name-pattern)) |
| 1342 | |
| 1343 | ;; Discard the rest, since it contains stuff like |
| 1344 | ;; routing information, not part of a name. |
| 1345 | (mail-extr-skip-whitespace-backward) |
| 1346 | (delete-region (point) (point-max)) |
| 1347 | |
| 1348 | ;; Handle periods used for spacing. |
| 1349 | (while (re-search-forward mail-extr-bad-dot-pattern nil t) |
| 1350 | (replace-match "\\1 \\2" t)) |
| 1351 | |
| 1352 | ;; (setq mailbox-name-processed-flag t) |
| 1353 | ) |
| 1354 | |
| 1355 | ;; Handle normal addresses. |
| 1356 | (t |
| 1357 | (goto-char (point-min)) |
| 1358 | ;; Handle _ and = used for spacing. |
| 1359 | (while (re-search-forward "\\([^_=]+\\)[_=]" nil t) |
| 1360 | (replace-match "\\1 " t) |
| 1361 | ;; (setq mailbox-name-processed-flag t) |
| 1362 | ) |
| 1363 | (goto-char (point-max)))))))) |
| 1364 | |
| 1365 | ;; undo the dirty deed |
| 1366 | (if (not mail-extr-mangle-uucp) |
| 1367 | (modify-syntax-entry ?! "." (syntax-table))) |
| 1368 | ;; |
| 1369 | ;; If we derived the name from the mailbox part of the address, |
| 1370 | ;; and we only got one word out of it, don't treat that as a |
| 1371 | ;; name. "foo@bar" --> (nil "foo@bar"), not ("foo" "foo@bar") |
| 1372 | ;; (if (not mailbox-name-processed-flag) |
| 1373 | ;; (delete-region (point-min) (point-max))) |
| 1374 | )) |
| 1375 | |
| 1376 | (set-syntax-table mail-extr-address-text-syntax-table) |
| 1377 | |
| 1378 | (mail-extr-voodoo mbox-beg mbox-end canonicalization-buffer) |
| 1379 | (goto-char (point-min)) |
| 1380 | |
| 1381 | ;; If name is "First Last" and userid is "F?L", then assume |
| 1382 | ;; the middle initial is the second letter in the userid. |
| 1383 | ;; Initial code by Jamie Zawinski <jwz@lucid.com> |
| 1384 | ;; *** Make it work when there's a suffix as well. |
| 1385 | (goto-char (point-min)) |
| 1386 | (when (and mail-extr-guess-middle-initial |
| 1387 | (not disable-initial-guessing-flag) |
| 1388 | (eq 3 (- mbox-end mbox-beg)) |
| 1389 | (progn |
| 1390 | (goto-char (point-min)) |
| 1391 | (looking-at mail-extr-two-name-pattern))) |
| 1392 | (setq fi (char-after (match-beginning 0)) |
| 1393 | li (char-after (match-beginning 3))) |
| 1394 | (with-current-buffer canonicalization-buffer |
| 1395 | ;; char-equal is ignoring case here, so no need to upcase |
| 1396 | ;; or downcase. |
| 1397 | (let ((case-fold-search t)) |
| 1398 | (and (char-equal fi (char-after mbox-beg)) |
| 1399 | (char-equal li (char-after (1- mbox-end))) |
| 1400 | (setq mi (char-after (1+ mbox-beg)))))) |
| 1401 | (when (and mi |
| 1402 | ;; TODO: use better table than syntax table |
| 1403 | (eq ?w (char-syntax mi))) |
| 1404 | (goto-char (match-beginning 3)) |
| 1405 | (insert (upcase mi) ". "))) |
| 1406 | |
| 1407 | ;; Nuke name if it is the same as mailbox name. |
| 1408 | (let ((buffer-length (- (point-max) (point-min))) |
| 1409 | (i 0) |
| 1410 | (names-match-flag t)) |
| 1411 | (when (and (> buffer-length 0) |
| 1412 | (eq buffer-length (- mbox-end mbox-beg))) |
| 1413 | (goto-char (point-max)) |
| 1414 | (insert-buffer-substring canonicalization-buffer |
| 1415 | mbox-beg mbox-end) |
| 1416 | (while (and names-match-flag |
| 1417 | (< i buffer-length)) |
| 1418 | (or (eq (downcase (char-after (+ i (point-min)))) |
| 1419 | (downcase |
| 1420 | (char-after (+ i buffer-length (point-min))))) |
| 1421 | (setq names-match-flag nil)) |
| 1422 | (setq i (1+ i))) |
| 1423 | (delete-region (+ (point-min) buffer-length) (point-max)) |
| 1424 | (and names-match-flag |
| 1425 | mail-extr-ignore-realname-equals-mailbox-name |
| 1426 | (narrow-to-region (point) (point))))) |
| 1427 | |
| 1428 | ;; Nuke name if it's just one word. |
| 1429 | (goto-char (point-min)) |
| 1430 | (and mail-extr-ignore-single-names |
| 1431 | (not (re-search-forward "[- ]" nil t)) |
| 1432 | (narrow-to-region (point) (point))) |
| 1433 | |
| 1434 | ;; Record the result |
| 1435 | (setq value-list |
| 1436 | (cons (list (if (not (= (point-min) (point-max))) |
| 1437 | (buffer-string)) |
| 1438 | (with-current-buffer canonicalization-buffer |
| 1439 | (if (not (= (point-min) (point-max))) |
| 1440 | (buffer-string)))) |
| 1441 | value-list)) |
| 1442 | |
| 1443 | ;; Unless one address is all we wanted, |
| 1444 | ;; delete this one from extraction-buffer |
| 1445 | ;; and get ready to extract the next address. |
| 1446 | (when all |
| 1447 | (if end-of-address |
| 1448 | (narrow-to-region 1 end-of-address) |
| 1449 | (widen)) |
| 1450 | (delete-region (point-min) (point-max)) |
| 1451 | (widen)) |
| 1452 | ))) |
| 1453 | (if all (nreverse value-list) (car value-list)) |
| 1454 | )) |
| 1455 | |
| 1456 | (defcustom mail-extr-disable-voodoo "\\cj" |
| 1457 | "*If it is a regexp, names matching it will never be modified. |
| 1458 | If it is neither nil nor a string, modifying of names will never take |
| 1459 | place. It affects how `mail-extract-address-components' works." |
| 1460 | :type '(choice (regexp :size 0) |
| 1461 | (const :tag "Always enabled" nil) |
| 1462 | (const :tag "Always disabled" t)) |
| 1463 | :group 'mail-extr) |
| 1464 | |
| 1465 | (defun mail-extr-voodoo (mbox-beg mbox-end canonicalization-buffer) |
| 1466 | (unless (and mail-extr-disable-voodoo |
| 1467 | (or (not (stringp mail-extr-disable-voodoo)) |
| 1468 | (progn |
| 1469 | (goto-char (point-min)) |
| 1470 | (re-search-forward mail-extr-disable-voodoo nil t)))) |
| 1471 | (let ((word-count 0) |
| 1472 | (case-fold-search nil) |
| 1473 | mixed-case-flag lower-case-flag ;;upper-case-flag |
| 1474 | suffix-flag last-name-comma-flag |
| 1475 | initial |
| 1476 | begin-again-flag |
| 1477 | drop-this-word-if-trailing-flag |
| 1478 | drop-last-word-if-trailing-flag |
| 1479 | word-found-flag |
| 1480 | this-word-beg last-word-beg |
| 1481 | name-beg name-end |
| 1482 | name-done-flag |
| 1483 | ) |
| 1484 | (save-excursion |
| 1485 | (set-syntax-table mail-extr-address-text-syntax-table) |
| 1486 | |
| 1487 | ;; Get rid of comments. |
| 1488 | (goto-char (point-min)) |
| 1489 | (while (not (eobp)) |
| 1490 | ;; Initialize for this iteration of the loop. |
| 1491 | (skip-chars-forward "^({[\"'`") |
| 1492 | (let ((cbeg (point))) |
| 1493 | (set-syntax-table mail-extr-address-text-comment-syntax-table) |
| 1494 | (if (memq (following-char) '(?\' ?\`)) |
| 1495 | (search-forward "'" nil 'move |
| 1496 | (if (eq ?\' (following-char)) 2 1)) |
| 1497 | (or (mail-extr-safe-move-sexp 1) |
| 1498 | (goto-char (point-max)))) |
| 1499 | (set-syntax-table mail-extr-address-text-syntax-table) |
| 1500 | (when (eq (char-after cbeg) ?\() |
| 1501 | ;; Delete the comment itself. |
| 1502 | (delete-region cbeg (point)) |
| 1503 | ;; Canonicalize whitespace where the comment was. |
| 1504 | (skip-chars-backward " \t") |
| 1505 | (if (looking-at "\\([ \t]+$\\|[ \t]+,\\)") |
| 1506 | (replace-match "") |
| 1507 | (setq cbeg (point)) |
| 1508 | (skip-chars-forward " \t") |
| 1509 | (if (bobp) |
| 1510 | (delete-region (point) cbeg) |
| 1511 | (just-one-space)))))) |
| 1512 | |
| 1513 | ;; This was moved above. |
| 1514 | ;; Fix . used as space |
| 1515 | ;; But it belongs here because it occurs not only as |
| 1516 | ;; rypens@reks.uia.ac.be (Piet.Rypens) |
| 1517 | ;; but also as |
| 1518 | ;; "Piet.Rypens" <rypens@reks.uia.ac.be> |
| 1519 | ;;(goto-char (point-min)) |
| 1520 | ;;(while (re-search-forward mail-extr-bad-dot-pattern nil t) |
| 1521 | ;; (replace-match "\\1 \\2" t)) |
| 1522 | |
| 1523 | (unless (search-forward " " nil t) |
| 1524 | (goto-char (point-min)) |
| 1525 | (cond ((search-forward "_" nil t) |
| 1526 | ;; Handle the *idiotic* use of underlines as spaces. |
| 1527 | ;; Example: fml@foo.bar.dom (First_M._Last) |
| 1528 | (goto-char (point-min)) |
| 1529 | (while (search-forward "_" nil t) |
| 1530 | (replace-match " " t))) |
| 1531 | ((search-forward "." nil t) |
| 1532 | ;; Fix . used as space |
| 1533 | ;; Example: danj1@cb.att.com (daniel.jacobson) |
| 1534 | (goto-char (point-min)) |
| 1535 | (while (re-search-forward mail-extr-bad-dot-pattern nil t) |
| 1536 | (replace-match "\\1 \\2" t))))) |
| 1537 | |
| 1538 | ;; Loop over the words (and other junk) in the name. |
| 1539 | (goto-char (point-min)) |
| 1540 | (while (not name-done-flag) |
| 1541 | |
| 1542 | (when word-found-flag |
| 1543 | ;; Last time through this loop we skipped over a word. |
| 1544 | (setq last-word-beg this-word-beg) |
| 1545 | (setq drop-last-word-if-trailing-flag |
| 1546 | drop-this-word-if-trailing-flag) |
| 1547 | (setq word-found-flag nil)) |
| 1548 | |
| 1549 | (when begin-again-flag |
| 1550 | ;; Last time through the loop we found something that |
| 1551 | ;; indicates we should pretend we are beginning again from |
| 1552 | ;; the start. |
| 1553 | (setq word-count 0) |
| 1554 | (setq last-word-beg nil) |
| 1555 | (setq drop-last-word-if-trailing-flag nil) |
| 1556 | (setq mixed-case-flag nil) |
| 1557 | (setq lower-case-flag nil) |
| 1558 | ;; (setq upper-case-flag nil) |
| 1559 | (setq begin-again-flag nil)) |
| 1560 | |
| 1561 | ;; Initialize for this iteration of the loop. |
| 1562 | (mail-extr-skip-whitespace-forward) |
| 1563 | (if (eq word-count 0) (narrow-to-region (point) (point-max))) |
| 1564 | (setq this-word-beg (point)) |
| 1565 | (setq drop-this-word-if-trailing-flag nil) |
| 1566 | |
| 1567 | ;; Decide what to do based on what we are looking at. |
| 1568 | (cond |
| 1569 | |
| 1570 | ;; Delete title |
| 1571 | ((and (eq word-count 0) |
| 1572 | (looking-at mail-extr-full-name-prefixes)) |
| 1573 | (goto-char (match-end 0)) |
| 1574 | (narrow-to-region (point) (point-max))) |
| 1575 | |
| 1576 | ;; Stop after name suffix |
| 1577 | ((and (>= word-count 2) |
| 1578 | (looking-at mail-extr-full-name-suffix-pattern)) |
| 1579 | (mail-extr-skip-whitespace-backward) |
| 1580 | (setq suffix-flag (point)) |
| 1581 | (if (eq ?, (following-char)) |
| 1582 | (forward-char 1) |
| 1583 | (insert ?,)) |
| 1584 | ;; Enforce at least one space after comma |
| 1585 | (or (eq ?\ (following-char)) |
| 1586 | (insert ?\ )) |
| 1587 | (mail-extr-skip-whitespace-forward) |
| 1588 | (cond ((memq (following-char) '(?j ?J ?s ?S)) |
| 1589 | (capitalize-word 1) |
| 1590 | (if (eq (following-char) ?.) |
| 1591 | (forward-char 1) |
| 1592 | (insert ?.))) |
| 1593 | (t |
| 1594 | (upcase-word 1))) |
| 1595 | (setq word-found-flag t) |
| 1596 | (setq name-done-flag t)) |
| 1597 | |
| 1598 | ;; Handle SCA names |
| 1599 | ((looking-at "MKA \\(.+\\)") ; "Mundanely Known As" |
| 1600 | (goto-char (match-beginning 1)) |
| 1601 | (narrow-to-region (point) (point-max)) |
| 1602 | (setq begin-again-flag t)) |
| 1603 | |
| 1604 | ;; Check for initial last name followed by comma |
| 1605 | ((and (eq ?, (following-char)) |
| 1606 | (eq word-count 1)) |
| 1607 | (forward-char 1) |
| 1608 | (setq last-name-comma-flag t) |
| 1609 | (or (eq ?\ (following-char)) |
| 1610 | (insert ?\ ))) |
| 1611 | |
| 1612 | ;; Stop before trailing comma-separated comment |
| 1613 | ;; THIS CASE MUST BE AFTER THE PRECEDING CASES. |
| 1614 | ;; *** This case is redundant??? |
| 1615 | ;;((eq ?, (following-char)) |
| 1616 | ;; (setq name-done-flag t)) |
| 1617 | |
| 1618 | ;; Delete parenthesized/quoted comment/nickname |
| 1619 | ((memq (following-char) '(?\( ?\{ ?\[ ?\" ?\' ?\`)) |
| 1620 | (setq mailextr-cbeg (point)) |
| 1621 | (set-syntax-table mail-extr-address-text-comment-syntax-table) |
| 1622 | (cond ((memq (following-char) '(?\' ?\`)) |
| 1623 | (or (search-forward "'" nil t |
| 1624 | (if (eq ?\' (following-char)) 2 1)) |
| 1625 | (delete-char 1))) |
| 1626 | (t |
| 1627 | (or (mail-extr-safe-move-sexp 1) |
| 1628 | (goto-char (point-max))))) |
| 1629 | (set-syntax-table mail-extr-address-text-syntax-table) |
| 1630 | (setq mailextr-cend (point)) |
| 1631 | (cond |
| 1632 | ;; Handle case of entire name being quoted |
| 1633 | ((and (eq word-count 0) |
| 1634 | (looking-at " *\\'") |
| 1635 | (>= (- mailextr-cend mailextr-cbeg) 2)) |
| 1636 | (narrow-to-region (1+ mailextr-cbeg) (1- mailextr-cend)) |
| 1637 | (goto-char (point-min))) |
| 1638 | (t |
| 1639 | ;; Handle case of quoted initial |
| 1640 | (if (and (or (= 3 (- mailextr-cend mailextr-cbeg)) |
| 1641 | (and (= 4 (- mailextr-cend mailextr-cbeg)) |
| 1642 | (eq ?. (char-after (+ 2 mailextr-cbeg))))) |
| 1643 | (not (looking-at " *\\'"))) |
| 1644 | (setq initial (char-after (1+ mailextr-cbeg))) |
| 1645 | (setq initial nil)) |
| 1646 | (delete-region mailextr-cbeg mailextr-cend) |
| 1647 | (if initial |
| 1648 | (insert initial ". "))))) |
| 1649 | |
| 1650 | ;; Handle *Stupid* VMS date stamps |
| 1651 | ((looking-at mail-extr-stupid-vms-date-stamp-pattern) |
| 1652 | (replace-match "" t)) |
| 1653 | |
| 1654 | ;; Handle Chinese characters. |
| 1655 | ((looking-at mail-extr-hz-embedded-gb-encoded-chinese-pattern) |
| 1656 | (goto-char (match-end 0)) |
| 1657 | (setq word-found-flag t)) |
| 1658 | |
| 1659 | ;; Skip initial garbage characters. |
| 1660 | ;; THIS CASE MUST BE AFTER THE PRECEDING CASES. |
| 1661 | ((and (eq word-count 0) |
| 1662 | (looking-at mail-extr-leading-garbage)) |
| 1663 | (goto-char (match-end 0)) |
| 1664 | ;; *** Skip backward over these??? |
| 1665 | ;; (skip-chars-backward "& \"") |
| 1666 | (narrow-to-region (point) (point-max))) |
| 1667 | |
| 1668 | ;; Various stopping points |
| 1669 | ((or |
| 1670 | |
| 1671 | ;; Stop before ALL CAPS acronyms, if preceded by mixed-case |
| 1672 | ;; words. Example: XT-DEM. |
| 1673 | (and (>= word-count 2) |
| 1674 | mixed-case-flag |
| 1675 | (looking-at mail-extr-weird-acronym-pattern) |
| 1676 | (not (looking-at mail-extr-roman-numeral-pattern))) |
| 1677 | |
| 1678 | ;; Stop before trailing alternative address |
| 1679 | (looking-at mail-extr-alternative-address-pattern) |
| 1680 | |
| 1681 | ;; Stop before trailing comment not introduced by comma |
| 1682 | ;; THIS CASE MUST BE AFTER AN EARLIER CASE. |
| 1683 | (looking-at mail-extr-trailing-comment-start-pattern) |
| 1684 | |
| 1685 | ;; Stop before telephone numbers |
| 1686 | (and (>= word-count 1) |
| 1687 | (looking-at mail-extr-telephone-extension-pattern))) |
| 1688 | (setq name-done-flag t)) |
| 1689 | |
| 1690 | ;; Delete ham radio call signs |
| 1691 | ((looking-at mail-extr-ham-call-sign-pattern) |
| 1692 | (delete-region (match-beginning 0) (match-end 0))) |
| 1693 | |
| 1694 | ;; Fixup initials |
| 1695 | ((looking-at mail-extr-initial-pattern) |
| 1696 | (or (eq (following-char) (upcase (following-char))) |
| 1697 | (setq lower-case-flag t)) |
| 1698 | (forward-char 1) |
| 1699 | (if (eq ?. (following-char)) |
| 1700 | (forward-char 1) |
| 1701 | (insert ?.)) |
| 1702 | (or (eq ?\ (following-char)) |
| 1703 | (insert ?\ )) |
| 1704 | (setq word-found-flag t)) |
| 1705 | |
| 1706 | ;; Handle BITNET LISTSERV list names. |
| 1707 | ((and (eq word-count 0) |
| 1708 | (looking-at mail-extr-listserv-list-name-pattern)) |
| 1709 | (narrow-to-region (match-beginning 1) (match-end 1)) |
| 1710 | (setq word-found-flag t) |
| 1711 | (setq name-done-flag t)) |
| 1712 | |
| 1713 | ;; Handle & substitution, when & is last and is not first. |
| 1714 | ((and (> word-count 0) |
| 1715 | (eq ?\ (preceding-char)) |
| 1716 | (eq (following-char) ?&) |
| 1717 | (eq (1+ (point)) (point-max))) |
| 1718 | (delete-char 1) |
| 1719 | (capitalize-region |
| 1720 | (point) |
| 1721 | (progn |
| 1722 | (insert-buffer-substring canonicalization-buffer |
| 1723 | mbox-beg mbox-end) |
| 1724 | (point))) |
| 1725 | (setq disable-initial-guessing-flag t) |
| 1726 | (setq word-found-flag t)) |
| 1727 | |
| 1728 | ;; Handle & between names, as in "Bob & Susie". |
| 1729 | ((and (> word-count 0) (eq (following-char) ?\&)) |
| 1730 | (setq name-beg (point)) |
| 1731 | (setq name-end (1+ name-beg)) |
| 1732 | (setq word-found-flag t) |
| 1733 | (goto-char name-end)) |
| 1734 | |
| 1735 | ;; Regular name words |
| 1736 | ((looking-at mail-extr-name-pattern) |
| 1737 | (setq name-beg (point)) |
| 1738 | (setq name-end (match-end 0)) |
| 1739 | |
| 1740 | ;; Certain words will be dropped if they are at the end. |
| 1741 | (and (>= word-count 2) |
| 1742 | (not lower-case-flag) |
| 1743 | (or |
| 1744 | ;; Trailing 4-or-more letter lowercase words preceded by |
| 1745 | ;; mixed case or uppercase words will be dropped. |
| 1746 | (looking-at "[[:lower:]]\\{4,\\}[ \t]*\\'") |
| 1747 | ;; Drop a trailing word which is terminated with a period. |
| 1748 | (eq ?. (char-after (1- name-end)))) |
| 1749 | (setq drop-this-word-if-trailing-flag t)) |
| 1750 | |
| 1751 | ;; Set the flags that indicate whether we have seen a lowercase |
| 1752 | ;; word, a mixed case word, and an uppercase word. |
| 1753 | (if (re-search-forward "[[:lower:]]" name-end t) |
| 1754 | (if (progn |
| 1755 | (goto-char name-beg) |
| 1756 | (re-search-forward "[[:upper:]]" name-end t)) |
| 1757 | (setq mixed-case-flag t) |
| 1758 | (setq lower-case-flag t)) |
| 1759 | ;; (setq upper-case-flag t) |
| 1760 | ) |
| 1761 | |
| 1762 | (goto-char name-end) |
| 1763 | (setq word-found-flag t)) |
| 1764 | |
| 1765 | ;; Allow a number as a word, if it doesn't mean anything else. |
| 1766 | ((looking-at "[0-9]+\\>") |
| 1767 | (setq name-beg (point)) |
| 1768 | (setq name-end (match-end 0)) |
| 1769 | (goto-char name-end) |
| 1770 | (setq word-found-flag t)) |
| 1771 | |
| 1772 | (t |
| 1773 | (setq name-done-flag t) |
| 1774 | )) |
| 1775 | |
| 1776 | ;; Count any word that we skipped over. |
| 1777 | (if word-found-flag |
| 1778 | (setq word-count (1+ word-count)))) |
| 1779 | |
| 1780 | ;; If the last thing in the name is 2 or more periods, or one or more |
| 1781 | ;; other sentence terminators (but not a single period) then keep them |
| 1782 | ;; and the preceding word. This is for the benefit of whole sentences |
| 1783 | ;; in the name field: it's better behavior than dropping the last word |
| 1784 | ;; of the sentence... |
| 1785 | (if (and (not suffix-flag) |
| 1786 | (looking-at "\\(\\.+\\|[?!;:.][?!;:.]+\\|[?!;:][?!;:.]*\\)\\'")) |
| 1787 | (goto-char (setq suffix-flag (point-max)))) |
| 1788 | |
| 1789 | ;; Drop everything after point and certain trailing words. |
| 1790 | (narrow-to-region (point-min) |
| 1791 | (or (and drop-last-word-if-trailing-flag |
| 1792 | last-word-beg) |
| 1793 | (point))) |
| 1794 | |
| 1795 | ;; Xerox's mailers SUCK!!!!!! |
| 1796 | ;; We simply refuse to believe that any last name is PARC or ADOC. |
| 1797 | ;; If it looks like that is the last name, that there is no meaningful |
| 1798 | ;; here at all. Actually I guess it would be best to map patterns |
| 1799 | ;; like foo.hoser@xerox.com into foo@hoser.xerox.com, but I don't |
| 1800 | ;; actually know that that is what's going on. |
| 1801 | (unless suffix-flag |
| 1802 | (goto-char (point-min)) |
| 1803 | (let ((case-fold-search t)) |
| 1804 | (if (looking-at "[-A-Za-z_]+[. ]\\(PARC\\|ADOC\\)\\'") |
| 1805 | (erase-buffer)))) |
| 1806 | |
| 1807 | ;; If last name first put it at end (but before suffix) |
| 1808 | (when last-name-comma-flag |
| 1809 | (goto-char (point-min)) |
| 1810 | (search-forward ",") |
| 1811 | (setq name-end (1- (point))) |
| 1812 | (goto-char (or suffix-flag (point-max))) |
| 1813 | (or (eq ?\ (preceding-char)) |
| 1814 | (insert ?\ )) |
| 1815 | (insert-buffer-substring (current-buffer) (point-min) name-end) |
| 1816 | (goto-char name-end) |
| 1817 | (skip-chars-forward "\t ,") |
| 1818 | (narrow-to-region (point) (point-max))) |
| 1819 | |
| 1820 | ;; Delete leading and trailing junk characters. |
| 1821 | ;; *** This is probably completely unneeded now. |
| 1822 | ;;(goto-char (point-max)) |
| 1823 | ;;(skip-chars-backward mail-extr-non-end-name-chars) |
| 1824 | ;;(if (eq ?. (following-char)) |
| 1825 | ;; (forward-char 1)) |
| 1826 | ;;(narrow-to-region (point) |
| 1827 | ;; (progn |
| 1828 | ;; (goto-char (point-min)) |
| 1829 | ;; (skip-chars-forward mail-extr-non-begin-name-chars) |
| 1830 | ;; (point))) |
| 1831 | |
| 1832 | ;; Compress whitespace |
| 1833 | (goto-char (point-min)) |
| 1834 | (while (re-search-forward "[ \t\n]+" nil t) |
| 1835 | (replace-match (if (eobp) "" " ") t)) |
| 1836 | )))) |
| 1837 | |
| 1838 | \f |
| 1839 | |
| 1840 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| 1841 | ;; |
| 1842 | ;; Table of top-level domain names. |
| 1843 | ;; |
| 1844 | ;; This is used during address canonicalization; be careful of format changes. |
| 1845 | ;; Keep in mind that the country abbreviations follow ISO-3166. There is |
| 1846 | ;; a U.S. FIPS that specifies a different set of two-letter country |
| 1847 | ;; abbreviations. |
| 1848 | ;; |
| 1849 | ;; Updated by the RIPE Network Coordination Centre. |
| 1850 | ;; |
| 1851 | ;; Source: ISO 3166 Maintenance Agency |
| 1852 | ;; http://www.iso.org/iso/en/prods-services/iso3166ma/02iso-3166-code-lists/list-en1-semic.txt |
| 1853 | ;; http://www.iana.org/domain-names.htm |
| 1854 | ;; http://www.iana.org/cctld/cctld-whois.htm |
| 1855 | ;; Latest change: 2007/11/15 |
| 1856 | |
| 1857 | (defconst mail-extr-all-top-level-domains |
| 1858 | (let ((ob (make-vector 739 0))) |
| 1859 | (mapc |
| 1860 | (lambda (x) |
| 1861 | (put (intern (downcase (car x)) ob) |
| 1862 | 'domain-name |
| 1863 | (if (nth 2 x) |
| 1864 | (format (nth 2 x) (nth 1 x)) |
| 1865 | (nth 1 x)))) |
| 1866 | '( |
| 1867 | ;; ISO 3166 codes: |
| 1868 | ("ac" "Ascension Island") |
| 1869 | ("ad" "Andorra") |
| 1870 | ("ae" "United Arab Emirates") |
| 1871 | ("af" "Afghanistan") |
| 1872 | ("ag" "Antigua and Barbuda") |
| 1873 | ("ai" "Anguilla") |
| 1874 | ("al" "Albania") |
| 1875 | ("am" "Armenia") |
| 1876 | ("an" "Netherlands Antilles") |
| 1877 | ("ao" "Angola") |
| 1878 | ("aq" "Antarctica") ; continent |
| 1879 | ("ar" "Argentina" "Argentine Republic") |
| 1880 | ("as" "American Samoa") |
| 1881 | ("at" "Austria" "The Republic of %s") |
| 1882 | ("au" "Australia") |
| 1883 | ("aw" "Aruba") |
| 1884 | ("ax" "Aland Islands") |
| 1885 | ("az" "Azerbaijan") |
| 1886 | ("ba" "Bosnia-Herzegovina") |
| 1887 | ("bb" "Barbados") |
| 1888 | ("bd" "Bangladesh") |
| 1889 | ("be" "Belgium" "The Kingdom of %s") |
| 1890 | ("bf" "Burkina Faso") |
| 1891 | ("bg" "Bulgaria") |
| 1892 | ("bh" "Bahrain") |
| 1893 | ("bi" "Burundi") |
| 1894 | ("bj" "Benin") |
| 1895 | ("bl" "Saint Barthelemy") |
| 1896 | ("bm" "Bermuda") |
| 1897 | ("bn" "Brunei Darussalam") |
| 1898 | ("bo" "Bolivia" "Republic of %s") |
| 1899 | ("br" "Brazil" "The Federative Republic of %s") |
| 1900 | ("bs" "Bahamas") |
| 1901 | ("bt" "Bhutan") |
| 1902 | ("bv" "Bouvet Island") |
| 1903 | ("bw" "Botswana") |
| 1904 | ("by" "Belarus") |
| 1905 | ("bz" "Belize") |
| 1906 | ("ca" "Canada") |
| 1907 | ("cc" "Cocos (Keeling) Islands") |
| 1908 | ("cd" "Congo" "The Democratic Republic of the %s") |
| 1909 | ("cf" "Central African Republic") |
| 1910 | ("cg" "Congo") |
| 1911 | ("ch" "Switzerland" "The Swiss Confederation") |
| 1912 | ("ci" "Ivory Coast") ; Cote D'ivoire |
| 1913 | ("ck" "Cook Islands") |
| 1914 | ("cl" "Chile" "The Republic of %s") |
| 1915 | ("cm" "Cameroon") ; In .fr domain |
| 1916 | ("cn" "China" "The People's Republic of %s") |
| 1917 | ("co" "Colombia") |
| 1918 | ("cr" "Costa Rica" "The Republic of %s") |
| 1919 | ("cu" "Cuba") |
| 1920 | ("cv" "Cape Verde") |
| 1921 | ("cx" "Christmas Island") |
| 1922 | ("cy" "Cyprus") |
| 1923 | ("cz" "Czech Republic") |
| 1924 | ("de" "Germany") |
| 1925 | ("dj" "Djibouti") |
| 1926 | ("dk" "Denmark") |
| 1927 | ("dm" "Dominica") |
| 1928 | ("do" "Dominican Republic" "The %s") |
| 1929 | ("dz" "Algeria") |
| 1930 | ("ec" "Ecuador" "The Republic of %s") |
| 1931 | ("ee" "Estonia") |
| 1932 | ("eg" "Egypt" "The Arab Republic of %s") |
| 1933 | ("eh" "Western Sahara") |
| 1934 | ("er" "Eritrea") |
| 1935 | ("es" "Spain" "The Kingdom of %s") |
| 1936 | ("et" "Ethiopia") |
| 1937 | ("eu" "European Union") |
| 1938 | ("fi" "Finland" "The Republic of %s") |
| 1939 | ("fj" "Fiji") |
| 1940 | ("fk" "Falkland Islands (Malvinas)") |
| 1941 | ("fm" "Micronesia" "Federated States of %s") |
| 1942 | ("fo" "Faroe Islands") |
| 1943 | ("fr" "France") |
| 1944 | ("ga" "Gabon") |
| 1945 | ("gb" "United Kingdom") |
| 1946 | ("gd" "Grenada") |
| 1947 | ("ge" "Georgia") |
| 1948 | ("gf" "French Guiana") |
| 1949 | ("gg" "Guernsey") |
| 1950 | ("gh" "Ghana") |
| 1951 | ("gi" "Gibraltar") |
| 1952 | ("gl" "Greenland") |
| 1953 | ("gm" "Gambia") |
| 1954 | ("gn" "Guinea") |
| 1955 | ("gp" "Guadeloupe (Fr.)") |
| 1956 | ("gq" "Equatorial Guinea") |
| 1957 | ("gr" "Greece" "The Hellenic Republic (%s)") |
| 1958 | ("gs" "South Georgia and The South Sandwich Islands") |
| 1959 | ("gt" "Guatemala") |
| 1960 | ("gu" "Guam (U.S.)") |
| 1961 | ("gw" "Guinea-Bissau") |
| 1962 | ("gy" "Guyana") |
| 1963 | ("hk" "Hong Kong") |
| 1964 | ("hm" "Heard Island and Mcdonald Islands") |
| 1965 | ("hn" "Honduras") |
| 1966 | ("hr" "Croatia" "Croatia (Hrvatska)") |
| 1967 | ("ht" "Haiti") |
| 1968 | ("hu" "Hungary" "The Hungarian Republic") |
| 1969 | ("id" "Indonesia") |
| 1970 | ("ie" "Ireland") |
| 1971 | ("il" "Israel" "The State of %s") |
| 1972 | ("im" "Isle of Man" "The %s") ; NOT in ISO 3166-1 of 2001-02-26 |
| 1973 | ("in" "India" "The Republic of %s") |
| 1974 | ("io" "British Indian Ocean Territory") |
| 1975 | ("iq" "Iraq") |
| 1976 | ("ir" "Iran" "Islamic Republic of %s") |
| 1977 | ("is" "Iceland" "The Republic of %s") |
| 1978 | ("it" "Italy" "The Italian Republic") |
| 1979 | ("je" "Jersey") |
| 1980 | ("jm" "Jamaica") |
| 1981 | ("jo" "Jordan") |
| 1982 | ("jp" "Japan") |
| 1983 | ("ke" "Kenya") |
| 1984 | ("kg" "Kyrgyzstan") |
| 1985 | ("kh" "Cambodia") |
| 1986 | ("ki" "Kiribati") |
| 1987 | ("km" "Comoros") |
| 1988 | ("kn" "Saint Kitts and Nevis") |
| 1989 | ("kp" "Korea (North)" "Democratic People's Republic of Korea") |
| 1990 | ("kr" "Korea (South)" "Republic of Korea") |
| 1991 | ("kw" "Kuwait") |
| 1992 | ("ky" "Cayman Islands") |
| 1993 | ("kz" "Kazakhstan") |
| 1994 | ("la" "Lao People's Democratic Republic") |
| 1995 | ("lb" "Lebanon") |
| 1996 | ("lc" "Saint Lucia") |
| 1997 | ("li" "Liechtenstein") |
| 1998 | ("lk" "Sri Lanka" "The Democratic Socialist Republic of %s") |
| 1999 | ("lr" "Liberia") |
| 2000 | ("ls" "Lesotho") |
| 2001 | ("lt" "Lithuania") |
| 2002 | ("lu" "Luxembourg") |
| 2003 | ("lv" "Latvia") |
| 2004 | ("ly" "Libyan Arab Jamahiriya") |
| 2005 | ("ma" "Morocco") |
| 2006 | ("mc" "Monaco") |
| 2007 | ("md" "Moldova" "The Republic of %s") |
| 2008 | ("me" "Montenegro") |
| 2009 | ("mf" "Saint Martin (French part)") |
| 2010 | ("mg" "Madagascar") |
| 2011 | ("mh" "Marshall Islands") |
| 2012 | ("mk" "Macedonia" "The Former Yugoslav Republic of %s") |
| 2013 | ("ml" "Mali") |
| 2014 | ("mm" "Myanmar") |
| 2015 | ("mn" "Mongolia") |
| 2016 | ("mo" "Macao") |
| 2017 | ("mp" "Northern Mariana Islands") |
| 2018 | ("mq" "Martinique") |
| 2019 | ("mr" "Mauritania") |
| 2020 | ("ms" "Montserrat") |
| 2021 | ("mt" "Malta") |
| 2022 | ("mu" "Mauritius") |
| 2023 | ("mv" "Maldives") |
| 2024 | ("mw" "Malawi") |
| 2025 | ("mx" "Mexico" "The United Mexican States") |
| 2026 | ("my" "Malaysia") |
| 2027 | ("mz" "Mozambique") |
| 2028 | ("na" "Namibia") |
| 2029 | ("nc" "New Caledonia (Fr.)") |
| 2030 | ("ne" "Niger") ; In .fr domain |
| 2031 | ("nf" "Norfolk Island") |
| 2032 | ("ng" "Nigeria") |
| 2033 | ("ni" "Nicaragua" "The Republic of %s") |
| 2034 | ("nl" "Netherlands" "The Kingdom of the %s") |
| 2035 | ("no" "Norway" "The Kingdom of %s") |
| 2036 | ("np" "Nepal") ; Via .in domain |
| 2037 | ("nr" "Nauru") |
| 2038 | ("nu" "Niue") |
| 2039 | ("nz" "New Zealand") |
| 2040 | ("om" "Oman") |
| 2041 | ("pa" "Panama") |
| 2042 | ("pe" "Peru") |
| 2043 | ("pf" "French Polynesia") |
| 2044 | ("pg" "Papua New Guinea") |
| 2045 | ("ph" "Philippines" "The Republic of the %s") |
| 2046 | ("pk" "Pakistan") |
| 2047 | ("pl" "Poland") |
| 2048 | ("pm" "Saint Pierre and Miquelon") |
| 2049 | ("pn" "Pitcairn") |
| 2050 | ("pr" "Puerto Rico (U.S.)") |
| 2051 | ("ps" "Palestinian Territory, Occupied") |
| 2052 | ("pt" "Portugal" "The Portuguese Republic") |
| 2053 | ("pw" "Palau") |
| 2054 | ("py" "Paraguay") |
| 2055 | ("qa" "Qatar") |
| 2056 | ("re" "Reunion (Fr.)") ; In .fr domain |
| 2057 | ("ro" "Romania") |
| 2058 | ("rs" "Serbia") |
| 2059 | ("ru" "Russia" "Russian Federation") |
| 2060 | ("rw" "Rwanda") |
| 2061 | ("sa" "Saudi Arabia") |
| 2062 | ("sb" "Solomon Islands") |
| 2063 | ("sc" "Seychelles") |
| 2064 | ("sd" "Sudan") |
| 2065 | ("se" "Sweden" "The Kingdom of %s") |
| 2066 | ("sg" "Singapore" "The Republic of %s") |
| 2067 | ("sh" "Saint Helena") |
| 2068 | ("si" "Slovenia") |
| 2069 | ("sj" "Svalbard and Jan Mayen") ; In .no domain |
| 2070 | ("sk" "Slovakia" "The Slovak Republic") |
| 2071 | ("sl" "Sierra Leone") |
| 2072 | ("sm" "San Marino") |
| 2073 | ("sn" "Senegal") |
| 2074 | ("so" "Somalia") |
| 2075 | ("sr" "Suriname") |
| 2076 | ("st" "Sao Tome and Principe") |
| 2077 | ("su" "U.S.S.R." "The Union of Soviet Socialist Republics") |
| 2078 | ("sv" "El Salvador") |
| 2079 | ("sy" "Syrian Arab Republic") |
| 2080 | ("sz" "Swaziland") |
| 2081 | ("tc" "Turks and Caicos Islands") |
| 2082 | ("td" "Chad") |
| 2083 | ("tf" "French Southern Territories") |
| 2084 | ("tg" "Togo") |
| 2085 | ("th" "Thailand" "The Kingdom of %s") |
| 2086 | ("tj" "Tajikistan") |
| 2087 | ("tk" "Tokelau") |
| 2088 | ("tl" "East Timor") |
| 2089 | ("tm" "Turkmenistan") |
| 2090 | ("tn" "Tunisia") |
| 2091 | ("to" "Tonga") |
| 2092 | ("tp" "East Timor") |
| 2093 | ("tr" "Turkey" "The Republic of %s") |
| 2094 | ("tt" "Trinidad and Tobago") |
| 2095 | ("tv" "Tuvalu") |
| 2096 | ("tw" "Taiwan" "%s, Province of China") |
| 2097 | ("tz" "Tanzania" "United Republic of %s") |
| 2098 | ("ua" "Ukraine") |
| 2099 | ("ug" "Uganda") |
| 2100 | ("uk" "United Kingdom" "The %s of Great Britain and Northern Ireland") |
| 2101 | ("um" "United States Minor Outlying Islands") |
| 2102 | ("us" "United States" "The %s of America") |
| 2103 | ("uy" "Uruguay" "The Eastern Republic of %s") |
| 2104 | ("uz" "Uzbekistan") |
| 2105 | ("va" "Holy See (Vatican City State)") |
| 2106 | ("vc" "Saint Vincent and the Grenadines") |
| 2107 | ("ve" "Venezuela" "The Republic of %s") |
| 2108 | ("vg" "Virgin Islands, British") |
| 2109 | ("vi" "Virgin Islands, U.S.") |
| 2110 | ("vn" "Vietnam") |
| 2111 | ("vu" "Vanuatu") |
| 2112 | ("wf" "Wallis and Futuna") |
| 2113 | ("ws" "Samoa") |
| 2114 | ("ye" "Yemen") |
| 2115 | ("yt" "Mayotte") |
| 2116 | ("yu" "Yugoslavia" "Yugoslavia, AKA Serbia-Montenegro") |
| 2117 | ("za" "South Africa" "The Republic of %s") |
| 2118 | ("zm" "Zambia") |
| 2119 | ("zw" "Zimbabwe" "Republic of %s") |
| 2120 | ;; Generic Domains: |
| 2121 | ("aero" t "Air Transport Industry") |
| 2122 | ("asia" t "Pan-Asia and Asia Pacific community") |
| 2123 | ("biz" t "Businesses") |
| 2124 | ("cat" t "Catalan language and culture") |
| 2125 | ("com" t "Commercial") |
| 2126 | ("coop" t "Cooperative Associations") |
| 2127 | ("info" t "Info") |
| 2128 | ("jobs" t "Employment") |
| 2129 | ("mobi" t "Mobile products") |
| 2130 | ("museum" t "Museums") |
| 2131 | ("name" t "Individuals") |
| 2132 | ("net" t "Network") |
| 2133 | ("org" t "Non-profit Organization") |
| 2134 | ("pro" t "Credentialed professionals") |
| 2135 | ("tel" t "Contact data") |
| 2136 | ("travel" t "Travel industry") |
| 2137 | ;;("bitnet" t "Because It's Time NET") |
| 2138 | ("gov" t "United States Government") |
| 2139 | ("edu" t "Educational") |
| 2140 | ("mil" t "United States Military") |
| 2141 | ("int" t "International Treaties") |
| 2142 | ;;("nato" t "North Atlantic Treaty Organization") |
| 2143 | ("uucp" t "Unix to Unix CoPy") |
| 2144 | ;; Infrastructure Domains: |
| 2145 | ("arpa" t "Advanced Research Projects Agency (U.S. DoD)") |
| 2146 | )) |
| 2147 | ob)) |
| 2148 | |
| 2149 | ;;;###autoload |
| 2150 | (defun what-domain (domain) |
| 2151 | "Convert mail domain DOMAIN to the country it corresponds to." |
| 2152 | (interactive |
| 2153 | (let ((completion-ignore-case t)) |
| 2154 | (list (completing-read "Domain: " |
| 2155 | mail-extr-all-top-level-domains nil t)))) |
| 2156 | (or (setq domain (intern-soft (downcase domain) |
| 2157 | mail-extr-all-top-level-domains)) |
| 2158 | (error "No such domain")) |
| 2159 | (message "%s: %s" (upcase (symbol-name domain)) (get domain 'domain-name))) |
| 2160 | |
| 2161 | \f |
| 2162 | ;(let ((all nil)) |
| 2163 | ; (mapatoms #'(lambda (x) |
| 2164 | ; (if (and (boundp x) |
| 2165 | ; (string-match "^mail-extr-" (symbol-name x))) |
| 2166 | ; (setq all (cons x all))))) |
| 2167 | ; (setq all (sort all #'string-lessp)) |
| 2168 | ; (cons 'setq |
| 2169 | ; (apply 'nconc (mapcar #'(lambda (x) |
| 2170 | ; (list x (symbol-value x))) |
| 2171 | ; all)))) |
| 2172 | |
| 2173 | \f |
| 2174 | (provide 'mail-extr) |
| 2175 | |
| 2176 | ;;; mail-extr.el ends here |