1 ;;; rfc2047.el --- functions for encoding and decoding rfc2047 messages
2 ;; Copyright (C) 1998, 1999, 2000, 2002 Free Software Foundation, Inc.
4 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
5 ;; MORIOKA Tomohiko <morioka@jaist.ac.jp>
6 ;; This file is part of GNU Emacs.
8 ;; GNU Emacs is free software; you can redistribute it and/or modify
9 ;; it under the terms of the GNU General Public License as published by
10 ;; the Free Software Foundation; either version 2, or (at your option)
13 ;; GNU Emacs is distributed in the hope that it will be useful,
14 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;; GNU General Public License for more details.
18 ;; You should have received a copy of the GNU General Public License
19 ;; along with GNU Emacs; see the file COPYING. If not, write to the
20 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 ;; Boston, MA 02111-1307, USA.
25 ;; RFC 2047 is "MIME (Multipurpose Internet Mail Extensions) Part
26 ;; Three: Message Header Extensions for Non-ASCII Text".
;; Declare `message-posting-charset' without a value visible here.
;; NOTE(review): the second closing paren implies an enclosing form whose
;; opening line is not part of this listing -- confirm against the full file.
32 (defvar message-posting-charset
))
36 ;; Fixme: Avoid this (used for mail-parse-charset) mm dependence on gnus.
;; `mm-body-7-or-8' is called from `rfc2047-encode-message-header';
;; autoload it from the mm-bodies library.
39 (autoload 'mm-body-7-or-8
"mm-bodies")
;; Consulted by `rfc2047-encode-message-header', which pops the entries in
;; order and tests string keys against the current field with `looking-at'.
;; NOTE(review): this listing omits lines of the form (the embedded
;; numbering is not contiguous and the docstring is cut off below).
41 (defvar rfc2047-header-encoding-alist
42 '(("Newsgroups" . nil
)
44 ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\)" .
47 "*Header/encoding method alist.
48 The list is traversed sequentially. The keys can either be
53 1) nil, in which case no encoding is done;
54 2) `mime', in which case the header will be encoded according to RFC2047;
55 3) `address-mime', like `mime', but takes account of the rules for address
56 fields (where quoted strings and comments must be treated separately);
57 4) a charset, in which case it will be encoded as that charset;
58 5) `default', in which case the field will be encoded as the rest
;; `rfc2047-encode' looks up the MIME charset symbol here with `assq' and
;; uses the cdr of the entry as the encoding.
;; NOTE(review): the alist entries themselves are not visible in this
;; listing (the embedded numbering jumps from 61 to 84).
61 (defvar rfc2047-charset-encoding-alist
84 "Alist of MIME charsets to RFC2047 encodings.
85 Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding,
86 quoted-printable and base64 respectively.")
;; `rfc2047-encode' finds the entry for the chosen encoding symbol with
;; `assq' and calls its cdr with the bounds of the narrowed region.
;; NOTE(review): the embedded numbering skips 91, so an entry may be
;; missing from this listing.
88 (defvar rfc2047-encoding-function-alist
89 '((Q . rfc2047-q-encode-region
)
90 (B . rfc2047-b-encode-region
)
92 "Alist of RFC2047 encodings to encoding functions.")
;; Each entry pairs a header regexp with a character-class string.
;; `rfc2047-q-encode-region' tests the regexp with `looking-at' and hands
;; the class string to `quoted-printable-encode-region'.
;; NOTE(review): the class string of the first entry is not visible in
;; this listing (the embedded numbering skips 96).
94 (defvar rfc2047-q-encoding-alist
95 '(("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\):"
97 ;; = (\075), _ (\137), ? (\077) are used in the encoded word.
98 ;; Avoid using 8bit characters.
99 ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?"
100 ("." .
"\010\012\014\040-\074\076\100-\136\140-\177"))
101 "Alist of header regexps and valid Q characters.")
104 ;;; Functions for encoding RFC2047 messages
;; NOTE(review): several body lines are missing from this listing (the
;; embedded numbering jumps 108 -> 114 -> 119), including the narrowing
;; call itself.
107 (defun rfc2047-narrow-to-field ()
108 "Narrow the buffer to the header on the current line."
;; Search for the next line whose first character is not a space, tab or
;; newline -- presumably the start of the following header field, since
;; continuation lines begin with whitespace.
114 (if (re-search-forward "^[^ \n\t]" nil t
)
;; Finish with point at the beginning of the accessible region.
119 (goto-char (point-min)))
;; Defaults to `address-mime'.  `rfc2047-encode-region' takes its simple
;; single-word path only when this is `eq' to `mime'; any other value
;; selects the address-aware path.
121 (defvar rfc2047-encoding-type
'address-mime
122 "The type of encoding done by `rfc2047-encode-region'.
123 This should be dynamically bound around calls to
124 `rfc2047-encode-region' to either `mime' or `address-mime'. See
125 `rfc2047-header-encoding-alist', for definitions.")
;; NOTE(review): this listing omits several lines of the body (the
;; embedded numbering is not contiguous), so the comments below describe
;; only the visible steps.
127 (defun rfc2047-encode-message-header ()
128 "Encode the message header according to `rfc2047-header-encoding-alist'.
129 Should be called narrowed to the head of the message."
132 (goto-char (point-min))
133 (let (alist elem method
)
;; Work on one header field at a time.
136 (rfc2047-narrow-to-field)
;; Field needs no RFC 2047 encoding.  Among the visible conditions: if
;; `mm-body-7-or-8' reports `8bit' and `message-posting-charset' has a
;; car, the field is encoded with that car as the coding system.
137 (if (not (rfc2047-encodable-p))
138 (if (and (eq (mm-body-7-or-8) '8bit
)
141 (car message-posting-charset
)))
142 ;; 8 bit must be decoded.
143 ;; Is message-posting-charset a coding system?
144 (mm-encode-coding-region
145 (point-min) (point-max)
146 (car message-posting-charset
)))
147 ;; We found something that may perhaps be encoded.
;; Walk `rfc2047-header-encoding-alist' entry by entry; a string key is
;; matched against the field with `looking-at'.
149 alist rfc2047-header-encoding-alist
)
150 (while (setq elem
(pop alist
))
151 (when (or (and (stringp (car elem
))
152 (looking-at (car elem
)))
;; Skip the field name, the colon and following spaces so that only the
;; field body is handed to the encoders.
156 (goto-char (point-min))
157 (re-search-forward "^[^:]+: *" nil t
)
159 ((eq method
'address-mime
)
160 (rfc2047-encode-region (point) (point-max)))
;; NOTE(review): this binds `rfc2047-encoding-type' to nil, not to `mime'.
;; `rfc2047-encode-region' only takes its simple path when the value is
;; `eq' to `mime', so with nil it behaves like `address-mime' -- confirm
;; whether a binding to `mime' was intended.
162 (let (rfc2047-encoding-type)
163 (rfc2047-encode-region (point) (point-max))))
;; `default': encode the field body with `mail-parse-charset', guarded by
;; the mule feature and `default-enable-multibyte-characters' (part of the
;; condition is not visible in this listing).
164 ((eq method
'default
)
165 (if (and (featurep 'mule
)
166 (if (boundp 'default-enable-multibyte-characters
)
167 default-enable-multibyte-characters
)
169 (mm-encode-coding-region (point) (point-max)
170 mail-parse-charset
)))
;; METHOD is itself a coding system: encode the field body with it under
;; the same mule/multibyte guard.
171 ((mm-coding-system-p method
)
172 (if (and (featurep 'mule
)
173 (if (boundp 'default-enable-multibyte-characters
)
174 default-enable-multibyte-characters
))
175 (mm-encode-coding-region (point) (point-max) method
)))
;; Move to the end of the accessible region (the current field).
178 (goto-char (point-max)))))))
180 ;; Fixme: This, and the require below may not be the Right Thing, but
181 ;; should be safe just before release. -- fx 2001-02-08
;; Compile-time-only declaration: a `defvar' with no value tells the byte
;; compiler that `message-posting-charset' is a special variable without
;; giving it a value here.
182 (eval-when-compile (defvar message-posting-charset
))
;; NOTE(review): the line that binds CHARSETS is missing from this listing
;; (the embedded numbering skips 188); only the value form is visible.
184 (defun rfc2047-encodable-p ()
185 "Return non-nil if any characters in current buffer need encoding in headers.
186 The buffer may be narrowed."
187 (require 'message
) ; for message-posting-charset
;; Value for CHARSETS: the result of `mm-find-mime-charset-region' over
;; the accessible region (presumably a list of MIME charset symbols).
189 (mm-find-mime-charset-region (point-min) (point-max))))
;; Non-nil unless CHARSETS is nil or `equal' to a one-element list holding
;; `message-posting-charset'.
;; NOTE(review): `rfc2047-encode-message-header' takes the car of
;; `message-posting-charset', i.e. treats it as a list; if it is a list,
;; this comparison against a list wrapping it may never succeed -- verify.
190 (and charsets
(not (equal charsets
(list message-posting-charset
))))))
192 ;; Use this syntax table when parsing into regions that may need
193 ;; encoding. Double quotes are string delimiters, backslash is
194 ;; character quoting, and all other RFC 2822 special characters are
195 ;; treated as punctuation so we can use forward-sexp/forward-word to
196 ;; skip to the end of regions appropriately. Nb. ietf-drums does
197 ;; things differently.
;; Built from a fresh syntax char-table whose default entry is the raw
;; descriptor (2), i.e. word syntax.  The visible overrides then give
;; backslash escape syntax, double quote string-quote syntax, and the
;; characters ( ) < > [ ] : ; , @ punctuation syntax.
;; `rfc2047-encode-region' installs this table with `with-syntax-table'.
;; NOTE(review): the end of the form is not visible in this listing (the
;; embedded numbering jumps from 211 to 214).
198 (defconst rfc2047-syntax-table
199 (let ((table (make-char-table 'syntax-table
'(2))))
200 (modify-syntax-entry ?
\\ "\\" table
)
201 (modify-syntax-entry ?
\" "\"" table
)
202 (modify-syntax-entry ?\
( "." table
)
203 (modify-syntax-entry ?\
) "." table
)
204 (modify-syntax-entry ?\
< "." table
)
205 (modify-syntax-entry ?\
> "." table
)
206 (modify-syntax-entry ?\
[ "." table
)
207 (modify-syntax-entry ?\
] "." table
)
208 (modify-syntax-entry ?
: "." table
)
209 (modify-syntax-entry ?\
; "." table)
210 (modify-syntax-entry ?
, "." table
)
211 (modify-syntax-entry ?
@ "." table
)
;; Overview of the visible logic:
;; - When `rfc2047-encoding-type' is `mime', the region is handled as one
;;   unit and passed to `rfc2047-encode'.
;; - Otherwise the region is scanned token by token under
;;   `rfc2047-syntax-table'; quoted strings and ordinary tokens are passed
;;   to `rfc2047-encode', and LAST-ENCODED records whether the previous
;;   token was encoded.
;; - Any error raised during the scan is re-signalled as
;;   Invalid data for rfc2047 encoding, with the text of B..E.
;; - Finally the text from B to point is folded with `rfc2047-fold-region'.
;; NOTE(review): many body lines are missing from this listing (the
;; embedded numbering is not contiguous).
214 (defun rfc2047-encode-region (b e
)
215 "Encode words in region B to E that need encoding.
216 By default, the region is treated as containing RFC2822 addresses.
217 Dynamically bind `rfc2047-encoding-type' to change that."
219 (narrow-to-region b e
)
;; Only the exact symbol `mime' selects the simple branch.
220 (if (eq 'mime rfc2047-encoding-type
)
221 ;; Simple case -- treat as single word.
223 (goto-char (point-min))
224 ;; Does it need encoding?
;; Skip over characters in the range \000-\177; the test that decides
;; whether to encode is not visible in this listing.
225 (skip-chars-forward "\000-\177" e
)
227 (rfc2047-encode b e
)))
228 ;; `address-mime' case -- take care of quoted words, comments.
229 (with-syntax-table rfc2047-syntax-table
230 (let ((start (point)) ; start of current token
231 end
; end of current token
232 ;; Whether there's an encoded word before the current
233 ;; token, either immediately or separated by space.
235 (goto-char (point-min))
236 (condition-case nil
; in case of unbalanced quotes
237 ;; Look for rfc2822-style: sequences of atoms, quoted
238 ;; strings, specials, whitespace. (Specials mustn't be
243 (unless (= 0 (skip-chars-forward " \t"))
244 (setq start
(point)))
246 ((not (char-after))) ; eob
248 ((eq ?
\" (char-syntax (char-after)))
252 ;; Does it need encoding?
254 (skip-chars-forward "\000-\177" end
)
256 (setq last-encoded nil
)
257 ;; It needs encoding. Strip the quotes first,
258 ;; since encoded words can't occur in quotes.
260 (delete-backward-char 1)
264 ;; There was a preceding quoted word. We need
265 ;; to include any separating whitespace in this
266 ;; word to avoid it getting lost.
267 (skip-chars-backward " \t")
268 ;; A space is needed between the encoded words.
272 ;; Adjust the end position for the deleted quotes.
273 (rfc2047-encode start
(- end
2))
274 (setq last-encoded t
))) ; record that it was encoded
275 ((eq ?.
(char-syntax (char-after)))
276 ;; Skip other delimiters, but record that they've
277 ;; potentially separated quoted words.
279 (setq last-encoded nil
))
280 (t ; normal token/whitespace sequence
283 (skip-chars-backward " \t")
285 ;; Deal with encoding and leading space as for
288 (skip-chars-forward "\000-\177" end
)
290 (setq last-encoded nil
)
293 (skip-chars-backward " \t")
297 (rfc2047-encode start end
)
298 (setq last-encoded t
)))))
299 (error (error "Invalid data for rfc2047 encoding: %s"
300 (buffer-substring b e
)))))))
301 (rfc2047-fold-region b
(point))))
;; NOTE(review): the docstring refers to `rfc2047-special-chars', which is
;; not defined anywhere in the visible part of this file; the related
;; setting that is visible is `rfc2047-encoding-type'.
;; NOTE(review): the lines that set up the buffer holding STRING and that
;; return the result are missing from this listing (the embedded numbering
;; skips 307-308 and 310-311).
303 (defun rfc2047-encode-string (string)
304 "Encode words in STRING.
305 By default, the string is treated as containing addresses (see
306 `rfc2047-special-chars')."
;; Encode the whole accessible region of the current buffer.
309 (rfc2047-encode-region (point-min) (point-max))
;; NOTE(review): the docstring refers to `rfc2047-special-chars', which is
;; not defined anywhere in the visible part of this file.
;; NOTE(review): several body lines are missing from this listing (the
;; embedded numbering is not contiguous).
312 (defun rfc2047-encode (b e
)
313 "Encode the word(s) in the region B to E.
314 By default, the region is treated as containing addresses (see
315 `rfc2047-special-chars')."
316 (let* ((mime-charset (mm-find-mime-charset-region b e
))
;; More than one charset in the region is an error.  Otherwise
;; MIME-CHARSET is replaced by its single element and CS becomes the
;; matching coding system.
317 (cs (if (> (length mime-charset
) 1)
318 ;; Fixme: Instead of this, try to break region into
319 ;; parts that can be encoded separately.
320 (error "Can't rfc2047-encode `%s'"
321 (buffer-substring b e
))
322 (setq mime-charset
(car mime-charset
))
323 (mm-charset-to-coding-system mime-charset
)))
324 ;; Fixme: Better, calculate the number of non-ASCII
325 ;; characters, at least for 8-bit charsets.
;; Use the encoding registered for the charset when the alist has an
;; entry; the fallback branch is not visible in this listing.
326 (encoding (if (assq mime-charset
327 rfc2047-charset-encoding-alist
)
328 (cdr (assq mime-charset
329 rfc2047-charset-encoding-alist
))
;; Pieces of the encoded-word prefix: =? charset ? encoding ?, with the
;; charset and encoding names lower-cased.
332 "=?" (downcase (symbol-name mime-charset
)) "?"
333 (downcase (symbol-name encoding
)) "?"))
337 (narrow-to-region b e
)
;; For B encoding the text is split up before encoding; the visible step
;; advances point by at most 15 characters at a time.
338 (when (eq encoding
'B
)
339 ;; break into lines before encoding
340 (goto-char (point-min))
342 (goto-char (min (point-max) (+ 15 (point))))
;; In a multibyte session with a usable coding system, convert the text
;; to CS before the transfer encoding is applied.
345 (if (and (mm-multibyte-p)
346 (mm-coding-system-p cs
))
347 (mm-encode-coding-region (point-min) (point-max) cs
))
;; Dispatch to the function registered for ENCODING in
;; `rfc2047-encoding-function-alist', over the whole narrowed region.
348 (funcall (cdr (assq encoding rfc2047-encoding-function-alist
))
349 (point-min) (point-max))
350 (goto-char (point-min))
358 (forward-line 1))))))
;; Visible state: BOL is the reference position of the current line, BREAK
;; is a fold candidate recorded inside a run of spaces/tabs, and
;; QWORD-BREAK is a fold candidate recorded where an encoded word (text
;; starting with =?) begins.
;; NOTE(review): many body lines are missing from this listing (the
;; embedded numbering is not contiguous), including the loop header and
;; the insertion of the line break itself.
360 (defun rfc2047-fold-region (b e
)
361 "Fold long lines in region B to E."
363 (narrow-to-region b e
)
364 (goto-char (point-min))
367 (bol (save-restriction
;; Point is more than 76 characters past BOL and a candidate exists: go
;; back to it, preferring BREAK over QWORD-BREAK.
371 (when (and (or break qword-break
) (> (- (point) bol
) 76))
372 (goto-char (or break qword-break
))
;; NOTE(review): as a regexp this matches a space immediately followed by
;; a tab, not either character; a bracketed class was presumably
;; intended -- confirm.
375 (if (looking-at " \t")
378 (setq bol
(1- (point)))
379 ;; Don't break before the first non-LWSP characters.
380 (skip-chars-forward " \t")
381 (unless (eobp) (forward-char 1)))
383 ((eq (char-after) ?
\n)
388 (skip-chars-forward " \t")
389 (unless (or (eobp) (eq (char-after) ?
\n))
391 ((eq (char-after) ?
\r)
393 ((memq (char-after) '(? ?
\t))
;; Record the position just before the last character of this whitespace
;; run as a fold candidate.
394 (skip-chars-forward " \t")
395 (setq break
(1- (point))))
;; Not at the start of an encoded word: skip ahead to the next whitespace
;; (or, in one branch, the next equals sign).  At the start of an encoded
;; word: remember it as QWORD-BREAK and skip the whole word.
397 (if (not (looking-at "=\\?[^=]"))
398 (if (eq (char-after) ?
=)
400 (skip-chars-forward "^ \t\n\r="))
401 (setq qword-break
(point))
402 (skip-chars-forward "^ \t\n\r")))
404 (skip-chars-forward "^ \t\n\r"))))
;; Same over-length check once more after the scan.
405 (when (and (or break qword-break
) (> (- (point) bol
) 76))
406 (goto-char (or break qword-break
))
;; NOTE(review): same regexp concern as above -- this matches a space
;; followed by a tab, not either one.
409 (if (looking-at " \t")
412 (setq bol
(1- (point)))
413 ;; Don't break before the first non-LWSP characters.
414 (skip-chars-forward " \t")
415 (unless (eobp) (forward-char 1))))))
;; NOTE(review): several body lines are missing from this listing (the
;; embedded numbering is not contiguous), including the loop header and
;; the end of the form.
417 (defun rfc2047-unfold-region (b e
)
418 "Unfold lines in region B to E."
420 (narrow-to-region b e
)
421 (goto-char (point-min))
422 (let ((bol (save-restriction
425 (eol (mm-point-at-eol))
;; LEADING is the length of the run of spaces/tabs at point.
429 (looking-at "[ \t]*")
430 (setq leading
(- (match-end 0) (match-beginning 0)))
;; Join only when the distance from BOL to the end of this line, minus
;; the leading whitespace, is below 76 characters.
431 (if (< (- (mm-point-at-eol) bol leading
) 76)
434 (delete-region eol
(progn
;; NOTE(review): `skip-chars-forward' takes a character set, not a
;; regexp, so the brackets and the plus sign in this argument are skipped
;; as literal characters too -- confirm that this is intended.
435 (skip-chars-forward "[ \t\n\r]+")
437 (setq bol
(mm-point-at-bol)))
438 (setq eol
(mm-point-at-eol))
;; Registered for the `B' encoding in `rfc2047-encoding-function-alist'.
;; NOTE(review): several body lines are missing from this listing (the
;; embedded numbering skips 443, 445 and 449), including any loop.
441 (defun rfc2047-b-encode-region (b e
)
442 "Base64-encode the header contained in region B to E."
;; Move point to B and narrow to B..E.
444 (narrow-to-region (goto-char b
) e
)
;; Encode from point to the end of the current line; the third argument t
;; asks `base64-encode-region' not to insert line breaks.
446 (base64-encode-region (point) (progn (end-of-line) (point)) t
)
;; Point is on an empty line: delete the character before it.
447 (if (and (bolp) (eolp))
448 (delete-backward-char 1))
;; Registered for the `Q' encoding in `rfc2047-encoding-function-alist'.
;; NOTE(review): several body lines are missing from this listing (the
;; embedded numbering is not contiguous), including the loop over ALIST.
451 (defun rfc2047-q-encode-region (b e
)
452 "Quoted-printable-encode the header in region B to E."
455 (narrow-to-region (goto-char b
) e
)
456 (let ((alist rfc2047-q-encoding-alist
)
457 (bol (save-restriction
;; The header matches the regexp of the first remaining entry: encode the
;; region with that entry's character class, then turn every space into
;; an underscore.
461 (when (looking-at (caar alist
))
462 (quoted-printable-encode-region b e nil
(cdar alist
))
463 (subst-char-in-region (point-min) (point-max) ? ?_
)
466 ;; The size of QP encapsulation is about 20, so set limit to
;; Splitting is attempted only when the encoded text is at least 56
;; characters long.
468 (unless (< (- (point-max) (point-min)) 56)
469 ;; Don't break if it could fit in one line.
470 ;; Let rfc2047-encode-region break it later.
471 (goto-char (1+ (point-min)))
472 (while (and (not (bobp)) (not (eobp)))
;; Advance to at most 56 characters past BOL, then step back to an equals
;; sign found within the previous two characters -- presumably so that a
;; three-character escape sequence is not split.
473 (goto-char (min (point-max) (+ 56 bol
)))
474 (search-backward "=" (- (point) 2) t
)
475 (unless (or (bobp) (eobp))
477 (setq bol
(point)))))))))
480 ;;; Functions for decoding RFC2047 messages
;; Matches one encoded word of the form =?charset?encoding?text?=.
;; Group 1 is the charset, group 2 the encoding letter (B or Q) and
;; group 3 the encoded text; `rfc2047-parse-and-decode' reads groups 1
;; and 2.  `rfc2047-decode-region' binds `case-fold-search' to t while
;; searching, so the encoding letter also matches in lower case there.
;; NOTE(review): the second closing paren implies an enclosing form whose
;; opening line is not part of this listing.
484 (defvar rfc2047-encoded-word-regexp
485 "=\\?\\([^][\000-\040()<>@,\;:\\\"/?.=]+\\)\\?\\(B\\|Q\\)\\?\
486 \\([!->@-~ +]+\\)\\?="))
;; NOTE(review): several body lines are missing from this listing (the
;; embedded numbering is not contiguous), including the remaining `let'
;; bindings and the separator part of the first regexp.
488 (defun rfc2047-decode-region (start end
)
489 "Decode MIME-encoded words in region between START and END."
;; Searches below ignore case.
491 (let ((case-fold-search t
)
;; UNDOING is non-nil when `buffer-undo-list' is not t.
492 (undoing (not (eq t buffer-undo-list
)))
498 (narrow-to-region start end
)
499 (goto-char (point-min))
500 ;; Remove whitespace between encoded words.
;; Find two encoded words in a row and delete everything between the end
;; of the first group and the start of group 6.
501 (while (re-search-forward
503 (concat "\\(" rfc2047-encoded-word-regexp
"\\)"
505 "\\(" rfc2047-encoded-word-regexp
"\\)"))
507 (delete-region (goto-char (match-end 1)) (match-beginning 6)))
508 ;; Decode the encoded words.
509 (setq b
(goto-char (point-min)))
;; Each match is decoded in place by `rfc2047-parse-and-decode'.
510 (while (re-search-forward rfc2047-encoded-word-regexp nil t
)
511 (setq e
(match-beginning 0))
512 (rfc2047-parse-and-decode (match-beginning 0) (match-end 0)))
;; Text from B to the end is decoded with `mail-parse-charset' in a
;; multibyte session, unless that charset is `us-ascii' or
;; `gnus-decoded' (one further condition is not visible in this listing).
513 (when (and (mm-multibyte-p)
515 (not (eq mail-parse-charset
'us-ascii
))
516 (not (eq mail-parse-charset
'gnus-decoded
)))
517 (mm-decode-coding-region b
(point-max) mail-parse-charset
))
;; Finally unfold the whole narrowed region.
518 (rfc2047-unfold-region (point-min) (point-max))))
520 (buffer-disable-undo)))))
;; NOTE(review): the docstring says quoted-printable, but the visible body
;; calls `rfc2047-decode-region', which decodes MIME encoded words (both B
;; and Q encodings) -- the docstring looks too narrow.
;; NOTE(review): the lines that set up the buffer holding STRING and that
;; return the result are missing from this listing (the embedded numbering
;; is not contiguous).
522 (defun rfc2047-decode-string (string)
523 "Decode the quoted-printable-encoded STRING and return the results."
;; M records whether the session is multibyte, as reported by
;; `mm-multibyte-p'.
524 (let ((m (mm-multibyte-p)))
527 (mm-enable-multibyte))
;; Decode the whole accessible region of the current buffer.
530 (rfc2047-decode-region (point-min) (point-max)))
;; Visible steps: narrow to B..E and require the whole region to match
;; `rfc2047-encoded-word-regexp'.  CHARSET comes from group 1 and ENCODING
;; from group 2 (upper-cased).  The leading part, up to and including the
;; question mark after the encoding letter, is deleted, as are the last
;; two characters of the region; the remaining text is handed to
;; `rfc2047-decode'.
;; NOTE(review): the docstring speaks of WORD although the parameters are
;; B and E, and it is cut off in this listing; several body lines are
;; missing as well (the embedded numbering is not contiguous).
533 (defun rfc2047-parse-and-decode (b e
)
534 "Decode WORD and return it if it is an encoded word.
537 (narrow-to-region b e
)
539 (when (looking-at (eval-when-compile
540 (concat "\\`" rfc2047-encoded-word-regexp
"\\'")))
542 (let ((charset (match-string 1))
543 (encoding (upcase (match-string 2))))
545 (delete-region (match-beginning 0) (1+ (match-end 2)))
546 (delete-region (- (point-max) 2) (point-max))
547 (rfc2047-decode charset encoding
(point-min) (point-max)))
548 ;; If we get an error, undo the change
;; NOTE(review): several body lines are missing from this listing (the
;; embedded numbering skips 567, 569, 571 and 573), so the comments below
;; describe only the visible steps.
551 (defun rfc2047-decode (charset encoding b e
)
552 "Decode from the given MIME CHARSET in the given ENCODING in region B to E.
553 Valid ENCODINGs are \"B\" and \"Q\".
554 If your Emacs implementation can't decode CHARSET, return nil."
;; Accept the charset as a string: convert it to a lower-case symbol.
555 (if (stringp charset
)
556 (setq charset
(intern (downcase charset
))))
;; Fall back to `mail-parse-charset' when no charset was given or when
;; `mail-parse-ignored-charsets' says to ignore it (it is or contains
;; `gnus-all', or it contains CHARSET).
557 (if (or (not charset
)
558 (eq 'gnus-all mail-parse-ignored-charsets
)
559 (memq 'gnus-all mail-parse-ignored-charsets
)
560 (memq charset mail-parse-ignored-charsets
))
561 (setq charset mail-parse-charset
))
562 (let ((cs (mm-charset-to-coding-system charset
)))
;; No coding system for CHARSET and `gnus-unknown' is in the ignored
;; list: use the coding system of `mail-parse-charset' instead.
563 (if (and (not cs
) charset
564 (listp mail-parse-ignored-charsets
)
565 (memq 'gnus-unknown mail-parse-ignored-charsets
))
566 (setq cs
(mm-charset-to-coding-system mail-parse-charset
)))
;; A coding system of `ascii' is replaced by `mail-parse-charset' (part
;; of this condition is not visible in this listing).
568 (when (and (eq cs
'ascii
)
570 (setq cs mail-parse-charset
))
572 (narrow-to-region b e
)
;; Undo the transfer encoding: base64 for B; for Q, underscores become
;; spaces before quoted-printable decoding; anything else is an error.
574 ((equal "B" encoding
)
575 (base64-decode-region b e
))
576 ((equal "Q" encoding
)
577 (subst-char-in-region b e ?_ ? t
)
578 (quoted-printable-decode-region b e
))
579 (t (error "Invalid encoding: %s" encoding
)))
;; Then decode the text in the narrowed region with coding system CS.
580 (mm-decode-coding-region (point-min) (point-max) cs
)))))
584 ;;; rfc2047.el ends here