Commit | Line | Data |
---|---|---|
c38e0c97 | 1 | ;;; utf7.el --- UTF-7 encoding/decoding for Emacs -*-coding: utf-8;-*- |
e84b4b86 | 2 | |
ab422c4d | 3 | ;; Copyright (C) 1999-2013 Free Software Foundation, Inc. |
c113de23 GM |
4 | |
5 | ;; Author: Jon K Hellan <hellan@acm.org> | |
23f87bed | 6 | ;; Maintainer: bugs@gnus.org |
c113de23 GM |
7 | ;; Keywords: mail |
8 | ||
9 | ;; This file is part of GNU Emacs. | |
10 | ||
5e809f55 | 11 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
c113de23 | 12 | ;; it under the terms of the GNU General Public License as published by |
5e809f55 GM |
13 | ;; the Free Software Foundation, either version 3 of the License, or |
14 | ;; (at your option) any later version. | |
c113de23 GM |
15 | |
16 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
17 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 | ;; GNU General Public License for more details. | |
20 | ||
21 | ;; You should have received a copy of the GNU General Public License | |
5e809f55 | 22 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
c113de23 GM |
23 | |
24 | ;;; Commentary: | |
23f87bed MB |
25 | |
26 | ;; UTF-7 - A Mail-Safe Transformation Format of Unicode - RFC 2152 | |
27 | ;; This is a transformation format of Unicode that contains only 7-bit | |
28 | ;; ASCII octets and is intended to be readable by humans in the limiting | |
29 | ;; case that the document consists of characters from the US-ASCII | |
30 | ;; repertoire. | |
31 | ;; In short, runs of characters outside US-ASCII are encoded as base64 | |
32 | ;; inside delimiters. | |
33 | ;; A variation of UTF-7 is specified in IMAP 4rev1 (RFC 2060) as the way | |
34 | ;; to represent characters outside US-ASCII in mailbox names in IMAP. | |
35 | ;; This library supports both variants, but the IMAP variation was the | |
36 | ;; reason I wrote it. | |
37 | ;; The routines convert UTF-7 -> UTF-16 (16 bit encoding of Unicode) | |
38 | ;; -> current character set, and vice versa. | |
39 | ;; However, until Emacs supports Unicode, the only Emacs character set | |
40 | ;; supported here is ISO-8859.1, which can trivially be converted to/from | |
41 | ;; Unicode. | |
42 | ;; When decoding results in a character outside the Emacs character set, | |
43 | ;; an error is thrown. It is up to the application to recover. | |
44 | ||
45 | ;; UTF-7 should be done by providing a coding system. Mule-UCS does | |
46 | ;; already, but I don't know if it does the IMAP version and it's not | |
47 | ;; clear whether that should really be a coding system. The UTF-16 | |
48 | ;; part of the conversion can be done with coding systems available | |
49 | ;; with Mule-UCS or some versions of Emacs. Unfortunately these were | |
50 | ;; done wrongly (regarding handling of byte-order marks and how the | |
51 | ;; variants were named), so we don't have a consistent name for the | |
52 | ;; necessary coding system. The code below doesn't seem to DTRT | |
53 | ;; generally. E.g.: | |
54 | ;; | |
c38e0c97 | 55 | ;; (utf7-encode "a+£") |
23f87bed MB |
56 | ;; => "a+ACsAow-" |
57 | ;; | |
c38e0c97 | 58 | ;; $ echo "a+£"|iconv -f utf-8 -t utf-7 |
23f87bed MB |
59 | ;; a+-+AKM |
60 | ;; | |
bf247b6e | 61 | ;; -- fx |
23f87bed | 62 | |
c113de23 GM |
63 | |
64 | ;;; Code: | |
65 | ||
66 | (require 'base64) | |
5ce3df57 | 67 | (eval-when-compile (require 'cl)) |
23f87bed | 68 | (require 'mm-util) |
c113de23 | 69 | |
23f87bed | 70 | (defconst utf7-direct-encoding-chars " -%'-*,-[]-}" |
c113de23 GM |
71 | "Character ranges which do not need escaping in UTF-7.") |
72 | ||
23f87bed | 73 | (defconst utf7-imap-direct-encoding-chars |
c113de23 GM |
74 | (concat utf7-direct-encoding-chars "+\\~") |
75 | "Character ranges which do not need escaping in the IMAP variant of UTF-7.") | |
76 | ||
23f87bed MB |
77 | (defconst utf7-utf-16-coding-system |
78 | (cond ((mm-coding-system-p 'utf-16-be-no-signature) ; Mule-UCS | |
79 | 'utf-16-be-no-signature) | |
0c43b6f8 | 80 | ((and (mm-coding-system-p 'utf-16-be) ; Emacs |
23f87bed MB |
81 | ;; Avoid versions with BOM. |
82 | (= 2 (length (encode-coding-string "a" 'utf-16-be)))) | |
83 | 'utf-16-be) | |
84 | ((mm-coding-system-p 'utf-16-be-nosig) ; ? | |
85 | 'utf-16-be-nosig)) | |
86 | "Coding system which encodes big endian UTF-16 without a BOM signature.") | |
87 | ||
c113de23 GM |
88 | (defsubst utf7-imap-get-pad-length (len modulus) |
89 | "Return required length of padding for IMAP modified base64 fragment." | |
90 | (mod (- len) modulus)) | |
91 | ||
92 | (defun utf7-encode-internal (&optional for-imap) | |
93 | "Encode text in (temporary) buffer as UTF-7. | |
94 | Use IMAP modification if FOR-IMAP is non-nil." | |
95 | (let ((start (point-min)) | |
96 | (end (point-max))) | |
97 | (narrow-to-region start end) | |
98 | (goto-char start) | |
23f87bed MB |
99 | (let* ((esc-char (if for-imap ?& ?+)) |
100 | (direct-encoding-chars | |
101 | (if for-imap utf7-imap-direct-encoding-chars | |
102 | utf7-direct-encoding-chars)) | |
103 | (not-direct-encoding-chars (concat "^" direct-encoding-chars))) | |
c113de23 GM |
104 | (while (not (eobp)) |
105 | (skip-chars-forward direct-encoding-chars) | |
106 | (unless (eobp) | |
107 | (insert esc-char) | |
108 | (let ((p (point)) | |
109 | (fc (following-char)) | |
110 | (run-length | |
23f87bed | 111 | (skip-chars-forward not-direct-encoding-chars))) |
c113de23 GM |
112 | (if (and (= fc esc-char) |
113 | (= run-length 1)) ; Lone esc-char? | |
d355a0b7 | 114 | (delete-char -1) ; Now there's one too many |
c113de23 GM |
115 | (utf7-fragment-encode p (point) for-imap)) |
116 | (insert "-"))))))) | |
117 | ||
118 | (defun utf7-fragment-encode (start end &optional for-imap) | |
119 | "Encode text from START to END in buffer as UTF-7 escape fragment. | |
120 | Use IMAP modification if FOR-IMAP is non-nil." | |
121 | (save-restriction | |
122 | (narrow-to-region start end) | |
123 | (funcall (utf7-get-u16char-converter 'to-utf-16)) | |
23f87bed MB |
124 | (mm-with-unibyte-current-buffer |
125 | (base64-encode-region start (point-max))) | |
c113de23 GM |
126 | (goto-char start) |
127 | (let ((pm (point-max))) | |
128 | (when for-imap | |
129 | (while (search-forward "/" nil t) | |
130 | (replace-match ","))) | |
131 | (skip-chars-forward "^= \t\n" pm) | |
132 | (delete-region (point) pm)))) | |
133 | ||
134 | (defun utf7-decode-internal (&optional for-imap) | |
135 | "Decode UTF-7 text in (temporary) buffer. | |
136 | Use IMAP modification if FOR-IMAP is non-nil." | |
137 | (let ((start (point-min)) | |
138 | (end (point-max))) | |
139 | (goto-char start) | |
140 | (let* ((esc-pattern (concat "^" (char-to-string (if for-imap ?& ?+)))) | |
141 | (base64-chars (concat "A-Za-z0-9+" | |
142 | (char-to-string (if for-imap ?, ?/))))) | |
143 | (while (not (eobp)) | |
144 | (skip-chars-forward esc-pattern) | |
145 | (unless (eobp) | |
146 | (forward-char) | |
147 | (let ((p (point)) | |
148 | (run-length (skip-chars-forward base64-chars))) | |
149 | (when (and (not (eobp)) (= (following-char) ?-)) | |
150 | (delete-char 1)) | |
151 | (unless (= run-length 0) ; Encoded lone esc-char? | |
152 | (save-excursion | |
153 | (utf7-fragment-decode p (point) for-imap) | |
154 | (goto-char p) | |
d355a0b7 | 155 | (delete-char -1))))))))) |
c113de23 GM |
156 | |
157 | (defun utf7-fragment-decode (start end &optional for-imap) | |
158 | "Decode base64 encoded fragment from START to END of UTF-7 text in buffer. | |
159 | Use IMAP modification if FOR-IMAP is non-nil." | |
160 | (save-restriction | |
161 | (narrow-to-region start end) | |
162 | (when for-imap | |
163 | (goto-char start) | |
164 | (while (search-forward "," nil 'move-to-end) (replace-match "/"))) | |
165 | (let ((pl (utf7-imap-get-pad-length (- end start) 4))) | |
166 | (insert (make-string pl ?=)) | |
167 | (base64-decode-region start (+ end pl))) | |
168 | (funcall (utf7-get-u16char-converter 'from-utf-16)))) | |
169 | ||
170 | (defun utf7-get-u16char-converter (which-way) | |
171 | "Return a function to convert between UTF-16 and current character set." | |
23f87bed MB |
172 | (if utf7-utf-16-coding-system |
173 | (if (eq which-way 'to-utf-16) | |
174 | (lambda () | |
175 | (encode-coding-region (point-min) (point-max) | |
176 | utf7-utf-16-coding-system)) | |
177 | (lambda () | |
178 | (decode-coding-region (point-min) (point-max) | |
179 | utf7-utf-16-coding-system))) | |
180 | ;; Add test to check if we are really Latin-1. | |
181 | (if (eq which-way 'to-utf-16) | |
182 | 'utf7-latin1-u16-char-converter | |
183 | 'utf7-u16-latin1-char-converter))) | |
c113de23 GM |
184 | |
185 | (defun utf7-latin1-u16-char-converter () | |
186 | "Convert latin 1 (ISO-8859.1) characters to 16 bit Unicode. | |
187 | Characters are converted to raw byte pairs in narrowed buffer." | |
23f87bed MB |
188 | (mm-encode-coding-region (point-min) (point-max) 'iso-8859-1) |
189 | (mm-disable-multibyte) | |
c113de23 GM |
190 | (goto-char (point-min)) |
191 | (while (not (eobp)) | |
192 | (insert 0) | |
193 | (forward-char))) | |
194 | ||
195 | (defun utf7-u16-latin1-char-converter () | |
196 | "Convert 16 bit Unicode characters to latin 1 (ISO-8859.1). | |
197 | Characters are in raw byte pairs in narrowed buffer." | |
198 | (goto-char (point-min)) | |
199 | (while (not (eobp)) | |
200 | (if (= 0 (following-char)) | |
201 | (delete-char 1) | |
202 | (error "Unable to convert from Unicode")) | |
23f87bed MB |
203 | (forward-char)) |
204 | (mm-decode-coding-region (point-min) (point-max) 'iso-8859-1) | |
205 | (mm-enable-multibyte)) | |
c113de23 | 206 | |
20a673b2 | 207 | ;;;###autoload |
c113de23 GM |
208 | (defun utf7-encode (string &optional for-imap) |
209 | "Encode UTF-7 STRING. Use IMAP modification if FOR-IMAP is non-nil." | |
f21470ef RS |
210 | (if (and (coding-system-p 'utf-7) (coding-system-p 'utf-7-imap)) |
211 | ;; Emacs 23 with proper support for IMAP | |
212 | (encode-coding-string string (if for-imap 'utf-7-imap 'utf-7)) | |
5f3710a2 GM |
213 | (mm-with-multibyte-buffer |
214 | (insert string) | |
215 | (utf7-encode-internal for-imap) | |
216 | (buffer-string)))) | |
c113de23 GM |
217 | |
218 | (defun utf7-decode (string &optional for-imap) | |
219 | "Decode UTF-7 STRING. Use IMAP modification if FOR-IMAP is non-nil." | |
f21470ef RS |
220 | (if (and (coding-system-p 'utf-7) (coding-system-p 'utf-7-imap)) |
221 | ;; Emacs 23 with proper support for IMAP | |
222 | (decode-coding-string string (if for-imap 'utf-7-imap 'utf-7)) | |
5f3710a2 GM |
223 | (mm-with-unibyte-buffer |
224 | (insert string) | |
225 | (utf7-decode-internal for-imap) | |
226 | (mm-enable-multibyte) | |
227 | (buffer-string)))) | |
c113de23 GM |
228 | |
229 | (provide 'utf7) | |
230 | ||
231 | ;;; utf7.el ends here |