Commit | Line | Data |
---|---|---|
55535639 | 1 | ;;; codepage.el --- MS-DOS/MS-Windows specific coding systems |
75e98450 | 2 | |
2fd125a3 | 3 | ;; Copyright (C) 1998, 1999, 2000, 2002 Free Software Foundation, Inc. |
d7a0267c | 4 | ;; Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 |
2fd125a3 KH |
5 | ;; National Institute of Advanced Industrial Science and Technology (AIST) |
6 | ;; Registration Number H14PRO021 | |
75e98450 EZ |
7 | |
8 | ;; Author: Eli Zaretskii | |
9 | ;; Maintainer: FSF | |
0ac646aa | 10 | ;; Keywords: i18n ms-dos ms-windows codepage |
75e98450 EZ |
11 | |
12 | ;; This file is part of GNU Emacs. | |
13 | ||
14 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
15 | ;; it under the terms of the GNU General Public License as published by | |
d7142f3e | 16 | ;; the Free Software Foundation; either version 3, or (at your option) |
75e98450 EZ |
17 | ;; any later version. |
18 | ||
19 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
20 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
21 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
22 | ;; GNU General Public License for more details. | |
23 | ||
24 | ;; You should have received a copy of the GNU General Public License | |
25 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
3a35cf56 LK |
26 | ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
27 | ;; Boston, MA 02110-1301, USA. | |
75e98450 EZ |
28 | |
29 | ;;; Commentary: | |
30 | ||
0ac646aa | 31 | ;; Special coding systems for DOS/Windows codepage support. |
75e98450 | 32 | ;; |
0ac646aa EZ |
33 | ;; These coding systems perform conversion from the DOS/Windows |
34 | ;; codepage encoding to one of the ISO-8859 character sets. Each | |
35 | ;; codepage has its corresponding ISO-8859 charset, chosen so as to be | |
36 | ;; able to convert all (or most) of the characters. The idea is that | |
37 | ;; Emacs internally works with the usual MULE charsets, and the | |
38 | ;; conversion to and from the DOS codepage is performed on I/O only. | |
39 | ;; See term/internal.el for the complementary setup of the DOS | |
40 | ;; terminal display and input methods. | |
75e98450 EZ |
41 | ;; |
42 | ;; Thanks to Ken'ichi Handa <handa@etl.go.jp> for writing the CCL | |
a1506d29 | 43 | ;; encoders/decoders, and for help in debugging this code. |
75e98450 EZ |
44 | |
45 | ;;; Code: | |
46 | ||
e5b05b08 JB |
47 | (defvar dos-unsupported-char-glyph) |
48 | ||
75e98450 EZ |
49 | (defun cp-coding-system-for-codepage-1 (coding mnemonic iso-name |
50 | decoder encoder) | |
51 | "Make coding system CODING for a DOS codepage using translation tables. | |
52 | MNEMONIC is a character to be displayed on mode line for the coding system. | |
53 | ISO-NAME is the name of the ISO-8859 charset which corresponds to this | |
54 | codepage. | |
55 | DECODER is a translation table for converting characters in the DOS codepage | |
56 | encoding to Emacs multibyte characters. | |
57 | ENCODER is a translation table for encoding Emacs multibyte characters into | |
0ac646aa | 58 | external DOS codepage codes." |
75e98450 EZ |
59 | (save-match-data |
60 | (let* ((coding-name (symbol-name coding)) | |
a7bc7c2a EZ |
61 | (undef (if (eq system-type 'ms-dos) |
62 | (if dos-unsupported-char-glyph | |
63 | (logand dos-unsupported-char-glyph 255) | |
64 | 127) | |
65 | ??)) | |
ba2a2753 | 66 | (safe-chars (make-char-table 'safe-chars)) |
7bd53962 | 67 | (ccl-decoder |
776ca83d | 68 | (ccl-compile |
7bd53962 KH |
69 | ;; The 4 here supplies the buf_magnification parameter |
70 | ;; for the CCL program. A multibyte character may take | |
0ac646aa | 71 | ;; at most 4 bytes. |
776ca83d EZ |
72 | `(4 (loop (read r1) |
73 | (if (r1 >= 128) | |
74 | ((r0 = ,(charset-id 'ascii)) | |
75 | (translate-character ,decoder r0 r1) | |
7bd53962 | 76 | (write-multibyte-character r0 r1)) |
776ca83d EZ |
77 | (write r1)) |
78 | (repeat))))) | |
7bd53962 | 79 | (ccl-encoder |
776ca83d | 80 | (ccl-compile |
07a96c46 AI |
81 | ;; The 2 here supplies the buf_magnification parameter for |
82 | ;; the CCL program. Since the -dos coding system generates | |
83 | ;; \r\n for each \n, a factor of 2 covers even the worst case | |
84 | ;; of empty lines with a single \n. | |
85 | `(2 (loop (read-multibyte-character r0 r1) | |
776ca83d EZ |
86 | (if (r0 != ,(charset-id 'ascii)) |
87 | ((translate-character ,encoder r0 r1) | |
88 | (if (r0 == ,(charset-id 'japanese-jisx0208)) | |
a7bc7c2a | 89 | ((r1 = ,undef) |
776ca83d EZ |
90 | (write r1))))) |
91 | (write-repeat r1)))))) | |
776ca83d | 92 | |
ba2a2753 KH |
93 | ;; Set elements of safe multibyte characters for this codepage |
94 | ;; to t in the char-table safe-chars. | |
95 | (let ((tbl (get decoder 'translation-table)) | |
96 | (i 128) | |
97 | ch) | |
98 | (while (< i 256) | |
99 | (setq ch (aref tbl i)) | |
100 | (if ch (aset safe-chars ch t)) | |
101 | (setq i (1+ i)))) | |
102 | ||
776ca83d | 103 | ;; Make coding system CODING. |
75e98450 EZ |
104 | (make-coding-system |
105 | coding 4 mnemonic | |
106 | (concat "8-bit encoding of " (symbol-name iso-name) | |
776ca83d | 107 | " characters using IBM codepage " coding-name) |
7bd53962 | 108 | (cons ccl-decoder ccl-encoder) |
a205e32a | 109 | `((safe-charsets ascii eight-bit-control eight-bit-graphic ,iso-name) |
ba2a2753 | 110 | (safe-chars . ,safe-chars) |
7bd53962 | 111 | (valid-codes (0 . 255))))))) |
75e98450 EZ |
112 | |
113 | (defun cp-decoding-vector-for-codepage (table charset offset) | |
114 | "Create a vector for decoding IBM PC characters using conversion table | |
115 | TABLE into an ISO-8859 character set CHARSET whose first non-ASCII | |
e714d2cf | 116 | character is generated by (make-char CHARSET OFFSET)." |
75e98450 EZ |
117 | (let* ((len (length table)) |
118 | (undefined-char | |
119 | (if (eq system-type 'ms-dos) | |
120 | (if dos-unsupported-char-glyph | |
121 | (logand dos-unsupported-char-glyph 255) | |
122 | 127) | |
123 | 32)) | |
124 | (vec1 (make-vector 256 undefined-char)) | |
125 | (i 0)) | |
14028d57 | 126 | (while (< i 256) |
75e98450 EZ |
127 | (aset vec1 i i) |
128 | (setq i (1+ i))) | |
129 | (setq i 0) | |
130 | (while (< i len) | |
131 | (if (aref table i) | |
132 | (aset vec1 (aref table i) (make-char charset (+ i offset)))) | |
133 | (setq i (1+ i))) | |
134 | vec1)) | |
135 | ||
136 | ;;; You don't think I created all these tables below by hand, do you? | |
137 | ;;; The following Awk script will create the table for cp850-to-Latin-1 | |
138 | ;;; conversion from the RFC 1345 file (the other tables are left as an | |
139 | ;;; exercise): | |
140 | ;;; BEGIN { n_pages = 11; | |
141 | ;;; pn["IBM437"] = 0; pn["IBM850"] = 1; pn["IBM851"] = 2; | |
142 | ;;; pn["IBM852"] = 3; pn["IBM855"] = 4; pn["IBM860"] = 5; | |
143 | ;;; pn["IBM861"] = 6; pn["IBM862"] = 7; pn["IBM863"] = 8; | |
144 | ;;; pn["IBM864"] = 9; pn["IBM865"] = 10; | |
145 | ;;; } | |
146 | ;;; $1 == "&charset" { charset = $2; } | |
147 | ;;; $1 == "&code" { code = $2; } | |
148 | ;;; /^ [^&]/ { | |
149 | ;;; if ((charset ~ /^IBM(437|8(5[0125]|6[0-5]))$/) || (charset ~ /^ISO_8859-1/)) | |
150 | ;;; { | |
151 | ;;; for (i = 1; i <= NF; i++) | |
152 | ;;; chars[charset,code++] = $i; | |
153 | ;;; } | |
154 | ;;; } | |
155 | ;;; | |
156 | ;;; END { | |
157 | ;;; for (i = 160; i < 256; i++) | |
158 | ;;; { | |
159 | ;;; c = chars["ISO_8859-1:1987",i]; | |
160 | ;;; if (c == "??") # skip unused positions | |
161 | ;;; { | |
162 | ;;; printf " nil"; | |
163 | ;;; if ((i - 159)%16 == 0) | |
164 | ;;; printf "\n"; | |
165 | ;;; continue; | |
166 | ;;; } | |
167 | ;;; found = 0; | |
168 | ;;; for (j in pn) | |
169 | ;;; map[j] = "nil"; | |
170 | ;;; for (combined in chars) | |
171 | ;;; { | |
172 | ;;; candidate = chars[combined]; | |
173 | ;;; split (combined, separate, SUBSEP); | |
174 | ;;; if (separate[1] == "IBM850" && candidate == c) | |
175 | ;;; { | |
176 | ;;; found = 1; | |
177 | ;;; map[separate[1]] = separate[2]; | |
178 | ;;; } | |
179 | ;;; } | |
180 | ;;; printf " %s", map["IBM850"]; | |
181 | ;;; if ((i - 159)%16 == 0) | |
182 | ;;; printf "\n"; | |
183 | ;;; } | |
184 | ;;; } | |
185 | ||
186 | ;;; WARNING WARNING WARNING!!! | |
187 | ;;; | |
188 | ;;; If you want to get fancy with these tables, remember that the inverse | |
189 | ;;; tables, created by `cp-decoding-vector-for-codepage' above, are installed | |
190 | ;;; on MS-DOS as nonascii-translation-table (see `dos-codepage-setup' in | |
191 | ;;; internal.el). Therefore, you should NOT put any codes below 128 in | |
192 | ;;; these tables! Otherwise, various Emacs commands and functions will | |
193 | ;;; mysteriously fail! For example, a typical screwup is to map the Latin-N | |
194 | ;;; acute accent character to the apostrophe, and have all regexps which | |
195 | ;;; end with "\\'" begin to fail (e.g., the automatic setting of the major | |
196 | ;;; mode by file name extension will stop working). | |
197 | ;;; | |
198 | ;;; You HAVE BEEN warned! | |
199 | ||
200 | ;; US/English/PC-8/IBM-2. This doesn't support Latin-1 characters very | |
201 | ;; well, but why not use what we can salvage? | |
202 | (defvar cp437-decode-table | |
203 | ;; Nth element is the code of a cp437 glyph for the multibyte | |
204 | ;; character created by (make-char 'latin-iso8859-1 (+ N 160)). | |
d3fcfdea | 205 | ;; The element nil means there's no corresponding cp437 glyph. |
75e98450 EZ |
206 | [ |
207 | 255 173 155 156 nil 157 179 nil nil nil 166 174 170 196 nil nil | |
208 | 248 241 253 nil nil nil nil 249 nil nil 167 175 172 171 nil 168 | |
209 | nil nil nil nil 142 143 146 128 nil 144 nil nil nil nil nil nil | |
210 | nil 165 nil nil nil nil 153 nil nil nil nil nil 154 nil nil 225 | |
211 | 133 160 131 nil 132 134 145 135 138 130 136 137 141 161 140 139 | |
212 | nil 164 149 162 147 nil 148 246 nil 151 163 150 129 nil nil 152] | |
213 | "Table for converting ISO-8859-1 characters into codepage 437 glyphs.") | |
214 | (setplist 'cp437-decode-table | |
215 | '(charset latin-iso8859-1 language "Latin-1" offset 160)) | |
216 | ||
217 | ;; Multilingual (Latin-1) | |
218 | (defvar cp850-decode-table | |
219 | ;; Nth element is the code of a cp850 glyph for the multibyte | |
220 | ;; character created by (make-char 'latin-iso8859-1 (+ N 160)). | |
221 | ;; The element nil means there's no corresponding cp850 glyph. | |
222 | [ | |
fbb0d4f9 | 223 | 255 173 189 156 207 190 221 245 249 184 166 174 170 240 169 238 |
dd73abe6 | 224 | 248 241 253 252 239 230 244 250 247 251 167 175 172 171 243 168 |
75e98450 | 225 | 183 181 182 199 142 143 146 128 212 144 210 211 222 214 215 216 |
fbb0d4f9 | 226 | 209 165 227 224 226 229 153 158 157 235 233 234 154 237 232 225 |
75e98450 | 227 | 133 160 131 198 132 134 145 135 138 130 136 137 141 161 140 139 |
fbb0d4f9 | 228 | 208 164 149 162 147 228 148 246 155 151 163 150 129 236 231 152] |
75e98450 EZ |
229 | "Table for converting ISO-8859-1 characters into codepage 850 glyphs.") |
230 | (setplist 'cp850-decode-table | |
231 | '(charset latin-iso8859-1 language "Latin-1" offset 160)) | |
232 | ||
fbb0d4f9 EZ |
233 | ;; Multilingual (Latin-9) |
234 | (defvar cp858-decode-table | |
235 | ;; Nth element is the code of a cp858 glyph for the multibyte | |
236 | ;; character created by (make-char 'latin-iso8859-15 (+ N 160)). | |
237 | ;; The element nil means there's no corresponding cp858 glyph. | |
238 | [ | |
239 | 255 173 189 156 213 190 221 245 249 184 166 174 170 240 169 238 | |
240 | 248 241 253 252 239 230 244 250 247 251 167 175 172 171 243 168 | |
241 | 183 181 182 199 142 143 146 128 212 144 210 211 222 214 215 216 | |
242 | 209 165 227 224 226 229 153 158 157 235 233 234 154 237 232 225 | |
243 | 133 160 131 198 132 134 145 135 138 130 136 137 141 161 140 139 | |
244 | 208 164 149 162 147 228 148 246 155 151 163 150 129 236 231 152] | |
245 | "Table for converting ISO-8859-15 characters into codepage 858 glyphs.") | |
246 | (setplist 'cp858-decode-table | |
247 | '(charset latin-iso8859-15 language "Latin-9" offset 160)) | |
248 | ||
75e98450 EZ |
249 | ;; Greek |
250 | (defvar cp851-decode-table | |
251 | [ | |
252 | 255 nil nil 156 nil nil nil 245 249 nil nil 174 nil 240 nil nil | |
253 | 248 241 nil nil 239 nil 134 nil 141 143 144 175 146 171 149 152 | |
254 | 161 164 165 166 167 168 169 170 172 173 181 182 184 183 189 190 | |
255 | 198 199 nil 207 208 209 210 211 212 213 nil nil 155 157 158 159 | |
256 | 252 214 215 216 221 222 224 225 226 227 228 229 230 231 232 233 | |
257 | 234 235 237 236 238 242 243 244 246 250 160 251 162 163 253 nil] | |
258 | "Table for converting ISO-8859-7 characters into codepage 851 glyphs.") | |
259 | (setplist 'cp851-decode-table | |
260 | '(charset greek-iso8859-7 language "Greek" offset 160)) | |
261 | ||
262 | ;; Slavic/Eastern Europe (Latin-2) | |
263 | (defvar cp852-decode-table | |
264 | [ | |
265 | 255 164 244 157 207 149 151 245 249 230 184 155 141 240 166 189 | |
266 | 248 165 247 136 239 150 152 243 242 231 173 156 171 241 167 190 | |
795537a2 | 267 | 232 181 182 198 142 145 143 128 172 144 168 211 183 214 215 210 |
75e98450 EZ |
268 | 209 227 213 224 226 138 153 158 252 222 233 235 154 237 221 225 |
269 | 234 160 131 199 132 146 134 135 159 130 169 137 216 161 140 212 | |
270 | 208 228 229 162 147 139 148 246 253 133 163 251 129 236 238 250] | |
271 | "Table for converting ISO-8859-2 characters into codepage 852 glyphs.") | |
272 | (setplist 'cp852-decode-table | |
273 | '(charset latin-iso8859-2 language "Latin-2" offset 160)) | |
274 | ||
275 | ;; Russian | |
276 | (defvar cp855-decode-table | |
277 | [ | |
dd73abe6 | 278 | 255 133 129 131 135 137 139 141 143 145 147 149 151 240 153 155 |
75e98450 EZ |
279 | 161 163 236 173 167 169 234 244 184 190 199 209 211 213 215 221 |
280 | 226 228 230 232 171 182 165 252 246 250 159 242 238 248 157 224 | |
281 | 160 162 235 172 166 168 233 243 183 189 198 208 210 212 214 216 | |
282 | 225 227 229 231 170 181 164 251 245 249 158 241 237 247 156 222 | |
dd73abe6 | 283 | 239 132 128 130 134 136 138 140 142 144 146 148 150 253 152 154] |
75e98450 EZ |
284 | "Table for converting ISO-8859-5 characters into codepage 855 glyphs.") |
285 | (setplist 'cp855-decode-table | |
286 | '(charset cyrillic-iso8859-5 language "Cyrillic-ISO" offset 160)) | |
287 | ||
288 | ;; Turkish | |
289 | (defvar cp857-decode-table | |
290 | [ | |
291 | 255 nil nil 156 207 nil 245 249 152 158 166 nil 240 nil | |
292 | 248 nil 253 252 239 nil nil nil nil 141 159 167 nil 171 nil | |
293 | 183 181 182 142 nil nil 128 212 144 210 211 222 214 215 216 | |
294 | 165 227 224 226 nil 153 232 nil 235 233 234 154 nil nil 225 | |
295 | 133 160 131 132 nil nil 135 138 130 136 137 236 161 140 139 | |
296 | 164 149 162 147 nil 148 246 nil 151 163 150 129 nil nil 250] | |
297 | "Table for converting ISO-8859-3 characters into codepage 857 glyphs.") | |
298 | (setplist 'cp857-decode-table | |
299 | '(charset latin-iso8859-3 language "Latin-3" offset 160)) | |
300 | ||
301 | ;; Portuguese | |
302 | (defvar cp860-decode-table | |
303 | [ | |
304 | 255 173 155 156 nil nil 179 nil nil nil 166 174 170 nil nil nil | |
305 | nil 241 253 nil nil nil nil 249 nil nil 167 175 172 171 nil 168 | |
306 | 145 134 143 142 nil nil nil 128 146 144 137 nil 152 nil 139 nil | |
307 | nil 165 159 169 140 153 nil nil nil 157 150 nil 154 nil nil nil | |
308 | 133 160 131 132 nil nil nil 135 138 130 136 nil 141 161 nil nil | |
309 | nil 164 149 162 147 148 nil 246 nil 151 163 nil 129 nil nil nil] | |
310 | "Table for converting ISO-8859-1 characters into codepage 860 glyphs.") | |
311 | (setplist 'cp860-decode-table | |
312 | '(charset latin-iso8859-1 language "Latin-1" offset 160)) | |
313 | ||
314 | ;; Icelandic | |
315 | (defvar cp861-decode-table | |
316 | [ | |
317 | 255 173 nil 156 nil nil nil nil nil nil nil 174 170 nil nil nil | |
318 | nil 241 253 nil nil nil nil 249 nil nil nil 175 172 171 nil 168 | |
319 | nil 164 nil nil 142 143 146 128 nil 144 nil nil nil 165 nil nil | |
320 | 139 nil 159 166 nil nil 153 nil 157 nil 167 nil 154 151 141 nil | |
321 | 133 160 131 nil 132 134 145 135 138 130 136 137 nil 161 nil nil | |
322 | 140 nil nil 162 147 nil 148 246 155 nil 163 150 129 152 149 nil] | |
323 | "Table for converting ISO-8859-1 characters into codepage 861 glyphs.") | |
324 | (setplist 'cp861-decode-table | |
325 | '(charset latin-iso8859-1 language "Latin-1" offset 160)) | |
326 | ||
327 | ;; Hebrew | |
328 | (defvar cp862-decode-table | |
329 | ;; Nth element is the code of a cp862 glyph for the multibyte | |
330 | ;; character created by (make-char 'hebrew-iso8859-8 (+ N 160)). | |
0ac646aa | 331 | ;; The element nil means there's no corresponding cp862 glyph. |
75e98450 EZ |
332 | [ |
333 | 255 173 155 156 nil 157 179 nil nil nil nil 174 170 196 nil nil | |
334 | 248 241 253 nil nil 230 nil 249 nil nil 246 175 172 171 nil nil | |
335 | nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil | |
336 | nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil 205 | |
337 | 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | |
338 | 144 145 146 147 148 149 150 151 152 153 154 nil nil nil nil nil] | |
339 | "Table for converting ISO-8859-8 characters into codepage 862 glyphs.") | |
340 | (setplist 'cp862-decode-table | |
341 | '(charset hebrew-iso8859-8 language "Hebrew" offset 160)) | |
342 | ||
343 | ;; French Canadian | |
344 | (defvar cp863-decode-table | |
345 | [ | |
346 | 255 nil 155 156 152 nil 160 143 164 nil nil 174 170 nil nil 167 | |
347 | nil 241 253 166 161 nil 134 249 165 nil nil 175 172 171 173 nil | |
348 | 142 nil 132 nil nil nil nil 128 145 144 146 148 nil nil 168 149 | |
349 | nil nil nil nil 153 nil nil nil nil 157 nil 158 154 nil nil nil | |
350 | 133 nil 131 nil nil nil nil 135 138 130 136 137 141 nil 140 139 | |
351 | nil nil nil 162 147 nil nil 246 nil 151 163 150 129 nil nil nil] | |
352 | "Table for converting ISO-8859-1 characters into codepage 863 glyphs.") | |
353 | (setplist 'cp863-decode-table | |
354 | '(charset latin-iso8859-1 language "Latin-1" offset 160)) | |
355 | ||
356 | ;; Arabic | |
357 | ;; FIXME: Emacs doesn't seem to support the "Arabic" language | |
358 | ;; environment yet. So this is only partially usable, for now. | |
359 | (defvar cp864-decode-table | |
360 | [ | |
361 | 255 nil nil nil 164 nil nil nil nil nil nil nil 172 161 nil nil | |
362 | nil nil nil nil nil nil nil nil nil nil nil 187 nil nil nil 191 | |
363 | nil 193 194 195 196 nil 198 199 169 201 170 171 173 174 175 207 | |
364 | 208 209 210 188 189 190 235 215 216 223 238 nil nil nil nil nil | |
365 | 224 247 248 252 251 239 242 243 232 233 253 nil nil nil nil nil | |
366 | nil 241 nil nil nil nil nil nil nil nil nil nil nil nil nil nil] | |
0ac646aa | 367 | "Table for converting ISO-8859-6 characters into codepage 864 glyphs.") |
75e98450 EZ |
368 | (setplist 'cp864-decode-table |
369 | '(charset arabic-iso8859-6 language nil offset 160)) | |
370 | ||
0ac646aa EZ |
371 | ;; Arabic OEM codepage used by Windows |
372 | ;; FIXME: Emacs doesn't seem to support the "Arabic" language | |
373 | ;; environment yet. So this is only partially usable, for now. | |
374 | (defvar cp720-decode-table | |
375 | [ | |
376 | 255 nil nil nil 148 nil nil nil nil nil nil nil nil 196 nil nil | |
377 | nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil | |
378 | nil 152 153 154 155 157 158 159 160 161 162 163 164 165 166 167 | |
379 | 168 169 170 171 172 173 224 225 226 227 228 nil nil nil nil nil | |
380 | 149 229 231 232 233 234 235 236 237 238 239 241 242 243 244 245 | |
381 | 246 145 146 nil nil nil nil nil nil nil nil nil nil nil nil nil] | |
382 | "Table for converting ISO-8859-6 characters into codepage 720 glyphs.") | |
383 | (setplist 'cp720-decode-table | |
384 | '(charset arabic-iso8859-6 language nil offset 160)) | |
385 | ||
386 | ||
75e98450 EZ |
387 | ;; Nordic (Norwegian/Danish) |
388 | (defvar cp865-decode-table | |
389 | [ | |
390 | 255 173 nil 156 nil nil nil nil nil nil 166 174 170 nil nil nil | |
391 | nil 241 253 nil nil nil nil 249 nil nil 167 175 172 171 nil 168 | |
392 | nil nil nil nil 142 143 146 128 nil 144 nil nil nil nil nil nil | |
393 | nil 165 nil nil nil nil 153 nil 157 nil nil nil 154 nil nil nil | |
394 | 133 160 131 nil 132 134 145 135 138 130 136 137 141 161 140 139 | |
395 | nil 164 149 162 147 nil 148 246 155 151 163 150 129 nil nil 152] | |
396 | "Table for converting ISO-8859-1 characters into codepage 865 glyphs.") | |
397 | (setplist 'cp865-decode-table | |
398 | '(charset latin-iso8859-1 language "Latin-1" offset 160)) | |
399 | ||
573191a1 EZ |
400 | ;; Russian (Yes, another one! This one's supposed to be used |
401 | ;; on Windows as the Russian OEM code page.) | |
402 | (defvar cp866-decode-table | |
403 | [ | |
404 | 255 240 nil nil 242 nil nil 244 nil nil nil nil nil nil 246 nil | |
405 | 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | |
406 | 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 | |
407 | 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | |
ffd5cede | 408 | 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 |
573191a1 EZ |
409 | 252 241 nil nil 243 nil nil 245 nil nil nil nil nil nil 247 nil] |
410 | "Table for converting ISO-8859-5 characters into codepage 866 glyphs.") | |
411 | (setplist 'cp866-decode-table | |
412 | '(charset cyrillic-iso8859-5 language "Cyrillic-ISO" offset 160)) | |
413 | ||
75e98450 EZ |
414 | ;; Greek (yes, another one!) |
415 | (defvar cp869-decode-table | |
416 | [ | |
417 | 255 139 140 156 nil nil 138 245 249 151 nil 174 137 240 nil 142 | |
418 | 248 241 153 154 239 247 134 136 141 143 144 175 146 171 149 152 | |
419 | 161 164 165 166 167 168 169 170 172 173 181 182 183 184 189 190 | |
420 | 198 199 nil 207 208 209 210 211 212 213 145 150 155 157 158 159 | |
421 | 252 214 215 216 221 222 224 225 226 227 228 229 230 231 232 233 | |
422 | 234 235 237 236 238 242 243 244 246 250 160 251 162 163 253 nil] | |
423 | "Table for converting ISO-8859-7 characters into codepage 869 glyphs.") | |
424 | (setplist 'cp869-decode-table | |
425 | '(charset greek-iso8859-7 language "Greek" offset 160)) | |
426 | ||
0ac646aa EZ |
427 | ;; Greek OEM codepage used by Windows |
428 | (defvar cp737-decode-table | |
429 | [ | |
430 | 255 nil nil nil nil nil 179 nil nil nil nil nil nil 196 nil nil | |
431 | 248 241 253 nil nil nil 234 250 235 236 237 nil 238 nil 239 240 | |
432 | nil 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | |
433 | 143 144 nil 145 146 147 148 149 150 151 244 245 225 226 227 229 | |
434 | nil 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 | |
435 | 167 168 170 169 171 172 173 174 175 224 228 232 230 231 233 nil] | |
436 | "Table for converting ISO-8859-7 characters into codepage 737 glyphs.") | |
437 | (setplist 'cp737-decode-table | |
438 | '(charset greek-iso8859-7 language "Greek" offset 160)) | |
439 | ||
106e96bd EZ |
440 | ;; Conversion from codepages 770-775 to Latin-4 for Baltic countries. |
441 | ;; FIXME: Once we support Latin-7, these should be remapped into it. | |
442 | (defvar cp770-decode-table | |
443 | [ | |
444 | 255 143 nil nil 155 nil 156 nil 157 159 137 168 nil 196 146 nil | |
445 | 248 133 nil nil nil nil 134 nil nil 158 136 152 nil nil 145 nil | |
446 | 160 nil nil nil 142 nil nil 173 128 nil 139 nil 144 nil nil 161 | |
447 | nil nil nil 163 nil 149 153 nil nil 167 nil nil 154 nil 166 225 | |
448 | 131 nil nil nil 132 nil nil 141 135 nil 138 nil 130 nil nil 140 | |
449 | nil nil nil 162 nil 147 148 247 nil 151 nil nil 129 nil 150 nil] | |
450 | "Table for converting ISO-8859-4 characters into codepage 770 glyphs.") | |
451 | (setplist 'cp770-decode-table | |
452 | '(charset latin-iso8859-4 language "Latin-4" offset 160)) | |
453 | ||
454 | (defvar cp773-decode-table | |
455 | [ | |
456 | 255 220 nil 138 150 nil 234 190 166 246 237 149 173 196 252 nil | |
457 | 208 nil nil 139 239 nil 235 nil nil 247 137 133 136 nil 253 nil | |
458 | 160 nil nil nil 142 143 146 244 222 144 240 nil 242 nil nil 161 | |
459 | nil 238 226 232 nil 229 153 158 157 248 nil nil 154 nil 250 225 | |
460 | 131 nil nil nil 132 134 145 245 223 130 241 nil 243 nil nil 140 | |
461 | nil 236 147 233 nil 228 148 198 155 249 nil nil 129 nil 251 nil] | |
462 | "Table for converting ISO-8859-4 characters into codepage 773 glyphs.") | |
463 | (setplist 'cp773-decode-table | |
464 | '(charset latin-iso8859-4 language "Latin-4" offset 160)) | |
465 | ||
466 | (defvar cp774-decode-table | |
467 | [ | |
468 | 255 181 nil nil 155 nil nil nil 245 190 nil nil nil 196 207 nil | |
469 | 248 208 nil nil nil nil nil nil nil 213 nil nil nil nil 216 nil | |
470 | nil nil nil nil 142 143 146 189 182 144 183 nil 184 nil nil nil | |
471 | nil nil nil nil nil nil 153 nil nil 198 nil nil 154 nil 199 225 | |
472 | nil 160 nil nil 132 134 145 212 209 130 210 137 211 161 140 nil | |
473 | nil nil nil nil 147 nil 148 246 237 214 163 150 129 nil 215 248] | |
474 | "Table for converting ISO-8859-4 characters into codepage 774 glyphs.") | |
475 | (setplist 'cp774-decode-table | |
476 | '(charset latin-iso8859-4 language "Latin-4" offset 160)) | |
477 | ||
75e98450 EZ |
478 | (defvar cp775-decode-table |
479 | [ | |
480 | 255 181 nil 138 150 nil 234 245 166 190 237 149 173 240 207 nil | |
106e96bd | 481 | 248 208 nil 139 239 nil 235 nil nil 213 137 133 136 nil 216 nil |
75e98450 EZ |
482 | 160 nil nil nil 142 143 146 189 182 144 183 nil 184 nil nil 161 |
483 | nil 238 226 232 nil 229 153 158 157 198 nil nil 154 nil 199 225 | |
484 | 131 nil nil nil 132 134 145 212 209 130 210 nil 211 nil nil 140 | |
106e96bd | 485 | nil 236 147 233 nil 228 148 247 155 214 nil nil 129 nil 215 nil] |
75e98450 EZ |
486 | "Table for converting ISO-8859-4 characters into codepage 775 glyphs.") |
487 | (setplist 'cp775-decode-table | |
488 | '(charset latin-iso8859-4 language "Latin-4" offset 160)) | |
489 | ||
7e37faa3 EZ |
490 | ;; Support for the Windows 12xx series of codepages that MS has |
491 | ;; butchered from the ISO-8859 specs. This does not add support for | |
492 | ;; the extended characters that MS has added in the 128 - 159 coding | |
493 | ;; range, only translates those characters that can be expressed in | |
0ac646aa | 494 | ;; the corresponding iso-8859 charset. |
7e37faa3 EZ |
495 | |
496 | ;; Codepage Mapping: | |
497 | ;; | |
498 | ;; Windows-1250: ISO-8859-2 (Central Europe) - differs in some positions | |
499 | ;; Windows-1251: ISO-8859-5 (Cyrillic) - differs wildly | |
500 | ;; Windows-1252: ISO-8859-1 (West Europe) - exact match | |
501 | ;; Windows-1253: ISO-8859-7 (Greek) - differs in some positions | |
502 | ;; Windows-1254: ISO-8859-9 (Turkish) - exact match | |
503 | ;; Windows-1255: ISO-8859-8 (Hebrew) - exact match | |
504 | ;; Windows-1256: ISO-8859-6 (Arabic) - half match | |
505 | ;; Windows-1257: ISO-8859-4 (Baltic) - differs, future Latin-7 | |
506 | ;; Windows-1258: VISCII (Vietnamese) - Completely different | |
507 | ||
508 | (defvar cp1250-decode-table | |
509 | [ | |
510 | 160 165 162 163 164 188 140 167 168 138 170 141 143 173 142 175 | |
511 | 176 185 178 179 180 190 156 161 184 154 186 157 159 189 158 191 | |
512 | 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | |
513 | 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | |
514 | 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 | |
515 | 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 ] | |
adab83c8 | 516 | "ISO-8859-2 to Windows-1250 (Central Europe) codepage decoding table.") |
7e37faa3 EZ |
517 | (setplist 'cp1250-decode-table |
518 | '(charset latin-iso8859-2 language "Latin-2" offset 160)) | |
519 | ||
520 | (defvar cp1251-decode-table | |
521 | [ | |
522 | 160 168 128 129 170 189 178 175 163 138 140 142 141 173 161 143 | |
523 | 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | |
524 | 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | |
525 | 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 | |
526 | 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 | |
527 | 185 184 144 131 186 190 179 191 188 154 156 158 157 167 162 159 ] | |
adab83c8 | 528 | "ISO-8859-5 to Windows-1251 (Cyrillic) codepage decoding table.") |
7e37faa3 EZ |
529 | (setplist 'cp1251-decode-table |
530 | '(charset cyrillic-iso8859-5 language "Cyrillic-ISO" offset 160)) | |
531 | ||
532 | ;; cp1253 is missing nbsp so we cannot quite translate perfectly. It | |
533 | ;; also has two micro/mu characters which would require more complex | |
534 | ;; processing to accommodate. | |
535 | (defvar cp1253-decode-table | |
536 | [ | |
537 | nil 145 146 163 nil nil 166 167 168 169 nil 171 172 173 nil 151 | |
538 | 176 177 178 179 180 161 162 183 184 185 186 187 188 189 190 191 | |
539 | 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | |
540 | 208 209 nil 211 212 213 214 215 216 217 218 219 220 221 222 223 | |
541 | 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 | |
542 | 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 nil ] | |
adab83c8 | 543 | "ISO-8859-7 to Windows-1253 (Greek) codepage decoding table.") |
7e37faa3 EZ |
544 | (setplist 'cp1253-decode-table |
545 | '(charset greek-iso8859-7 language "Greek" offset 160)) | |
546 | ||
547 | ;; Since Latin-7 is not yet official, and Emacs does not support it, | |
548 | ;; provide translation between Windows-1257 and Latin-4 the best we | |
549 | ;; can. | |
550 | (defvar cp1257-decode-table | |
551 | [ | |
552 | 160 192 nil 170 164 nil 207 167 nil 208 199 204 nil 173 222 nil | |
553 | 176 224 nil 186 nil nil 239 nil nil 240 231 236 nil nil 254 nil | |
554 | 194 nil nil nil 196 197 175 193 200 201 198 nil 203 nil nil 206 | |
555 | nil 210 212 205 nil 213 214 215 168 216 nil nil 220 nil 219 223 | |
556 | 226 nil nil nil 228 229 191 225 232 233 230 nil 235 nil nil 238 | |
557 | nil 242 244 237 nil 245 246 247 184 248 nil nil 252 nil 251 nil ] | |
adab83c8 | 558 | "ISO-8859-4 to Windows-1257 (Baltic) codepage decoding table.") |
7e37faa3 EZ |
559 | (setplist 'cp1257-decode-table |
560 | '(charset latin-iso8859-4 language "Latin-4" offset 160)) | |
561 | ||
75e98450 EZ |
562 | ;;;###autoload |
563 | (defun cp-make-coding-systems-for-codepage (codepage iso-name offset) | |
708b0815 | 564 | "Create a coding system to convert IBM CODEPAGE into charset ISO-NAME |
75e98450 EZ |
565 | whose first character is at offset OFFSET from the beginning of 8-bit |
566 | ASCII table. | |
567 | ||
708b0815 EZ |
568 | The created coding system has the usual 3 subsidiary systems: for Unix-, |
569 | DOS- and Mac-style EOL conversion. However, unlike built-in coding | |
570 | systems, the Mac-style EOL conversion is currently not supported by the | |
571 | decoder and encoder created by this function." | |
75e98450 EZ |
572 | (let* ((decode-table (intern (format "%s-decode-table" codepage))) |
573 | (nonascii-table | |
574 | (intern (format "%s-nonascii-translation-table" codepage))) | |
575 | (decode-translation | |
576 | (intern (format "%s-decode-translation-table" codepage))) | |
577 | (encode-translation | |
776ca83d | 578 | (intern (format "%s-encode-translation-table" codepage)))) |
75e98450 EZ |
579 | (set nonascii-table |
580 | (make-translation-table-from-vector | |
581 | (cp-decoding-vector-for-codepage | |
582 | (symbol-value decode-table) iso-name offset))) | |
583 | (define-translation-table encode-translation | |
584 | (char-table-extra-slot (symbol-value nonascii-table) 0)) | |
0d35b92b EZ |
585 | ;; For charsets other than ascii, eight-bit-* and ISO-NAME, set |
586 | ;; `?' for one-column charsets, and some Japanese character for | |
75e98450 | 587 | ;; wide-column charsets. The CCL encoder converts that Japanese |
a7bc7c2a | 588 | ;; character to either dos-unsupported-char-glyph or "??". |
75e98450 | 589 | (let ((tbl (char-table-extra-slot (symbol-value nonascii-table) 0)) |
a7bc7c2a EZ |
590 | (undef (if (eq system-type 'ms-dos) |
591 | (if dos-unsupported-char-glyph | |
592 | (logand dos-unsupported-char-glyph 255) | |
593 | 127) | |
594 | ??)) | |
0d35b92b EZ |
595 | (charsets (delq 'ascii |
596 | (delq 'eight-bit-control | |
597 | (delq 'eight-bit-graphic | |
598 | (delq iso-name | |
599 | (copy-sequence charset-list)))))) | |
75e98450 EZ |
600 | (wide-column-char (make-char 'japanese-jisx0208 32 32))) |
601 | (while charsets | |
602 | (aset tbl (make-char (car charsets)) | |
a7bc7c2a | 603 | (if (= (charset-width (car charsets)) 1) undef wide-column-char)) |
75e98450 EZ |
604 | (setq charsets (cdr charsets)))) |
605 | (define-translation-table decode-translation | |
606 | (symbol-value nonascii-table)) | |
607 | (cp-coding-system-for-codepage-1 | |
776ca83d EZ |
608 | (intern codepage) ?D iso-name decode-translation encode-translation) |
609 | )) | |
75e98450 EZ |
610 | |
(defun cp-codepage-decoder (codepage)
  "Return the symbol of the decode table for CODEPAGE, or nil.
CODEPAGE may be a string or a symbol.  The result is the already
interned symbol named CODEPAGE-decode-table; if no such symbol exists,
or CODEPAGE is neither a string nor a symbol, return nil."
  (let ((name (if (symbolp codepage)
                  (symbol-name codepage)
                codepage)))
    (and (stringp name)
         (intern-soft (format "%s-decode-table" name)))))
619 | ||
620 | ;;;###autoload | |
(defun cp-charset-for-codepage (codepage)
  "Return the charset for which there is a translation table to DOS CODEPAGE.
CODEPAGE must be the name of a DOS codepage, a string.
The value is the `charset' property of the codepage's decode table
symbol.  Signal an error if CODEPAGE has no decode table."
  (let ((table-sym (cp-codepage-decoder codepage)))
    (unless table-sym
      (error "Unsupported codepage %s" codepage))
    (get table-sym 'charset)))
628 | ||
629 | ;;;###autoload | |
(defun cp-language-for-codepage (codepage)
  "Return the name of the MULE language environment for CODEPAGE.
CODEPAGE must be the name of a DOS codepage, a string.
The value is the `language' property of the codepage's decode table
symbol.  Signal an error if CODEPAGE has no decode table."
  (let ((table-sym (cp-codepage-decoder codepage)))
    (unless table-sym
      (error "Unsupported codepage %s" codepage))
    (get table-sym 'language)))
637 | ||
638 | ;;;###autoload | |
(defun cp-offset-for-codepage (codepage)
  "Return the offset to be used in setting up coding systems for CODEPAGE.
CODEPAGE must be the name of a DOS codepage, a string.
The value is the `offset' property of the codepage's decode table
symbol.  Signal an error if CODEPAGE has no decode table."
  (let ((table-sym (cp-codepage-decoder codepage)))
    (unless table-sym
      (error "Unsupported codepage %s" codepage))
    (get table-sym 'offset)))
646 | ||
647 | ;;;###autoload | |
(defun cp-supported-codepages ()
  "Return an alist of supported codepages.

Each association in the alist has the form (NNN . CHARSET), where NNN is the
codepage number, as a string, and CHARSET is the MULE charset which is the
closest match for the character set supported by that codepage.

A codepage NNN is supported if a variable called `cpNNN-decode-table' exists,
is a vector, and has a charset property."
  (save-match-data
    (let ((result nil))
      ;; Scan every interned symbol and keep those that look like a
      ;; bound cpNNN-decode-table vector carrying a `charset' property.
      (mapatoms
       (lambda (sym)
         (let ((name (symbol-name sym))
               charset)
           (when (and (boundp sym)
                      (string-match
                       "\\`cp\\([1-9][0-9][0-9][0-9]?\\)-decode-table\\'"
                       name)
                      (vectorp (symbol-value sym))
                      (setq charset (get sym 'charset)))
             (push (cons (match-string 1 name) charset) result)))))
      result)))
670 | ||
671 | ;;;###autoload | |
(defun codepage-setup (codepage)
  "Create a coding system cpCODEPAGE to support the IBM codepage CODEPAGE.

These coding systems are meant for encoding and decoding 8-bit non-ASCII
characters used by the IBM codepages, typically in conjunction with files
read/written by MS-DOS software, or for display on the MS-DOS terminal.

If cpCODEPAGE already names a coding system, nothing new is created.
Interactively, prompt for CODEPAGE with completion over the supported
codepages."
  (interactive
   (let ((completion-ignore-case t)
         (choices (cp-supported-codepages)))
     (list (completing-read "Setup DOS Codepage (default 437): " choices
                            nil t nil nil "437"))))
  (let* ((cp-name (format "cp%s" codepage))
         (existing (intern-soft cp-name)))
    (cond
     ;; Already defined as a coding system: do not define it again.
     ((and existing (coding-system-p existing)))
     (t
      (cp-make-coding-systems-for-codepage
       cp-name
       (cp-charset-for-codepage cp-name)
       (cp-offset-for-codepage cp-name))))))
75e98450 | 689 | |
2c3245ae DL |
690 | ;; Add DOS codepages to `non-iso-charset-alist'. |
(eval-after-load "mule-diag"
  ;; Once mule-diag is loaded, register every supported codepage in
  ;; `non-iso-charset-alist' unless an entry for it already exists.
  '(dolist (elt (cp-supported-codepages))
     ;; ELT is (CODEPAGE . CHARSET), where CODEPAGE is a string
     ;; (e.g. "850"), CHARSET is a charset that characters in CODEPAGE
     ;; are mapped to.
     (let ((cp-sym (intern (concat "cp" (car elt)))))
       (unless (assq cp-sym non-iso-charset-alist)
         (setq non-iso-charset-alist
               (cons (list cp-sym
                           (list 'ascii (cdr elt))
                           ;; `string-to-number' replaces the obsolete
                           ;; `string-to-int'; the codepage number is
                           ;; spliced into the lambda as a constant.
                           `(lambda (code)
                              (decode-codepage-char
                               ,(string-to-number (car elt))
                               code))
                           (list (list 0 255)))
                     non-iso-charset-alist))))))
708 | ||
75e98450 EZ |
709 | (provide 'codepage) |
710 | ||
ab5796a9 | 711 | ;;; arch-tag: 80328de8-b94e-4386-be26-5876105731f0 |
55535639 | 712 | ;;; codepage.el ends here |