Commit | Line | Data |
---|---|---|
64b4e1f1 | 1 | ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*- |
4ed46869 | 2 | |
2254377e | 3 | ;; Copyright (C) 1995, 2003 Electrotechnical Laboratory, JAPAN. |
fa526c4a | 4 | ;; Licensed to the Free Software Foundation. |
64b4e1f1 | 5 | ;; Copyright (C) 1995, 2001 Free Software Foundation, Inc. |
4ed46869 KH |
6 | |
7 | ;; Keywords: mule, multilingual, Chinese | |
8 | ||
9 | ;; This file is part of GNU Emacs. | |
10 | ||
11 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
12 | ;; it under the terms of the GNU General Public License as published by | |
13 | ;; the Free Software Foundation; either version 2, or (at your option) | |
14 | ;; any later version. | |
15 | ||
16 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
17 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 | ;; GNU General Public License for more details. | |
20 | ||
21 | ;; You should have received a copy of the GNU General Public License | |
369314dc KH |
22 | ;; along with GNU Emacs; see the file COPYING. If not, write to the |
23 | ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
24 | ;; Boston, MA 02111-1307, USA. | |
4ed46869 | 25 | |
60370d40 PJ |
26 | ;;; Commentary: |
27 | ||
4ed46869 KH |
28 | ;;; Code: |
29 | ||
64b4e1f1 | 30 | ;; Hz/ZW/EUC-TW encoding stuff |
4ed46869 KH |
31 | |
32 | ;; HZ is an encoding method for Chinese character set GB2312 used | |
33 | ;; widely in Internet. It is very similar to 7-bit environment of | |
34 | ;; ISO-2022. The difference is that HZ uses the sequence "~{" and | |
35 | ;; "~}" for designating GB2312 and ASCII respectively, hence, it | |
36 | ;; doesn't uses ESC (0x1B) code. | |
37 | ||
38 | ;; ZW is another encoding method for Chinese character set GB2312. It | |
39 | ;; encodes Chinese characters line by line by starting each line with | |
40 | ;; the sequence "zW". It also uses only 7-bit as HZ. | |
41 | ||
64b4e1f1 WL |
42 | ;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is |
43 | ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with | |
44 | ;; a single shift escape followed by three bytes: the first gives the | |
45 | ;; plane, the second and third the character code. Note that characters | |
46 | ;; of plane 1 are (redundantly) accessible with a single shift escape | |
47 | ;; also. | |
48 | ||
4ed46869 KH |
49 | ;; ISO-2022 escape sequence to designate GB2312. |
50 | (defvar iso2022-gb-designation "\e$A") | |
51 | ;; HZ escape sequence to designate GB2312. | |
52 | (defvar hz-gb-designnation "~{") | |
53 | ;; ISO-2022 escape sequence to designate ASCII. | |
54 | (defvar iso2022-ascii-designation "\e(B") | |
55 | ;; HZ escape sequence to designate ASCII. | |
56 | (defvar hz-ascii-designnation "~}") | |
57 | ;; Regexp of ZW sequence to start GB2312. | |
58 | (defvar zw-start-gb "^zW") | |
59 | ;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW. | |
8f3969f8 KH |
60 | (defvar hz/zw-start-gb |
61 | (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]")) | |
4ed46869 KH |
62 | |
63 | (defvar decode-hz-line-continuation nil | |
64 | "Flag to tell if we should care line continuation convention of Hz.") | |
65 | ||
8f3969f8 | 66 | (defconst hz-set-msb-table |
2254377e SM |
67 | (eval-when-compile |
68 | (let ((chars nil) | |
69 | (i 0)) | |
70 | (while (< i 33) | |
71 | (push i chars) | |
72 | (setq i (1+ i))) | |
73 | (while (< i 127) | |
74 | (push (+ i 128) chars) | |
75 | (setq i (1+ i))) | |
76 | (apply 'string (nreverse chars))))) | |
8f3969f8 | 77 | |
4ed46869 KH |
78 | ;;;###autoload |
79 | (defun decode-hz-region (beg end) | |
80 | "Decode HZ/ZW encoded text in the current region. | |
81 | Return the length of resulting text." | |
82 | (interactive "r") | |
83 | (save-excursion | |
84 | (save-restriction | |
8f3969f8 KH |
85 | (let (pos ch) |
86 | (narrow-to-region beg end) | |
87 | ||
88 | ;; We, at first, convert HZ/ZW to `euc-china', | |
89 | ;; then decode it. | |
90 | ||
91 | ;; "~\n" -> "\n", "~~" -> "~" | |
92 | (goto-char (point-min)) | |
93 | (while (search-forward "~" nil t) | |
94 | (setq ch (following-char)) | |
95 | (if (or (= ch ?\n) (= ch ?~)) (delete-char -1))) | |
96 | ||
97 | ;; "^zW...\n" -> Chinese GB2312 | |
98 | ;; "~{...~}" -> Chinese GB2312 | |
99 | (goto-char (point-min)) | |
100 | (setq beg nil) | |
4ed46869 | 101 | (while (re-search-forward hz/zw-start-gb nil t) |
8f3969f8 KH |
102 | (setq pos (match-beginning 0) |
103 | ch (char-after pos)) | |
104 | ;; Record the first position to start conversion. | |
105 | (or beg (setq beg pos)) | |
106 | (end-of-line) | |
107 | (setq end (point)) | |
108 | (if (>= ch 128) ; 8bit GB2312 | |
109 | nil | |
110 | (goto-char pos) | |
111 | (delete-char 2) | |
112 | (setq end (- end 2)) | |
113 | (if (= ch ?z) ; ZW -> euc-china | |
114 | (progn | |
115 | (translate-region (point) end hz-set-msb-table) | |
116 | (goto-char end)) | |
117 | (if (search-forward hz-ascii-designnation | |
118 | (if decode-hz-line-continuation nil end) | |
119 | t) | |
120 | (delete-char -2)) | |
121 | (setq end (point)) | |
122 | (translate-region pos (point) hz-set-msb-table)))) | |
123 | (if beg | |
124 | (decode-coding-region beg end 'euc-china))) | |
4ed46869 KH |
125 | (- (point-max) (point-min))))) |
126 | ||
127 | ;;;###autoload | |
128 | (defun decode-hz-buffer () | |
129 | "Decode HZ/ZW encoded text in the current buffer." | |
130 | (interactive) | |
131 | (decode-hz-region (point-min) (point-max))) | |
132 | ||
133 | ;;;###autoload | |
134 | (defun encode-hz-region (beg end) | |
135 | "Encode the text in the current region to HZ. | |
136 | Return the length of resulting text." | |
137 | (interactive "r") | |
138 | (save-excursion | |
139 | (save-restriction | |
140 | (narrow-to-region beg end) | |
141 | ||
142 | ;; "~" -> "~~" | |
143 | (goto-char (point-min)) | |
144 | (while (search-forward "~" nil t) (insert ?~)) | |
145 | ||
146 | ;; Chinese GB2312 -> "~{...~}" | |
147 | (goto-char (point-min)) | |
148 | (if (re-search-forward "\\cc" nil t) | |
39e0da62 | 149 | (let (pos) |
4ed46869 | 150 | (goto-char (setq pos (match-beginning 0))) |
5dd921df | 151 | (encode-coding-region pos (point-max) 'iso-2022-7bit) |
4ed46869 KH |
152 | (goto-char pos) |
153 | (while (search-forward iso2022-gb-designation nil t) | |
154 | (delete-char -3) | |
155 | (insert hz-gb-designnation)) | |
156 | (goto-char pos) | |
157 | (while (search-forward iso2022-ascii-designation nil t) | |
158 | (delete-char -3) | |
159 | (insert hz-ascii-designnation)))) | |
160 | (- (point-max) (point-min))))) | |
161 | ||
162 | ;;;###autoload | |
163 | (defun encode-hz-buffer () | |
164 | "Encode the text in the current buffer to HZ." | |
165 | (interactive) | |
166 | (encode-hz-region (point-min) (point-max))) | |
167 | ||
64b4e1f1 WL |
168 | ;; The following sets up a translation table (big5-to-cns) from Big 5 |
169 | ;; to CNS encoding, using some auxiliary functions to make the code | |
170 | ;; more readable. | |
171 | ||
172 | ;; Many kudos to Himi! The used code has been adapted from his | |
173 | ;; mule-ucs package. | |
174 | ||
2254377e | 175 | (eval-when-compile |
64b4e1f1 WL |
176 | (defun big5-to-flat-code (num) |
177 | "Convert NUM in Big 5 encoding to a `flat code'. | |
178 | 0xA140 will be mapped to position 0, 0xA141 to position 1, etc. | |
179 | There are no gaps in the flat code." | |
180 | ||
181 | (let ((hi (/ num 256)) | |
182 | (lo (% num 256))) | |
183 | (+ (* 157 (- hi #xa1)) | |
184 | (- lo (if (>= lo #xa1) 98 64))))) | |
185 | ||
186 | (defun flat-code-to-big5 (num) | |
187 | "Convert NUM from a `flat code' to Big 5 encoding. | |
188 | This is the inverse function of `big5-to-flat-code'." | |
189 | ||
190 | (let ((hi (/ num 157)) | |
191 | (lo (% num 157))) | |
192 | (+ (* 256 (+ hi #xa1)) | |
193 | (+ lo (if (< lo 63) 64 98))))) | |
194 | ||
195 | (defun euc-to-flat-code (num) | |
196 | "Convert NUM in EUC encoding (in GL representation) to a `flat code'. | |
197 | 0x2121 will be mapped to position 0, 0x2122 to position 1, etc. | |
198 | There are no gaps in the flat code." | |
199 | ||
200 | (let ((hi (/ num 256)) | |
201 | (lo (% num 256))) | |
202 | (+ (* 94 (- hi #x21)) | |
203 | (- lo #x21)))) | |
204 | ||
205 | (defun flat-code-to-euc (num) | |
206 | "Convert NUM from a `flat code' to EUC encoding (in GL representation). | |
207 | The inverse function of `euc-to-flat-code'. The high and low bytes are | |
208 | returned in a list." | |
209 | ||
210 | (let ((hi (/ num 94)) | |
211 | (lo (% num 94))) | |
212 | (list (+ hi #x21) (+ lo #x21)))) | |
213 | ||
214 | (defun expand-euc-big5-alist (alist) | |
215 | "Create a translation table and fills it with data given in ALIST. | |
216 | Elements of ALIST can be either given as | |
217 | ||
218 | ((euc-charset . startchar) . (big5-range-begin . big5-range-end)) | |
219 | ||
220 | or as | |
221 | ||
222 | (euc-character . big5-charcode) | |
223 | ||
224 | The former maps a range of glyphs in an EUC charset (where STARTCHAR | |
225 | is in GL representation) to a certain range of Big 5 encoded | |
226 | characters, the latter maps a single glyph. Glyphs which can't be | |
227 | mapped will be represented with the byte 0xFF. | |
228 | ||
229 | The return value is the filled translation table." | |
230 | ||
2254377e | 231 | (let ((chartable (make-char-table 'translation-table #xFF)) |
64b4e1f1 WL |
232 | char |
233 | big5 | |
234 | i | |
235 | end | |
236 | codepoint | |
237 | charset) | |
2254377e SM |
238 | (dolist (elem alist) |
239 | (setq char (car elem) | |
240 | big5 (cdr elem)) | |
64b4e1f1 WL |
241 | (cond ((and (consp char) |
242 | (consp big5)) | |
2254377e SM |
243 | (setq i (big5-to-flat-code (car big5)) |
244 | end (big5-to-flat-code (cdr big5)) | |
245 | codepoint (euc-to-flat-code (cdr char)) | |
246 | charset (car char)) | |
247 | (while (>= end i) | |
248 | (aset chartable | |
249 | (decode-big5-char (flat-code-to-big5 i)) | |
250 | (apply (function make-char) | |
251 | charset | |
252 | (flat-code-to-euc codepoint))) | |
253 | (setq i (1+ i) | |
254 | codepoint (1+ codepoint)))) | |
64b4e1f1 WL |
255 | ((and (char-valid-p char) |
256 | (numberp big5)) | |
2254377e SM |
257 | (setq i (decode-big5-char big5)) |
258 | (aset chartable i char)) | |
64b4e1f1 | 259 | (t |
2254377e | 260 | (error "Unknown slot type: %S" elem)))) |
64b4e1f1 | 261 | ;; the return value |
2254377e | 262 | chartable))) |
64b4e1f1 WL |
263 | |
264 | ;; All non-CNS encodings are commented out. | |
265 | ||
266 | (define-translation-table 'big5-to-cns | |
2254377e | 267 | (eval-when-compile |
64b4e1f1 WL |
268 | (expand-euc-big5-alist |
269 | '( | |
270 | ;; Symbols | |
271 | ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5)) | |
272 | (?\e$(G"X\e(B . #xA1F6) | |
273 | (?\e$(G"W\e(B . #xA1F7) | |
274 | ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE)) | |
275 | ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF)) | |
276 | ;; Control codes (vendor dependent) | |
277 | ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0)) | |
278 | ;; Level 1 Ideographs | |
279 | ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD)) | |
280 | (?\e$(GWS\e(B . #xACFE) | |
281 | ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF)) | |
282 | ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7)) | |
283 | ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51)) | |
284 | (?\e$(GkP\e(B . #xBE52) | |
285 | ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA)) | |
286 | ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA)) | |
287 | (?\e$(Gu5\e(B . #xC2CB) | |
288 | ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360)) | |
289 | ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8)) | |
290 | (?\e$(Gxe\e(B . #xC3B9) | |
291 | (?\e$(Gxd\e(B . #xC3BA) | |
292 | ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455)) | |
293 | (?\e$(Gx-\e(B . #xC456) | |
294 | ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E)) | |
295 | ;; Symbols | |
296 | ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE)) | |
297 | ;; Radicals | |
298 | (?\e$(G'#\e(B . #xC6BF) | |
299 | (?\e$(G'$\e(B . #xC6C0) | |
300 | (?\e$(G'&\e(B . #xC6C1) | |
301 | (?\e$(G'(\e(B . #xC6C2) | |
302 | (?\e$(G'-\e(B . #xC6C3) | |
303 | (?\e$(G'.\e(B . #xC6C4) | |
304 | (?\e$(G'/\e(B . #xC6C5) | |
305 | (?\e$(G'4\e(B . #xC6C6) | |
306 | (?\e$(G'7\e(B . #xC6C7) | |
307 | (?\e$(G':\e(B . #xC6C8) | |
308 | (?\e$(G'<\e(B . #xC6C9) | |
309 | (?\e$(G'B\e(B . #xC6CA) | |
310 | (?\e$(G'G\e(B . #xC6CB) | |
311 | (?\e$(G'N\e(B . #xC6CC) | |
312 | (?\e$(G'S\e(B . #xC6CD) | |
313 | (?\e$(G'T\e(B . #xC6CE) | |
314 | (?\e$(G'U\e(B . #xC6CF) | |
315 | (?\e$(G'Y\e(B . #xC6D0) | |
316 | (?\e$(G'Z\e(B . #xC6D1) | |
317 | (?\e$(G'a\e(B . #xC6D2) | |
318 | (?\e$(G'f\e(B . #xC6D3) | |
319 | (?\e$(G()\e(B . #xC6D4) | |
320 | (?\e$(G(*\e(B . #xC6D5) | |
321 | (?\e$(G(c\e(B . #xC6D6) | |
322 | (?\e$(G(l\e(B . #xC6D7) | |
323 | ;; Diacritical Marks | |
324 | ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9)) | |
325 | ;; Japanese Kana Supplement | |
326 | ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3)) | |
327 | ;; Japanese Hiragana | |
328 | ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A)) | |
329 | ;; Japanese Katakana | |
330 | ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2)) | |
331 | ;; Cyrillic Characters | |
332 | ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854)) | |
333 | ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875)) | |
334 | ;; Special Chinese Characters | |
335 | (?\e$(J!#\e(B . #xC879) | |
336 | (?\e$(J!$\e(B . #xC87B) | |
337 | (?\e$(J!*\e(B . #xC87D) | |
338 | (?\e$(J!R\e(B . #xC8A2) | |
339 | ||
340 | ;; JIS X 0208 NOT SIGN (cf. U+00AC) | |
341 | ; (?\e$B"L\e(B . #xC8CD) | |
342 | ;; JIS X 0212 BROKEN BAR (cf. U+00A6) | |
343 | ; (?\e$(D"C\e(B . #xC8CE) | |
344 | ||
345 | ;; GB 2312 characters | |
346 | ; (?\e$A!d\e(B . #xC8CF) | |
347 | ; (?\e$A!e\e(B . #xC8D0) | |
348 | ;;;;; C8D1 - Japanese `(\e$B3t\e(B)' | |
349 | ; (?\e$A!m\e(B . #xC8D2) | |
350 | ;;;;; C8D2 - Tel. | |
351 | ||
352 | ;; Level 2 Ideographs | |
353 | ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949)) | |
354 | (?\e$(GDB\e(B . #xC94A);; a duplicate of #xA461 | |
355 | ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B)) | |
356 | ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD)) | |
357 | (?\e$(H!L\e(B . #xC9BE) | |
358 | ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC)) | |
359 | ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6)) | |
360 | (?\e$(H"M\e(B . #xCAF7) | |
361 | ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB)) | |
362 | (?\e$(H>c\e(B . #xD6CC) | |
363 | ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779)) | |
364 | (?\e$(H?j\e(B . #xD77A) | |
365 | ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE)) | |
366 | (?\e$(H7o\e(B . #xDADF) | |
367 | ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6)) | |
368 | ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB)) | |
369 | (?\e$(HAv\e(B . #xDDFC);; a duplicate of #xDCD1 | |
370 | ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2)) | |
371 | ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975)) | |
372 | ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A)) | |
373 | ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0)) | |
374 | (?\e$(HUK\e(B . #xEBF1) | |
375 | ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD)) | |
376 | (?\e$(HW"\e(B . #xECDE) | |
377 | ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9)) | |
378 | ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA)) | |
379 | (?\e$(Hd/\e(B . #xEEEB) | |
380 | ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055)) | |
381 | (?\e$(H]t\e(B . #xF056) | |
382 | ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA)) | |
383 | (?\e$(HZ(\e(B . #xF0CB) | |
384 | ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162)) | |
385 | ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A)) | |
386 | (?\e$(Hga\e(B . #xF16B) | |
387 | ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267)) | |
388 | (?\e$(Hi4\e(B . #xF268) | |
389 | ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2)) | |
390 | ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374)) | |
391 | ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465)) | |
392 | ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4)) | |
393 | (?\e$(HfM\e(B . #xF4B5) | |
394 | ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC)) | |
395 | ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662)) | |
396 | (?\e$(HjK\e(B . #xF663) | |
397 | ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976)) | |
398 | ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3)) | |
399 | (?\e$(Hqf\e(B . #xF9C4) | |
400 | (?\e$(Hr4\e(B . #xF9C5) | |
401 | (?\e$(Hr@\e(B . #xF9C6) | |
402 | ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1)) | |
403 | ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5)) | |
404 | ||
405 | ;; Additional Ideographs | |
406 | (?\e$(IC7\e(B . #xF9D6) | |
407 | (?\e$(IOP\e(B . #xF9D7) | |
408 | (?\e$(IDN\e(B . #xF9D8) | |
409 | (?\e$(IPJ\e(B . #xF9D9) | |
410 | (?\e$(I,]\e(B . #xF9DA) | |
411 | (?\e$(I=~\e(B . #xF9DB) | |
412 | (?\e$(IK\\e(B . #xF9DC) | |
413 | ) | |
2254377e | 414 | )) |
64b4e1f1 WL |
415 | ) |
416 | ||
4ed46869 | 417 | ;; |
650e8505 | 418 | (provide 'china-util) |
4ed46869 | 419 | |
ab5796a9 | 420 | ;;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836 |
4ed46869 | 421 | ;;; china-util.el ends here |