Commit | Line | Data |
---|---|---|
64b4e1f1 | 1 | ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*- |
4ed46869 | 2 | |
eaa61218 KH |
3 | ;; Copyright (C) 1995, 2001, 2003 |
4 | ;; Free Software Foundation, Inc. | |
5 | ;; Copyright (C) 1995, 1997, 2003 | |
6 | ;; National Institute of Advanced Industrial Science and Technology (AIST) | |
7 | ;; Registration Number H14PRO021 | |
4ed46869 KH |
8 | |
9 | ;; Keywords: mule, multilingual, Chinese | |
10 | ||
11 | ;; This file is part of GNU Emacs. | |
12 | ||
13 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
14 | ;; it under the terms of the GNU General Public License as published by | |
15 | ;; the Free Software Foundation; either version 2, or (at your option) | |
16 | ;; any later version. | |
17 | ||
18 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
19 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | ;; GNU General Public License for more details. | |
22 | ||
23 | ;; You should have received a copy of the GNU General Public License | |
369314dc KH |
24 | ;; along with GNU Emacs; see the file COPYING. If not, write to the |
25 | ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
26 | ;; Boston, MA 02111-1307, USA. | |
4ed46869 | 27 | |
60370d40 PJ |
28 | ;;; Commentary: |
29 | ||
4ed46869 KH |
30 | ;;; Code: |
31 | ||
64b4e1f1 | 32 | ;; Hz/ZW/EUC-TW encoding stuff |
4ed46869 KH |
33 | |
34 | ;; HZ is an encoding method for Chinese character set GB2312 used | |
35 | ;; widely in Internet. It is very similar to 7-bit environment of | |
36 | ;; ISO-2022. The difference is that HZ uses the sequence "~{" and | |
37 | ;; "~}" for designating GB2312 and ASCII respectively, hence, it | |
38 | ;; doesn't uses ESC (0x1B) code. | |
39 | ||
40 | ;; ZW is another encoding method for Chinese character set GB2312. It | |
41 | ;; encodes Chinese characters line by line by starting each line with | |
42 | ;; the sequence "zW". It also uses only 7-bit as HZ. | |
43 | ||
64b4e1f1 WL |
44 | ;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is |
45 | ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with | |
46 | ;; a single shift escape followed by three bytes: the first gives the | |
47 | ;; plane, the second and third the character code. Note that characters | |
48 | ;; of plane 1 are (redundantly) accessible with a single shift escape | |
49 | ;; also. | |
50 | ||
4ed46869 KH |
51 | ;; ISO-2022 escape sequence to designate GB2312. |
52 | (defvar iso2022-gb-designation "\e$A") | |
53 | ;; HZ escape sequence to designate GB2312. | |
54 | (defvar hz-gb-designnation "~{") | |
55 | ;; ISO-2022 escape sequence to designate ASCII. | |
56 | (defvar iso2022-ascii-designation "\e(B") | |
57 | ;; HZ escape sequence to designate ASCII. | |
58 | (defvar hz-ascii-designnation "~}") | |
59 | ;; Regexp of ZW sequence to start GB2312. | |
60 | (defvar zw-start-gb "^zW") | |
61 | ;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW. | |
8f3969f8 KH |
62 | (defvar hz/zw-start-gb |
63 | (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]")) | |
4ed46869 KH |
64 | |
65 | (defvar decode-hz-line-continuation nil | |
66 | "Flag to tell if we should care line continuation convention of Hz.") | |
67 | ||
8f3969f8 | 68 | (defconst hz-set-msb-table |
2254377e SM |
69 | (eval-when-compile |
70 | (let ((chars nil) | |
71 | (i 0)) | |
72 | (while (< i 33) | |
73 | (push i chars) | |
74 | (setq i (1+ i))) | |
75 | (while (< i 127) | |
76 | (push (+ i 128) chars) | |
77 | (setq i (1+ i))) | |
78 | (apply 'string (nreverse chars))))) | |
8f3969f8 | 79 | |
4ed46869 KH |
80 | ;;;###autoload |
81 | (defun decode-hz-region (beg end) | |
82 | "Decode HZ/ZW encoded text in the current region. | |
83 | Return the length of resulting text." | |
84 | (interactive "r") | |
85 | (save-excursion | |
86 | (save-restriction | |
8f3969f8 KH |
87 | (let (pos ch) |
88 | (narrow-to-region beg end) | |
89 | ||
90 | ;; We, at first, convert HZ/ZW to `euc-china', | |
91 | ;; then decode it. | |
92 | ||
93 | ;; "~\n" -> "\n", "~~" -> "~" | |
94 | (goto-char (point-min)) | |
95 | (while (search-forward "~" nil t) | |
96 | (setq ch (following-char)) | |
97 | (if (or (= ch ?\n) (= ch ?~)) (delete-char -1))) | |
98 | ||
99 | ;; "^zW...\n" -> Chinese GB2312 | |
100 | ;; "~{...~}" -> Chinese GB2312 | |
101 | (goto-char (point-min)) | |
102 | (setq beg nil) | |
4ed46869 | 103 | (while (re-search-forward hz/zw-start-gb nil t) |
8f3969f8 KH |
104 | (setq pos (match-beginning 0) |
105 | ch (char-after pos)) | |
106 | ;; Record the first position to start conversion. | |
107 | (or beg (setq beg pos)) | |
108 | (end-of-line) | |
109 | (setq end (point)) | |
110 | (if (>= ch 128) ; 8bit GB2312 | |
111 | nil | |
112 | (goto-char pos) | |
113 | (delete-char 2) | |
114 | (setq end (- end 2)) | |
115 | (if (= ch ?z) ; ZW -> euc-china | |
116 | (progn | |
117 | (translate-region (point) end hz-set-msb-table) | |
118 | (goto-char end)) | |
119 | (if (search-forward hz-ascii-designnation | |
120 | (if decode-hz-line-continuation nil end) | |
121 | t) | |
122 | (delete-char -2)) | |
123 | (setq end (point)) | |
124 | (translate-region pos (point) hz-set-msb-table)))) | |
125 | (if beg | |
126 | (decode-coding-region beg end 'euc-china))) | |
4ed46869 KH |
127 | (- (point-max) (point-min))))) |
128 | ||
129 | ;;;###autoload | |
130 | (defun decode-hz-buffer () | |
131 | "Decode HZ/ZW encoded text in the current buffer." | |
132 | (interactive) | |
133 | (decode-hz-region (point-min) (point-max))) | |
134 | ||
135 | ;;;###autoload | |
136 | (defun encode-hz-region (beg end) | |
137 | "Encode the text in the current region to HZ. | |
138 | Return the length of resulting text." | |
139 | (interactive "r") | |
140 | (save-excursion | |
141 | (save-restriction | |
142 | (narrow-to-region beg end) | |
143 | ||
144 | ;; "~" -> "~~" | |
145 | (goto-char (point-min)) | |
146 | (while (search-forward "~" nil t) (insert ?~)) | |
147 | ||
148 | ;; Chinese GB2312 -> "~{...~}" | |
149 | (goto-char (point-min)) | |
150 | (if (re-search-forward "\\cc" nil t) | |
39e0da62 | 151 | (let (pos) |
4ed46869 | 152 | (goto-char (setq pos (match-beginning 0))) |
5dd921df | 153 | (encode-coding-region pos (point-max) 'iso-2022-7bit) |
4ed46869 KH |
154 | (goto-char pos) |
155 | (while (search-forward iso2022-gb-designation nil t) | |
156 | (delete-char -3) | |
157 | (insert hz-gb-designnation)) | |
158 | (goto-char pos) | |
159 | (while (search-forward iso2022-ascii-designation nil t) | |
160 | (delete-char -3) | |
161 | (insert hz-ascii-designnation)))) | |
162 | (- (point-max) (point-min))))) | |
163 | ||
164 | ;;;###autoload | |
165 | (defun encode-hz-buffer () | |
166 | "Encode the text in the current buffer to HZ." | |
167 | (interactive) | |
168 | (encode-hz-region (point-min) (point-max))) | |
169 | ||
64b4e1f1 WL |
170 | ;; The following sets up a translation table (big5-to-cns) from Big 5 |
171 | ;; to CNS encoding, using some auxiliary functions to make the code | |
172 | ;; more readable. | |
173 | ||
174 | ;; Many kudos to Himi! The used code has been adapted from his | |
175 | ;; mule-ucs package. | |
176 | ||
2254377e | 177 | (eval-when-compile |
64b4e1f1 WL |
178 | (defun big5-to-flat-code (num) |
179 | "Convert NUM in Big 5 encoding to a `flat code'. | |
180 | 0xA140 will be mapped to position 0, 0xA141 to position 1, etc. | |
181 | There are no gaps in the flat code." | |
182 | ||
183 | (let ((hi (/ num 256)) | |
184 | (lo (% num 256))) | |
185 | (+ (* 157 (- hi #xa1)) | |
186 | (- lo (if (>= lo #xa1) 98 64))))) | |
187 | ||
188 | (defun flat-code-to-big5 (num) | |
189 | "Convert NUM from a `flat code' to Big 5 encoding. | |
190 | This is the inverse function of `big5-to-flat-code'." | |
191 | ||
192 | (let ((hi (/ num 157)) | |
193 | (lo (% num 157))) | |
194 | (+ (* 256 (+ hi #xa1)) | |
195 | (+ lo (if (< lo 63) 64 98))))) | |
196 | ||
197 | (defun euc-to-flat-code (num) | |
198 | "Convert NUM in EUC encoding (in GL representation) to a `flat code'. | |
199 | 0x2121 will be mapped to position 0, 0x2122 to position 1, etc. | |
200 | There are no gaps in the flat code." | |
201 | ||
202 | (let ((hi (/ num 256)) | |
203 | (lo (% num 256))) | |
204 | (+ (* 94 (- hi #x21)) | |
205 | (- lo #x21)))) | |
206 | ||
207 | (defun flat-code-to-euc (num) | |
208 | "Convert NUM from a `flat code' to EUC encoding (in GL representation). | |
209 | The inverse function of `euc-to-flat-code'. The high and low bytes are | |
210 | returned in a list." | |
211 | ||
212 | (let ((hi (/ num 94)) | |
213 | (lo (% num 94))) | |
214 | (list (+ hi #x21) (+ lo #x21)))) | |
215 | ||
216 | (defun expand-euc-big5-alist (alist) | |
217 | "Create a translation table and fills it with data given in ALIST. | |
218 | Elements of ALIST can be either given as | |
219 | ||
220 | ((euc-charset . startchar) . (big5-range-begin . big5-range-end)) | |
221 | ||
222 | or as | |
223 | ||
224 | (euc-character . big5-charcode) | |
225 | ||
226 | The former maps a range of glyphs in an EUC charset (where STARTCHAR | |
227 | is in GL representation) to a certain range of Big 5 encoded | |
228 | characters, the latter maps a single glyph. Glyphs which can't be | |
229 | mapped will be represented with the byte 0xFF. | |
230 | ||
231 | The return value is the filled translation table." | |
232 | ||
2254377e | 233 | (let ((chartable (make-char-table 'translation-table #xFF)) |
64b4e1f1 WL |
234 | char |
235 | big5 | |
236 | i | |
237 | end | |
238 | codepoint | |
239 | charset) | |
2254377e SM |
240 | (dolist (elem alist) |
241 | (setq char (car elem) | |
242 | big5 (cdr elem)) | |
64b4e1f1 WL |
243 | (cond ((and (consp char) |
244 | (consp big5)) | |
2254377e SM |
245 | (setq i (big5-to-flat-code (car big5)) |
246 | end (big5-to-flat-code (cdr big5)) | |
247 | codepoint (euc-to-flat-code (cdr char)) | |
248 | charset (car char)) | |
249 | (while (>= end i) | |
250 | (aset chartable | |
251 | (decode-big5-char (flat-code-to-big5 i)) | |
252 | (apply (function make-char) | |
253 | charset | |
254 | (flat-code-to-euc codepoint))) | |
255 | (setq i (1+ i) | |
256 | codepoint (1+ codepoint)))) | |
64b4e1f1 WL |
257 | ((and (char-valid-p char) |
258 | (numberp big5)) | |
2254377e SM |
259 | (setq i (decode-big5-char big5)) |
260 | (aset chartable i char)) | |
64b4e1f1 | 261 | (t |
2254377e | 262 | (error "Unknown slot type: %S" elem)))) |
64b4e1f1 | 263 | ;; the return value |
2254377e | 264 | chartable))) |
64b4e1f1 WL |
265 | |
266 | ;; All non-CNS encodings are commented out. | |
267 | ||
268 | (define-translation-table 'big5-to-cns | |
2254377e | 269 | (eval-when-compile |
64b4e1f1 WL |
270 | (expand-euc-big5-alist |
271 | '( | |
272 | ;; Symbols | |
273 | ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5)) | |
274 | (?\e$(G"X\e(B . #xA1F6) | |
275 | (?\e$(G"W\e(B . #xA1F7) | |
276 | ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE)) | |
277 | ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF)) | |
278 | ;; Control codes (vendor dependent) | |
279 | ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0)) | |
280 | ;; Level 1 Ideographs | |
281 | ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD)) | |
282 | (?\e$(GWS\e(B . #xACFE) | |
283 | ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF)) | |
284 | ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7)) | |
285 | ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51)) | |
286 | (?\e$(GkP\e(B . #xBE52) | |
287 | ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA)) | |
288 | ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA)) | |
289 | (?\e$(Gu5\e(B . #xC2CB) | |
290 | ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360)) | |
291 | ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8)) | |
292 | (?\e$(Gxe\e(B . #xC3B9) | |
293 | (?\e$(Gxd\e(B . #xC3BA) | |
294 | ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455)) | |
295 | (?\e$(Gx-\e(B . #xC456) | |
296 | ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E)) | |
297 | ;; Symbols | |
298 | ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE)) | |
299 | ;; Radicals | |
300 | (?\e$(G'#\e(B . #xC6BF) | |
301 | (?\e$(G'$\e(B . #xC6C0) | |
302 | (?\e$(G'&\e(B . #xC6C1) | |
303 | (?\e$(G'(\e(B . #xC6C2) | |
304 | (?\e$(G'-\e(B . #xC6C3) | |
305 | (?\e$(G'.\e(B . #xC6C4) | |
306 | (?\e$(G'/\e(B . #xC6C5) | |
307 | (?\e$(G'4\e(B . #xC6C6) | |
308 | (?\e$(G'7\e(B . #xC6C7) | |
309 | (?\e$(G':\e(B . #xC6C8) | |
310 | (?\e$(G'<\e(B . #xC6C9) | |
311 | (?\e$(G'B\e(B . #xC6CA) | |
312 | (?\e$(G'G\e(B . #xC6CB) | |
313 | (?\e$(G'N\e(B . #xC6CC) | |
314 | (?\e$(G'S\e(B . #xC6CD) | |
315 | (?\e$(G'T\e(B . #xC6CE) | |
316 | (?\e$(G'U\e(B . #xC6CF) | |
317 | (?\e$(G'Y\e(B . #xC6D0) | |
318 | (?\e$(G'Z\e(B . #xC6D1) | |
319 | (?\e$(G'a\e(B . #xC6D2) | |
320 | (?\e$(G'f\e(B . #xC6D3) | |
321 | (?\e$(G()\e(B . #xC6D4) | |
322 | (?\e$(G(*\e(B . #xC6D5) | |
323 | (?\e$(G(c\e(B . #xC6D6) | |
324 | (?\e$(G(l\e(B . #xC6D7) | |
325 | ;; Diacritical Marks | |
326 | ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9)) | |
327 | ;; Japanese Kana Supplement | |
328 | ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3)) | |
329 | ;; Japanese Hiragana | |
330 | ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A)) | |
331 | ;; Japanese Katakana | |
332 | ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2)) | |
333 | ;; Cyrillic Characters | |
334 | ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854)) | |
335 | ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875)) | |
336 | ;; Special Chinese Characters | |
337 | (?\e$(J!#\e(B . #xC879) | |
338 | (?\e$(J!$\e(B . #xC87B) | |
339 | (?\e$(J!*\e(B . #xC87D) | |
340 | (?\e$(J!R\e(B . #xC8A2) | |
341 | ||
342 | ;; JIS X 0208 NOT SIGN (cf. U+00AC) | |
343 | ; (?\e$B"L\e(B . #xC8CD) | |
344 | ;; JIS X 0212 BROKEN BAR (cf. U+00A6) | |
345 | ; (?\e$(D"C\e(B . #xC8CE) | |
346 | ||
347 | ;; GB 2312 characters | |
348 | ; (?\e$A!d\e(B . #xC8CF) | |
349 | ; (?\e$A!e\e(B . #xC8D0) | |
350 | ;;;;; C8D1 - Japanese `(\e$B3t\e(B)' | |
351 | ; (?\e$A!m\e(B . #xC8D2) | |
352 | ;;;;; C8D2 - Tel. | |
353 | ||
354 | ;; Level 2 Ideographs | |
355 | ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949)) | |
356 | (?\e$(GDB\e(B . #xC94A);; a duplicate of #xA461 | |
357 | ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B)) | |
358 | ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD)) | |
359 | (?\e$(H!L\e(B . #xC9BE) | |
360 | ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC)) | |
361 | ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6)) | |
362 | (?\e$(H"M\e(B . #xCAF7) | |
363 | ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB)) | |
364 | (?\e$(H>c\e(B . #xD6CC) | |
365 | ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779)) | |
366 | (?\e$(H?j\e(B . #xD77A) | |
367 | ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE)) | |
368 | (?\e$(H7o\e(B . #xDADF) | |
369 | ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6)) | |
370 | ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB)) | |
371 | (?\e$(HAv\e(B . #xDDFC);; a duplicate of #xDCD1 | |
372 | ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2)) | |
373 | ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975)) | |
374 | ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A)) | |
375 | ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0)) | |
376 | (?\e$(HUK\e(B . #xEBF1) | |
377 | ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD)) | |
378 | (?\e$(HW"\e(B . #xECDE) | |
379 | ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9)) | |
380 | ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA)) | |
381 | (?\e$(Hd/\e(B . #xEEEB) | |
382 | ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055)) | |
383 | (?\e$(H]t\e(B . #xF056) | |
384 | ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA)) | |
385 | (?\e$(HZ(\e(B . #xF0CB) | |
386 | ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162)) | |
387 | ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A)) | |
388 | (?\e$(Hga\e(B . #xF16B) | |
389 | ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267)) | |
390 | (?\e$(Hi4\e(B . #xF268) | |
391 | ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2)) | |
392 | ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374)) | |
393 | ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465)) | |
394 | ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4)) | |
395 | (?\e$(HfM\e(B . #xF4B5) | |
396 | ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC)) | |
397 | ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662)) | |
398 | (?\e$(HjK\e(B . #xF663) | |
399 | ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976)) | |
400 | ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3)) | |
401 | (?\e$(Hqf\e(B . #xF9C4) | |
402 | (?\e$(Hr4\e(B . #xF9C5) | |
403 | (?\e$(Hr@\e(B . #xF9C6) | |
404 | ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1)) | |
405 | ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5)) | |
406 | ||
407 | ;; Additional Ideographs | |
408 | (?\e$(IC7\e(B . #xF9D6) | |
409 | (?\e$(IOP\e(B . #xF9D7) | |
410 | (?\e$(IDN\e(B . #xF9D8) | |
411 | (?\e$(IPJ\e(B . #xF9D9) | |
412 | (?\e$(I,]\e(B . #xF9DA) | |
413 | (?\e$(I=~\e(B . #xF9DB) | |
414 | (?\e$(IK\\e(B . #xF9DC) | |
415 | ) | |
2254377e | 416 | )) |
64b4e1f1 WL |
417 | ) |
418 | ||
4ed46869 | 419 | ;; |
650e8505 | 420 | (provide 'china-util) |
4ed46869 | 421 | |
ab5796a9 | 422 | ;;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836 |
4ed46869 | 423 | ;;; china-util.el ends here |