Merge from emacs--rel--22
[bpt/emacs.git] / lisp / language / china-util.el
1 ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*-
2
3 ;; Copyright (C) 1995, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4 ;; Free Software Foundation, Inc.
5 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 ;; 2005, 2006, 2007, 2008
7 ;; National Institute of Advanced Industrial Science and Technology (AIST)
8 ;; Registration Number H14PRO021
9
10 ;; Keywords: mule, multilingual, Chinese
11
12 ;; This file is part of GNU Emacs.
13
14 ;; GNU Emacs is free software; you can redistribute it and/or modify
15 ;; it under the terms of the GNU General Public License as published by
16 ;; the Free Software Foundation; either version 3, or (at your option)
17 ;; any later version.
18
19 ;; GNU Emacs is distributed in the hope that it will be useful,
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;; GNU General Public License for more details.
23
24 ;; You should have received a copy of the GNU General Public License
25 ;; along with GNU Emacs; see the file COPYING. If not, write to the
26 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
27 ;; Boston, MA 02110-1301, USA.
28
29 ;;; Commentary:
30
31 ;;; Code:
32
33 ;; Hz/ZW/EUC-TW encoding stuff
34
35 ;; HZ is an encoding method for Chinese character set GB2312 used
36 ;; widely in Internet. It is very similar to 7-bit environment of
37 ;; ISO-2022. The difference is that HZ uses the sequence "~{" and
38 ;; "~}" for designating GB2312 and ASCII respectively, hence, it
39 ;; doesn't uses ESC (0x1B) code.
40
41 ;; ZW is another encoding method for Chinese character set GB2312. It
42 ;; encodes Chinese characters line by line by starting each line with
43 ;; the sequence "zW". It also uses only 7-bit as HZ.
44
45 ;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is
46 ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
47 ;; a single shift escape followed by three bytes: the first gives the
48 ;; plane, the second and third the character code. Note that characters
49 ;; of plane 1 are (redundantly) accessible with a single shift escape
50 ;; also.
51
52 ;; ISO-2022 escape sequence to designate GB2312.
53 (defvar iso2022-gb-designation "\e$A")
54 ;; HZ escape sequence to designate GB2312.
55 (defvar hz-gb-designnation "~{")
56 ;; ISO-2022 escape sequence to designate ASCII.
57 (defvar iso2022-ascii-designation "\e(B")
58 ;; HZ escape sequence to designate ASCII.
59 (defvar hz-ascii-designnation "~}")
60 ;; Regexp of ZW sequence to start GB2312.
61 (defvar zw-start-gb "^zW")
62 ;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
63 (defvar hz/zw-start-gb
64 (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]"))
65
66 (defvar decode-hz-line-continuation nil
67 "Flag to tell if we should care line continuation convention of Hz.")
68
69 (defconst hz-set-msb-table
70 (eval-when-compile
71 (let ((chars nil)
72 (i 0))
73 (while (< i 33)
74 (push i chars)
75 (setq i (1+ i)))
76 (while (< i 127)
77 (push (+ i 128) chars)
78 (setq i (1+ i)))
79 (apply 'string (nreverse chars)))))
80
81 ;;;###autoload
82 (defun decode-hz-region (beg end)
83 "Decode HZ/ZW encoded text in the current region.
84 Return the length of resulting text."
85 (interactive "r")
86 (save-excursion
87 (save-restriction
88 (let (pos ch)
89 (narrow-to-region beg end)
90
91 ;; We, at first, convert HZ/ZW to `euc-china',
92 ;; then decode it.
93
94 ;; "~\n" -> "\n", "~~" -> "~"
95 (goto-char (point-min))
96 (while (search-forward "~" nil t)
97 (setq ch (following-char))
98 (if (or (= ch ?\n) (= ch ?~)) (delete-char -1)))
99
100 ;; "^zW...\n" -> Chinese GB2312
101 ;; "~{...~}" -> Chinese GB2312
102 (goto-char (point-min))
103 (setq beg nil)
104 (while (re-search-forward hz/zw-start-gb nil t)
105 (setq pos (match-beginning 0)
106 ch (char-after pos))
107 ;; Record the first position to start conversion.
108 (or beg (setq beg pos))
109 (end-of-line)
110 (setq end (point))
111 (if (>= ch 128) ; 8bit GB2312
112 nil
113 (goto-char pos)
114 (delete-char 2)
115 (setq end (- end 2))
116 (if (= ch ?z) ; ZW -> euc-china
117 (progn
118 (translate-region (point) end hz-set-msb-table)
119 (goto-char end))
120 (if (search-forward hz-ascii-designnation
121 (if decode-hz-line-continuation nil end)
122 t)
123 (delete-char -2))
124 (setq end (point))
125 (translate-region pos (point) hz-set-msb-table))))
126 (if beg
127 (decode-coding-region beg end 'euc-china)))
128 (- (point-max) (point-min)))))
129
130 ;;;###autoload
131 (defun decode-hz-buffer ()
132 "Decode HZ/ZW encoded text in the current buffer."
133 (interactive)
134 (decode-hz-region (point-min) (point-max)))
135
136 ;;;###autoload
137 (defun encode-hz-region (beg end)
138 "Encode the text in the current region to HZ.
139 Return the length of resulting text."
140 (interactive "r")
141 (save-excursion
142 (save-restriction
143 (narrow-to-region beg end)
144
145 ;; "~" -> "~~"
146 (goto-char (point-min))
147 (while (search-forward "~" nil t) (insert ?~))
148
149 ;; Chinese GB2312 -> "~{...~}"
150 (goto-char (point-min))
151 (if (re-search-forward "\\cc" nil t)
152 (let (pos)
153 (goto-char (setq pos (match-beginning 0)))
154 (encode-coding-region pos (point-max) 'iso-2022-7bit)
155 (goto-char pos)
156 (while (search-forward iso2022-gb-designation nil t)
157 (delete-char -3)
158 (insert hz-gb-designnation))
159 (goto-char pos)
160 (while (search-forward iso2022-ascii-designation nil t)
161 (delete-char -3)
162 (insert hz-ascii-designnation))))
163 (- (point-max) (point-min)))))
164
165 ;;;###autoload
166 (defun encode-hz-buffer ()
167 "Encode the text in the current buffer to HZ."
168 (interactive)
169 (encode-hz-region (point-min) (point-max)))
170
171 ;; The following sets up a translation table (big5-to-cns) from Big 5
172 ;; to CNS encoding, using some auxiliary functions to make the code
173 ;; more readable.
174
175 ;; Many kudos to Himi! The used code has been adapted from his
176 ;; mule-ucs package.
177
178 ;; Silence the compiler, which otherwise warns that these functions
179 ;; might not be defined at runtime. They are only used when compiling.
180 (declare-function big5-to-flat-code "china-util" (num))
181 (declare-function flat-code-to-big5 "china-util" (num))
182 (declare-function euc-to-flat-code "china-util" (num))
183 (declare-function flat-code-to-euc "china-util" (num))
184
185 (eval-when-compile
186 (defun big5-to-flat-code (num)
187 "Convert NUM in Big 5 encoding to a `flat code'.
188 0xA140 will be mapped to position 0, 0xA141 to position 1, etc.
189 There are no gaps in the flat code."
190
191 (let ((hi (/ num 256))
192 (lo (% num 256)))
193 (+ (* 157 (- hi #xa1))
194 (- lo (if (>= lo #xa1) 98 64)))))
195
196 (defun flat-code-to-big5 (num)
197 "Convert NUM from a `flat code' to Big 5 encoding.
198 This is the inverse function of `big5-to-flat-code'."
199
200 (let ((hi (/ num 157))
201 (lo (% num 157)))
202 (+ (* 256 (+ hi #xa1))
203 (+ lo (if (< lo 63) 64 98)))))
204
205 (defun euc-to-flat-code (num)
206 "Convert NUM in EUC encoding (in GL representation) to a `flat code'.
207 0x2121 will be mapped to position 0, 0x2122 to position 1, etc.
208 There are no gaps in the flat code."
209
210 (let ((hi (/ num 256))
211 (lo (% num 256)))
212 (+ (* 94 (- hi #x21))
213 (- lo #x21))))
214
215 (defun flat-code-to-euc (num)
216 "Convert NUM from a `flat code' to EUC encoding (in GL representation).
217 The inverse function of `euc-to-flat-code'. The high and low bytes are
218 returned in a list."
219
220 (let ((hi (/ num 94))
221 (lo (% num 94)))
222 (list (+ hi #x21) (+ lo #x21))))
223
224 (defun expand-euc-big5-alist (alist)
225 "Create a translation table and fills it with data given in ALIST.
226 Elements of ALIST can be either given as
227
228 ((euc-charset . startchar) . (big5-range-begin . big5-range-end))
229
230 or as
231
232 (euc-character . big5-charcode)
233
234 The former maps a range of glyphs in an EUC charset (where STARTCHAR
235 is in GL representation) to a certain range of Big 5 encoded
236 characters, the latter maps a single glyph. Glyphs which can't be
237 mapped will be represented with the byte 0xFF.
238
239 The return value is the filled translation table."
240
241 (let ((chartable (make-char-table 'translation-table #xFF))
242 char
243 big5
244 i
245 end
246 codepoint
247 charset)
248 (dolist (elem alist)
249 (setq char (car elem)
250 big5 (cdr elem))
251 (cond ((and (consp char)
252 (consp big5))
253 (setq i (big5-to-flat-code (car big5))
254 end (big5-to-flat-code (cdr big5))
255 codepoint (euc-to-flat-code (cdr char))
256 charset (car char))
257 (while (>= end i)
258 (aset chartable
259 (decode-big5-char (flat-code-to-big5 i))
260 (apply (function make-char)
261 charset
262 (flat-code-to-euc codepoint)))
263 (setq i (1+ i)
264 codepoint (1+ codepoint))))
265 ((and (char-valid-p char)
266 (numberp big5))
267 (setq i (decode-big5-char big5))
268 (aset chartable i char))
269 (t
270 (error "Unknown slot type: %S" elem))))
271 ;; the return value
272 chartable)))
273
274 ;; All non-CNS encodings are commented out.
275
276 (define-translation-table 'big5-to-cns
277 (eval-when-compile
278 (expand-euc-big5-alist
279 '(
280 ;; Symbols
281 ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5))
282 (?\e$(G"X\e(B . #xA1F6)
283 (?\e$(G"W\e(B . #xA1F7)
284 ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE))
285 ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF))
286 ;; Control codes (vendor dependent)
287 ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0))
288 ;; Level 1 Ideographs
289 ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD))
290 (?\e$(GWS\e(B . #xACFE)
291 ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF))
292 ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7))
293 ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51))
294 (?\e$(GkP\e(B . #xBE52)
295 ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA))
296 ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA))
297 (?\e$(Gu5\e(B . #xC2CB)
298 ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360))
299 ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8))
300 (?\e$(Gxe\e(B . #xC3B9)
301 (?\e$(Gxd\e(B . #xC3BA)
302 ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455))
303 (?\e$(Gx-\e(B . #xC456)
304 ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E))
305 ;; Symbols
306 ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE))
307 ;; Radicals
308 (?\e$(G'#\e(B . #xC6BF)
309 (?\e$(G'$\e(B . #xC6C0)
310 (?\e$(G'&\e(B . #xC6C1)
311 (?\e$(G'(\e(B . #xC6C2)
312 (?\e$(G'-\e(B . #xC6C3)
313 (?\e$(G'.\e(B . #xC6C4)
314 (?\e$(G'/\e(B . #xC6C5)
315 (?\e$(G'4\e(B . #xC6C6)
316 (?\e$(G'7\e(B . #xC6C7)
317 (?\e$(G':\e(B . #xC6C8)
318 (?\e$(G'<\e(B . #xC6C9)
319 (?\e$(G'B\e(B . #xC6CA)
320 (?\e$(G'G\e(B . #xC6CB)
321 (?\e$(G'N\e(B . #xC6CC)
322 (?\e$(G'S\e(B . #xC6CD)
323 (?\e$(G'T\e(B . #xC6CE)
324 (?\e$(G'U\e(B . #xC6CF)
325 (?\e$(G'Y\e(B . #xC6D0)
326 (?\e$(G'Z\e(B . #xC6D1)
327 (?\e$(G'a\e(B . #xC6D2)
328 (?\e$(G'f\e(B . #xC6D3)
329 (?\e$(G()\e(B . #xC6D4)
330 (?\e$(G(*\e(B . #xC6D5)
331 (?\e$(G(c\e(B . #xC6D6)
332 (?\e$(G(l\e(B . #xC6D7)
333 ;; Diacritical Marks
334 ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9))
335 ;; Japanese Kana Supplement
336 ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3))
337 ;; Japanese Hiragana
338 ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A))
339 ;; Japanese Katakana
340 ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2))
341 ;; Cyrillic Characters
342 ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854))
343 ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875))
344 ;; Special Chinese Characters
345 (?\e$(J!#\e(B . #xC879)
346 (?\e$(J!$\e(B . #xC87B)
347 (?\e$(J!*\e(B . #xC87D)
348 (?\e$(J!R\e(B . #xC8A2)
349
350 ;; JIS X 0208 NOT SIGN (cf. U+00AC)
351 ; (?\e$B"L\e(B . #xC8CD)
352 ;; JIS X 0212 BROKEN BAR (cf. U+00A6)
353 ; (?\e$(D"C\e(B . #xC8CE)
354
355 ;; GB 2312 characters
356 ; (?\e$A!d\e(B . #xC8CF)
357 ; (?\e$A!e\e(B . #xC8D0)
358 ;;;;; C8D1 - Japanese `(\e$B3t\e(B)'
359 ; (?\e$A!m\e(B . #xC8D2)
360 ;;;;; C8D2 - Tel.
361
362 ;; Level 2 Ideographs
363 ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949))
364 (?\e$(GDB\e(B . #xC94A);; a duplicate of #xA461
365 ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B))
366 ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD))
367 (?\e$(H!L\e(B . #xC9BE)
368 ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC))
369 ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6))
370 (?\e$(H"M\e(B . #xCAF7)
371 ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB))
372 (?\e$(H>c\e(B . #xD6CC)
373 ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779))
374 (?\e$(H?j\e(B . #xD77A)
375 ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE))
376 (?\e$(H7o\e(B . #xDADF)
377 ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6))
378 ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB))
379 (?\e$(HAv\e(B . #xDDFC);; a duplicate of #xDCD1
380 ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2))
381 ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975))
382 ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A))
383 ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0))
384 (?\e$(HUK\e(B . #xEBF1)
385 ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD))
386 (?\e$(HW"\e(B . #xECDE)
387 ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9))
388 ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA))
389 (?\e$(Hd/\e(B . #xEEEB)
390 ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055))
391 (?\e$(H]t\e(B . #xF056)
392 ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA))
393 (?\e$(HZ(\e(B . #xF0CB)
394 ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162))
395 ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A))
396 (?\e$(Hga\e(B . #xF16B)
397 ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267))
398 (?\e$(Hi4\e(B . #xF268)
399 ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2))
400 ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374))
401 ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465))
402 ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4))
403 (?\e$(HfM\e(B . #xF4B5)
404 ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC))
405 ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662))
406 (?\e$(HjK\e(B . #xF663)
407 ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976))
408 ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3))
409 (?\e$(Hqf\e(B . #xF9C4)
410 (?\e$(Hr4\e(B . #xF9C5)
411 (?\e$(Hr@\e(B . #xF9C6)
412 ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1))
413 ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5))
414
415 ;; Additional Ideographs
416 (?\e$(IC7\e(B . #xF9D6)
417 (?\e$(IOP\e(B . #xF9D7)
418 (?\e$(IDN\e(B . #xF9D8)
419 (?\e$(IPJ\e(B . #xF9D9)
420 (?\e$(I,]\e(B . #xF9DA)
421 (?\e$(I=~\e(B . #xF9DB)
422 (?\e$(IK\\e(B . #xF9DC)
423 )
424 ))
425 )
426
427 ;;
428 (provide 'china-util)
429
430 ;;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836
431 ;;; china-util.el ends here