Add 2007 to copyright years.
[bpt/emacs.git] / lisp / language / china-util.el
CommitLineData
64b4e1f1 1;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*-
4ed46869 2
d7a0267c 3;; Copyright (C) 1995, 2001, 2002, 2003, 2004, 2005, 2006, 2007
eaa61218 4;; Free Software Foundation, Inc.
7976eda0 5;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
d7a0267c 6;; 2005, 2006, 2007
eaa61218
KH
7;; National Institute of Advanced Industrial Science and Technology (AIST)
8;; Registration Number H14PRO021
4ed46869
KH
9
10;; Keywords: mule, multilingual, Chinese
11
12;; This file is part of GNU Emacs.
13
14;; GNU Emacs is free software; you can redistribute it and/or modify
15;; it under the terms of the GNU General Public License as published by
16;; the Free Software Foundation; either version 2, or (at your option)
17;; any later version.
18
19;; GNU Emacs is distributed in the hope that it will be useful,
20;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22;; GNU General Public License for more details.
23
24;; You should have received a copy of the GNU General Public License
369314dc 25;; along with GNU Emacs; see the file COPYING. If not, write to the
3a35cf56
LK
26;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
27;; Boston, MA 02110-1301, USA.
4ed46869 28
60370d40
PJ
29;;; Commentary:
30
4ed46869
KH
31;;; Code:
32
64b4e1f1 33;; Hz/ZW/EUC-TW encoding stuff
4ed46869
KH
34
35;; HZ is an encoding method for Chinese character set GB2312 used
36;; widely in Internet. It is very similar to 7-bit environment of
37;; ISO-2022. The difference is that HZ uses the sequence "~{" and
38;; "~}" for designating GB2312 and ASCII respectively, hence, it
39;; doesn't uses ESC (0x1B) code.
40
41;; ZW is another encoding method for Chinese character set GB2312. It
42;; encodes Chinese characters line by line by starting each line with
43;; the sequence "zW". It also uses only 7-bit as HZ.
44
64b4e1f1
WL
45;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is
46;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
47;; a single shift escape followed by three bytes: the first gives the
48;; plane, the second and third the character code. Note that characters
49;; of plane 1 are (redundantly) accessible with a single shift escape
50;; also.
51
4ed46869
KH
52;; ISO-2022 escape sequence to designate GB2312.
53(defvar iso2022-gb-designation "\e$A")
54;; HZ escape sequence to designate GB2312.
55(defvar hz-gb-designnation "~{")
56;; ISO-2022 escape sequence to designate ASCII.
57(defvar iso2022-ascii-designation "\e(B")
58;; HZ escape sequence to designate ASCII.
59(defvar hz-ascii-designnation "~}")
60;; Regexp of ZW sequence to start GB2312.
61(defvar zw-start-gb "^zW")
62;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
8f3969f8
KH
63(defvar hz/zw-start-gb
64 (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]"))
4ed46869
KH
65
66(defvar decode-hz-line-continuation nil
67 "Flag to tell if we should care line continuation convention of Hz.")
68
8f3969f8 69(defconst hz-set-msb-table
2254377e
SM
70 (eval-when-compile
71 (let ((chars nil)
72 (i 0))
73 (while (< i 33)
74 (push i chars)
75 (setq i (1+ i)))
76 (while (< i 127)
77 (push (+ i 128) chars)
78 (setq i (1+ i)))
79 (apply 'string (nreverse chars)))))
8f3969f8 80
4ed46869
KH
81;;;###autoload
82(defun decode-hz-region (beg end)
83 "Decode HZ/ZW encoded text in the current region.
84Return the length of resulting text."
85 (interactive "r")
86 (save-excursion
87 (save-restriction
8f3969f8
KH
88 (let (pos ch)
89 (narrow-to-region beg end)
90
91 ;; We, at first, convert HZ/ZW to `euc-china',
92 ;; then decode it.
93
94 ;; "~\n" -> "\n", "~~" -> "~"
95 (goto-char (point-min))
96 (while (search-forward "~" nil t)
97 (setq ch (following-char))
98 (if (or (= ch ?\n) (= ch ?~)) (delete-char -1)))
99
100 ;; "^zW...\n" -> Chinese GB2312
101 ;; "~{...~}" -> Chinese GB2312
102 (goto-char (point-min))
103 (setq beg nil)
4ed46869 104 (while (re-search-forward hz/zw-start-gb nil t)
8f3969f8
KH
105 (setq pos (match-beginning 0)
106 ch (char-after pos))
107 ;; Record the first position to start conversion.
108 (or beg (setq beg pos))
109 (end-of-line)
110 (setq end (point))
111 (if (>= ch 128) ; 8bit GB2312
112 nil
113 (goto-char pos)
114 (delete-char 2)
115 (setq end (- end 2))
116 (if (= ch ?z) ; ZW -> euc-china
117 (progn
118 (translate-region (point) end hz-set-msb-table)
119 (goto-char end))
120 (if (search-forward hz-ascii-designnation
121 (if decode-hz-line-continuation nil end)
122 t)
123 (delete-char -2))
124 (setq end (point))
125 (translate-region pos (point) hz-set-msb-table))))
126 (if beg
127 (decode-coding-region beg end 'euc-china)))
4ed46869
KH
128 (- (point-max) (point-min)))))
129
130;;;###autoload
131(defun decode-hz-buffer ()
132 "Decode HZ/ZW encoded text in the current buffer."
133 (interactive)
134 (decode-hz-region (point-min) (point-max)))
135
136;;;###autoload
137(defun encode-hz-region (beg end)
138 "Encode the text in the current region to HZ.
139Return the length of resulting text."
140 (interactive "r")
141 (save-excursion
142 (save-restriction
143 (narrow-to-region beg end)
144
145 ;; "~" -> "~~"
146 (goto-char (point-min))
147 (while (search-forward "~" nil t) (insert ?~))
148
149 ;; Chinese GB2312 -> "~{...~}"
150 (goto-char (point-min))
151 (if (re-search-forward "\\cc" nil t)
39e0da62 152 (let (pos)
4ed46869 153 (goto-char (setq pos (match-beginning 0)))
5dd921df 154 (encode-coding-region pos (point-max) 'iso-2022-7bit)
4ed46869
KH
155 (goto-char pos)
156 (while (search-forward iso2022-gb-designation nil t)
157 (delete-char -3)
158 (insert hz-gb-designnation))
159 (goto-char pos)
160 (while (search-forward iso2022-ascii-designation nil t)
161 (delete-char -3)
162 (insert hz-ascii-designnation))))
163 (- (point-max) (point-min)))))
164
165;;;###autoload
166(defun encode-hz-buffer ()
167 "Encode the text in the current buffer to HZ."
168 (interactive)
169 (encode-hz-region (point-min) (point-max)))
170
64b4e1f1
WL
171;; The following sets up a translation table (big5-to-cns) from Big 5
172;; to CNS encoding, using some auxiliary functions to make the code
173;; more readable.
174
175;; Many kudos to Himi! The used code has been adapted from his
176;; mule-ucs package.
177
2254377e 178(eval-when-compile
64b4e1f1
WL
179(defun big5-to-flat-code (num)
180 "Convert NUM in Big 5 encoding to a `flat code'.
1810xA140 will be mapped to position 0, 0xA141 to position 1, etc.
182There are no gaps in the flat code."
183
184 (let ((hi (/ num 256))
185 (lo (% num 256)))
186 (+ (* 157 (- hi #xa1))
187 (- lo (if (>= lo #xa1) 98 64)))))
188
189(defun flat-code-to-big5 (num)
190 "Convert NUM from a `flat code' to Big 5 encoding.
191This is the inverse function of `big5-to-flat-code'."
192
193 (let ((hi (/ num 157))
194 (lo (% num 157)))
195 (+ (* 256 (+ hi #xa1))
196 (+ lo (if (< lo 63) 64 98)))))
197
198(defun euc-to-flat-code (num)
199 "Convert NUM in EUC encoding (in GL representation) to a `flat code'.
2000x2121 will be mapped to position 0, 0x2122 to position 1, etc.
201There are no gaps in the flat code."
202
203 (let ((hi (/ num 256))
204 (lo (% num 256)))
205 (+ (* 94 (- hi #x21))
206 (- lo #x21))))
207
208(defun flat-code-to-euc (num)
209 "Convert NUM from a `flat code' to EUC encoding (in GL representation).
210The inverse function of `euc-to-flat-code'. The high and low bytes are
211returned in a list."
212
213 (let ((hi (/ num 94))
214 (lo (% num 94)))
215 (list (+ hi #x21) (+ lo #x21))))
216
217(defun expand-euc-big5-alist (alist)
218 "Create a translation table and fills it with data given in ALIST.
219Elements of ALIST can be either given as
220
221 ((euc-charset . startchar) . (big5-range-begin . big5-range-end))
222
223or as
224
225 (euc-character . big5-charcode)
226
227The former maps a range of glyphs in an EUC charset (where STARTCHAR
228is in GL representation) to a certain range of Big 5 encoded
229characters, the latter maps a single glyph. Glyphs which can't be
230mapped will be represented with the byte 0xFF.
231
232The return value is the filled translation table."
233
2254377e 234 (let ((chartable (make-char-table 'translation-table #xFF))
64b4e1f1
WL
235 char
236 big5
237 i
238 end
239 codepoint
240 charset)
2254377e
SM
241 (dolist (elem alist)
242 (setq char (car elem)
243 big5 (cdr elem))
64b4e1f1
WL
244 (cond ((and (consp char)
245 (consp big5))
2254377e
SM
246 (setq i (big5-to-flat-code (car big5))
247 end (big5-to-flat-code (cdr big5))
248 codepoint (euc-to-flat-code (cdr char))
249 charset (car char))
250 (while (>= end i)
251 (aset chartable
252 (decode-big5-char (flat-code-to-big5 i))
253 (apply (function make-char)
254 charset
255 (flat-code-to-euc codepoint)))
256 (setq i (1+ i)
257 codepoint (1+ codepoint))))
64b4e1f1
WL
258 ((and (char-valid-p char)
259 (numberp big5))
2254377e
SM
260 (setq i (decode-big5-char big5))
261 (aset chartable i char))
64b4e1f1 262 (t
2254377e 263 (error "Unknown slot type: %S" elem))))
64b4e1f1 264 ;; the return value
2254377e 265 chartable)))
64b4e1f1
WL
266
267;; All non-CNS encodings are commented out.
268
269(define-translation-table 'big5-to-cns
2254377e 270 (eval-when-compile
64b4e1f1
WL
271 (expand-euc-big5-alist
272 '(
273 ;; Symbols
274 ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5))
275 (?\e$(G"X\e(B . #xA1F6)
276 (?\e$(G"W\e(B . #xA1F7)
277 ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE))
278 ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF))
279 ;; Control codes (vendor dependent)
280 ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0))
281 ;; Level 1 Ideographs
282 ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD))
283 (?\e$(GWS\e(B . #xACFE)
284 ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF))
285 ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7))
286 ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51))
287 (?\e$(GkP\e(B . #xBE52)
288 ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA))
289 ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA))
290 (?\e$(Gu5\e(B . #xC2CB)
291 ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360))
292 ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8))
293 (?\e$(Gxe\e(B . #xC3B9)
294 (?\e$(Gxd\e(B . #xC3BA)
295 ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455))
296 (?\e$(Gx-\e(B . #xC456)
297 ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E))
298 ;; Symbols
299 ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE))
300 ;; Radicals
301 (?\e$(G'#\e(B . #xC6BF)
302 (?\e$(G'$\e(B . #xC6C0)
303 (?\e$(G'&\e(B . #xC6C1)
304 (?\e$(G'(\e(B . #xC6C2)
305 (?\e$(G'-\e(B . #xC6C3)
306 (?\e$(G'.\e(B . #xC6C4)
307 (?\e$(G'/\e(B . #xC6C5)
308 (?\e$(G'4\e(B . #xC6C6)
309 (?\e$(G'7\e(B . #xC6C7)
310 (?\e$(G':\e(B . #xC6C8)
311 (?\e$(G'<\e(B . #xC6C9)
312 (?\e$(G'B\e(B . #xC6CA)
313 (?\e$(G'G\e(B . #xC6CB)
314 (?\e$(G'N\e(B . #xC6CC)
315 (?\e$(G'S\e(B . #xC6CD)
316 (?\e$(G'T\e(B . #xC6CE)
317 (?\e$(G'U\e(B . #xC6CF)
318 (?\e$(G'Y\e(B . #xC6D0)
319 (?\e$(G'Z\e(B . #xC6D1)
320 (?\e$(G'a\e(B . #xC6D2)
321 (?\e$(G'f\e(B . #xC6D3)
322 (?\e$(G()\e(B . #xC6D4)
323 (?\e$(G(*\e(B . #xC6D5)
324 (?\e$(G(c\e(B . #xC6D6)
325 (?\e$(G(l\e(B . #xC6D7)
326 ;; Diacritical Marks
327 ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9))
328 ;; Japanese Kana Supplement
329 ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3))
330 ;; Japanese Hiragana
331 ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A))
332 ;; Japanese Katakana
333 ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2))
334 ;; Cyrillic Characters
335 ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854))
336 ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875))
337 ;; Special Chinese Characters
338 (?\e$(J!#\e(B . #xC879)
339 (?\e$(J!$\e(B . #xC87B)
340 (?\e$(J!*\e(B . #xC87D)
341 (?\e$(J!R\e(B . #xC8A2)
342
343 ;; JIS X 0208 NOT SIGN (cf. U+00AC)
344 ; (?\e$B"L\e(B . #xC8CD)
345 ;; JIS X 0212 BROKEN BAR (cf. U+00A6)
346 ; (?\e$(D"C\e(B . #xC8CE)
347
348 ;; GB 2312 characters
349 ; (?\e$A!d\e(B . #xC8CF)
350 ; (?\e$A!e\e(B . #xC8D0)
351 ;;;;; C8D1 - Japanese `(\e$B3t\e(B)'
352 ; (?\e$A!m\e(B . #xC8D2)
353 ;;;;; C8D2 - Tel.
354
355 ;; Level 2 Ideographs
356 ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949))
357 (?\e$(GDB\e(B . #xC94A);; a duplicate of #xA461
358 ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B))
359 ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD))
360 (?\e$(H!L\e(B . #xC9BE)
361 ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC))
362 ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6))
363 (?\e$(H"M\e(B . #xCAF7)
364 ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB))
365 (?\e$(H>c\e(B . #xD6CC)
366 ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779))
367 (?\e$(H?j\e(B . #xD77A)
368 ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE))
369 (?\e$(H7o\e(B . #xDADF)
370 ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6))
371 ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB))
372 (?\e$(HAv\e(B . #xDDFC);; a duplicate of #xDCD1
373 ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2))
374 ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975))
375 ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A))
376 ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0))
377 (?\e$(HUK\e(B . #xEBF1)
378 ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD))
379 (?\e$(HW"\e(B . #xECDE)
380 ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9))
381 ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA))
382 (?\e$(Hd/\e(B . #xEEEB)
383 ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055))
384 (?\e$(H]t\e(B . #xF056)
385 ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA))
386 (?\e$(HZ(\e(B . #xF0CB)
387 ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162))
388 ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A))
389 (?\e$(Hga\e(B . #xF16B)
390 ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267))
391 (?\e$(Hi4\e(B . #xF268)
392 ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2))
393 ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374))
394 ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465))
395 ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4))
396 (?\e$(HfM\e(B . #xF4B5)
397 ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC))
398 ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662))
399 (?\e$(HjK\e(B . #xF663)
400 ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976))
401 ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3))
402 (?\e$(Hqf\e(B . #xF9C4)
403 (?\e$(Hr4\e(B . #xF9C5)
404 (?\e$(Hr@\e(B . #xF9C6)
405 ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1))
406 ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5))
407
408 ;; Additional Ideographs
409 (?\e$(IC7\e(B . #xF9D6)
410 (?\e$(IOP\e(B . #xF9D7)
411 (?\e$(IDN\e(B . #xF9D8)
412 (?\e$(IPJ\e(B . #xF9D9)
413 (?\e$(I,]\e(B . #xF9DA)
414 (?\e$(I=~\e(B . #xF9DB)
415 (?\e$(IK\\e(B . #xF9DC)
416 )
2254377e 417 ))
64b4e1f1
WL
418)
419
4ed46869 420;;
650e8505 421(provide 'china-util)
4ed46869 422
ab5796a9 423;;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836
4ed46869 424;;; china-util.el ends here