Fix copyrights.
[bpt/emacs.git] / lisp / language / china-util.el
CommitLineData
64b4e1f1 1;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*-
4ed46869 2
eaa61218
KH
3;; Copyright (C) 1995, 2001, 2003
4;; Free Software Foundation, Inc.
5;; Copyright (C) 1995, 1997, 2003
6;; National Institute of Advanced Industrial Science and Technology (AIST)
7;; Registration Number H14PRO021
4ed46869
KH
8
9;; Keywords: mule, multilingual, Chinese
10
11;; This file is part of GNU Emacs.
12
13;; GNU Emacs is free software; you can redistribute it and/or modify
14;; it under the terms of the GNU General Public License as published by
15;; the Free Software Foundation; either version 2, or (at your option)
16;; any later version.
17
18;; GNU Emacs is distributed in the hope that it will be useful,
19;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21;; GNU General Public License for more details.
22
23;; You should have received a copy of the GNU General Public License
369314dc
KH
24;; along with GNU Emacs; see the file COPYING. If not, write to the
25;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
26;; Boston, MA 02111-1307, USA.
4ed46869 27
60370d40
PJ
28;;; Commentary:
29
4ed46869
KH
30;;; Code:
31
64b4e1f1 32;; Hz/ZW/EUC-TW encoding stuff
4ed46869
KH
33
34;; HZ is an encoding method for Chinese character set GB2312 used
35;; widely in Internet. It is very similar to 7-bit environment of
36;; ISO-2022. The difference is that HZ uses the sequence "~{" and
37;; "~}" for designating GB2312 and ASCII respectively, hence, it
38;; doesn't uses ESC (0x1B) code.
39
40;; ZW is another encoding method for Chinese character set GB2312. It
41;; encodes Chinese characters line by line by starting each line with
42;; the sequence "zW". It also uses only 7-bit as HZ.
43
64b4e1f1
WL
44;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is
45;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
46;; a single shift escape followed by three bytes: the first gives the
47;; plane, the second and third the character code. Note that characters
48;; of plane 1 are (redundantly) accessible with a single shift escape
49;; also.
50
4ed46869
KH
51;; ISO-2022 escape sequence to designate GB2312.
52(defvar iso2022-gb-designation "\e$A")
53;; HZ escape sequence to designate GB2312.
54(defvar hz-gb-designnation "~{")
55;; ISO-2022 escape sequence to designate ASCII.
56(defvar iso2022-ascii-designation "\e(B")
57;; HZ escape sequence to designate ASCII.
58(defvar hz-ascii-designnation "~}")
59;; Regexp of ZW sequence to start GB2312.
60(defvar zw-start-gb "^zW")
61;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
8f3969f8
KH
62(defvar hz/zw-start-gb
63 (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]"))
4ed46869
KH
64
65(defvar decode-hz-line-continuation nil
66 "Flag to tell if we should care line continuation convention of Hz.")
67
8f3969f8 68(defconst hz-set-msb-table
2254377e
SM
69 (eval-when-compile
70 (let ((chars nil)
71 (i 0))
72 (while (< i 33)
73 (push i chars)
74 (setq i (1+ i)))
75 (while (< i 127)
76 (push (+ i 128) chars)
77 (setq i (1+ i)))
78 (apply 'string (nreverse chars)))))
8f3969f8 79
4ed46869
KH
80;;;###autoload
81(defun decode-hz-region (beg end)
82 "Decode HZ/ZW encoded text in the current region.
83Return the length of resulting text."
84 (interactive "r")
85 (save-excursion
86 (save-restriction
8f3969f8
KH
87 (let (pos ch)
88 (narrow-to-region beg end)
89
90 ;; We, at first, convert HZ/ZW to `euc-china',
91 ;; then decode it.
92
93 ;; "~\n" -> "\n", "~~" -> "~"
94 (goto-char (point-min))
95 (while (search-forward "~" nil t)
96 (setq ch (following-char))
97 (if (or (= ch ?\n) (= ch ?~)) (delete-char -1)))
98
99 ;; "^zW...\n" -> Chinese GB2312
100 ;; "~{...~}" -> Chinese GB2312
101 (goto-char (point-min))
102 (setq beg nil)
4ed46869 103 (while (re-search-forward hz/zw-start-gb nil t)
8f3969f8
KH
104 (setq pos (match-beginning 0)
105 ch (char-after pos))
106 ;; Record the first position to start conversion.
107 (or beg (setq beg pos))
108 (end-of-line)
109 (setq end (point))
110 (if (>= ch 128) ; 8bit GB2312
111 nil
112 (goto-char pos)
113 (delete-char 2)
114 (setq end (- end 2))
115 (if (= ch ?z) ; ZW -> euc-china
116 (progn
117 (translate-region (point) end hz-set-msb-table)
118 (goto-char end))
119 (if (search-forward hz-ascii-designnation
120 (if decode-hz-line-continuation nil end)
121 t)
122 (delete-char -2))
123 (setq end (point))
124 (translate-region pos (point) hz-set-msb-table))))
125 (if beg
126 (decode-coding-region beg end 'euc-china)))
4ed46869
KH
127 (- (point-max) (point-min)))))
128
129;;;###autoload
130(defun decode-hz-buffer ()
131 "Decode HZ/ZW encoded text in the current buffer."
132 (interactive)
133 (decode-hz-region (point-min) (point-max)))
134
135;;;###autoload
136(defun encode-hz-region (beg end)
137 "Encode the text in the current region to HZ.
138Return the length of resulting text."
139 (interactive "r")
140 (save-excursion
141 (save-restriction
142 (narrow-to-region beg end)
143
144 ;; "~" -> "~~"
145 (goto-char (point-min))
146 (while (search-forward "~" nil t) (insert ?~))
147
148 ;; Chinese GB2312 -> "~{...~}"
149 (goto-char (point-min))
150 (if (re-search-forward "\\cc" nil t)
39e0da62 151 (let (pos)
4ed46869 152 (goto-char (setq pos (match-beginning 0)))
5dd921df 153 (encode-coding-region pos (point-max) 'iso-2022-7bit)
4ed46869
KH
154 (goto-char pos)
155 (while (search-forward iso2022-gb-designation nil t)
156 (delete-char -3)
157 (insert hz-gb-designnation))
158 (goto-char pos)
159 (while (search-forward iso2022-ascii-designation nil t)
160 (delete-char -3)
161 (insert hz-ascii-designnation))))
162 (- (point-max) (point-min)))))
163
164;;;###autoload
165(defun encode-hz-buffer ()
166 "Encode the text in the current buffer to HZ."
167 (interactive)
168 (encode-hz-region (point-min) (point-max)))
169
64b4e1f1
WL
170;; The following sets up a translation table (big5-to-cns) from Big 5
171;; to CNS encoding, using some auxiliary functions to make the code
172;; more readable.
173
174;; Many kudos to Himi! The used code has been adapted from his
175;; mule-ucs package.
176
2254377e 177(eval-when-compile
64b4e1f1
WL
178(defun big5-to-flat-code (num)
179 "Convert NUM in Big 5 encoding to a `flat code'.
1800xA140 will be mapped to position 0, 0xA141 to position 1, etc.
181There are no gaps in the flat code."
182
183 (let ((hi (/ num 256))
184 (lo (% num 256)))
185 (+ (* 157 (- hi #xa1))
186 (- lo (if (>= lo #xa1) 98 64)))))
187
188(defun flat-code-to-big5 (num)
189 "Convert NUM from a `flat code' to Big 5 encoding.
190This is the inverse function of `big5-to-flat-code'."
191
192 (let ((hi (/ num 157))
193 (lo (% num 157)))
194 (+ (* 256 (+ hi #xa1))
195 (+ lo (if (< lo 63) 64 98)))))
196
197(defun euc-to-flat-code (num)
198 "Convert NUM in EUC encoding (in GL representation) to a `flat code'.
1990x2121 will be mapped to position 0, 0x2122 to position 1, etc.
200There are no gaps in the flat code."
201
202 (let ((hi (/ num 256))
203 (lo (% num 256)))
204 (+ (* 94 (- hi #x21))
205 (- lo #x21))))
206
207(defun flat-code-to-euc (num)
208 "Convert NUM from a `flat code' to EUC encoding (in GL representation).
209The inverse function of `euc-to-flat-code'. The high and low bytes are
210returned in a list."
211
212 (let ((hi (/ num 94))
213 (lo (% num 94)))
214 (list (+ hi #x21) (+ lo #x21))))
215
216(defun expand-euc-big5-alist (alist)
217 "Create a translation table and fills it with data given in ALIST.
218Elements of ALIST can be either given as
219
220 ((euc-charset . startchar) . (big5-range-begin . big5-range-end))
221
222or as
223
224 (euc-character . big5-charcode)
225
226The former maps a range of glyphs in an EUC charset (where STARTCHAR
227is in GL representation) to a certain range of Big 5 encoded
228characters, the latter maps a single glyph. Glyphs which can't be
229mapped will be represented with the byte 0xFF.
230
231The return value is the filled translation table."
232
2254377e 233 (let ((chartable (make-char-table 'translation-table #xFF))
64b4e1f1
WL
234 char
235 big5
236 i
237 end
238 codepoint
239 charset)
2254377e
SM
240 (dolist (elem alist)
241 (setq char (car elem)
242 big5 (cdr elem))
64b4e1f1
WL
243 (cond ((and (consp char)
244 (consp big5))
2254377e
SM
245 (setq i (big5-to-flat-code (car big5))
246 end (big5-to-flat-code (cdr big5))
247 codepoint (euc-to-flat-code (cdr char))
248 charset (car char))
249 (while (>= end i)
250 (aset chartable
251 (decode-big5-char (flat-code-to-big5 i))
252 (apply (function make-char)
253 charset
254 (flat-code-to-euc codepoint)))
255 (setq i (1+ i)
256 codepoint (1+ codepoint))))
64b4e1f1
WL
257 ((and (char-valid-p char)
258 (numberp big5))
2254377e
SM
259 (setq i (decode-big5-char big5))
260 (aset chartable i char))
64b4e1f1 261 (t
2254377e 262 (error "Unknown slot type: %S" elem))))
64b4e1f1 263 ;; the return value
2254377e 264 chartable)))
64b4e1f1
WL
265
266;; All non-CNS encodings are commented out.
267
268(define-translation-table 'big5-to-cns
2254377e 269 (eval-when-compile
64b4e1f1
WL
270 (expand-euc-big5-alist
271 '(
272 ;; Symbols
273 ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5))
274 (?\e$(G"X\e(B . #xA1F6)
275 (?\e$(G"W\e(B . #xA1F7)
276 ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE))
277 ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF))
278 ;; Control codes (vendor dependent)
279 ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0))
280 ;; Level 1 Ideographs
281 ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD))
282 (?\e$(GWS\e(B . #xACFE)
283 ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF))
284 ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7))
285 ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51))
286 (?\e$(GkP\e(B . #xBE52)
287 ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA))
288 ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA))
289 (?\e$(Gu5\e(B . #xC2CB)
290 ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360))
291 ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8))
292 (?\e$(Gxe\e(B . #xC3B9)
293 (?\e$(Gxd\e(B . #xC3BA)
294 ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455))
295 (?\e$(Gx-\e(B . #xC456)
296 ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E))
297 ;; Symbols
298 ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE))
299 ;; Radicals
300 (?\e$(G'#\e(B . #xC6BF)
301 (?\e$(G'$\e(B . #xC6C0)
302 (?\e$(G'&\e(B . #xC6C1)
303 (?\e$(G'(\e(B . #xC6C2)
304 (?\e$(G'-\e(B . #xC6C3)
305 (?\e$(G'.\e(B . #xC6C4)
306 (?\e$(G'/\e(B . #xC6C5)
307 (?\e$(G'4\e(B . #xC6C6)
308 (?\e$(G'7\e(B . #xC6C7)
309 (?\e$(G':\e(B . #xC6C8)
310 (?\e$(G'<\e(B . #xC6C9)
311 (?\e$(G'B\e(B . #xC6CA)
312 (?\e$(G'G\e(B . #xC6CB)
313 (?\e$(G'N\e(B . #xC6CC)
314 (?\e$(G'S\e(B . #xC6CD)
315 (?\e$(G'T\e(B . #xC6CE)
316 (?\e$(G'U\e(B . #xC6CF)
317 (?\e$(G'Y\e(B . #xC6D0)
318 (?\e$(G'Z\e(B . #xC6D1)
319 (?\e$(G'a\e(B . #xC6D2)
320 (?\e$(G'f\e(B . #xC6D3)
321 (?\e$(G()\e(B . #xC6D4)
322 (?\e$(G(*\e(B . #xC6D5)
323 (?\e$(G(c\e(B . #xC6D6)
324 (?\e$(G(l\e(B . #xC6D7)
325 ;; Diacritical Marks
326 ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9))
327 ;; Japanese Kana Supplement
328 ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3))
329 ;; Japanese Hiragana
330 ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A))
331 ;; Japanese Katakana
332 ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2))
333 ;; Cyrillic Characters
334 ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854))
335 ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875))
336 ;; Special Chinese Characters
337 (?\e$(J!#\e(B . #xC879)
338 (?\e$(J!$\e(B . #xC87B)
339 (?\e$(J!*\e(B . #xC87D)
340 (?\e$(J!R\e(B . #xC8A2)
341
342 ;; JIS X 0208 NOT SIGN (cf. U+00AC)
343 ; (?\e$B"L\e(B . #xC8CD)
344 ;; JIS X 0212 BROKEN BAR (cf. U+00A6)
345 ; (?\e$(D"C\e(B . #xC8CE)
346
347 ;; GB 2312 characters
348 ; (?\e$A!d\e(B . #xC8CF)
349 ; (?\e$A!e\e(B . #xC8D0)
350 ;;;;; C8D1 - Japanese `(\e$B3t\e(B)'
351 ; (?\e$A!m\e(B . #xC8D2)
352 ;;;;; C8D2 - Tel.
353
354 ;; Level 2 Ideographs
355 ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949))
356 (?\e$(GDB\e(B . #xC94A);; a duplicate of #xA461
357 ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B))
358 ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD))
359 (?\e$(H!L\e(B . #xC9BE)
360 ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC))
361 ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6))
362 (?\e$(H"M\e(B . #xCAF7)
363 ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB))
364 (?\e$(H>c\e(B . #xD6CC)
365 ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779))
366 (?\e$(H?j\e(B . #xD77A)
367 ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE))
368 (?\e$(H7o\e(B . #xDADF)
369 ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6))
370 ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB))
371 (?\e$(HAv\e(B . #xDDFC);; a duplicate of #xDCD1
372 ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2))
373 ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975))
374 ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A))
375 ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0))
376 (?\e$(HUK\e(B . #xEBF1)
377 ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD))
378 (?\e$(HW"\e(B . #xECDE)
379 ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9))
380 ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA))
381 (?\e$(Hd/\e(B . #xEEEB)
382 ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055))
383 (?\e$(H]t\e(B . #xF056)
384 ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA))
385 (?\e$(HZ(\e(B . #xF0CB)
386 ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162))
387 ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A))
388 (?\e$(Hga\e(B . #xF16B)
389 ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267))
390 (?\e$(Hi4\e(B . #xF268)
391 ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2))
392 ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374))
393 ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465))
394 ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4))
395 (?\e$(HfM\e(B . #xF4B5)
396 ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC))
397 ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662))
398 (?\e$(HjK\e(B . #xF663)
399 ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976))
400 ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3))
401 (?\e$(Hqf\e(B . #xF9C4)
402 (?\e$(Hr4\e(B . #xF9C5)
403 (?\e$(Hr@\e(B . #xF9C6)
404 ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1))
405 ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5))
406
407 ;; Additional Ideographs
408 (?\e$(IC7\e(B . #xF9D6)
409 (?\e$(IOP\e(B . #xF9D7)
410 (?\e$(IDN\e(B . #xF9D8)
411 (?\e$(IPJ\e(B . #xF9D9)
412 (?\e$(I,]\e(B . #xF9DA)
413 (?\e$(I=~\e(B . #xF9DB)
414 (?\e$(IK\\e(B . #xF9DC)
415 )
2254377e 416 ))
64b4e1f1
WL
417)
418
4ed46869 419;;
650e8505 420(provide 'china-util)
4ed46869 421
ab5796a9 422;;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836
4ed46869 423;;; china-util.el ends here