Add arch taglines
[bpt/emacs.git] / lisp / language / china-util.el
CommitLineData
64b4e1f1 1;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*-
4ed46869 2
2254377e 3;; Copyright (C) 1995, 2003 Electrotechnical Laboratory, JAPAN.
fa526c4a 4;; Licensed to the Free Software Foundation.
64b4e1f1 5;; Copyright (C) 1995, 2001 Free Software Foundation, Inc.
4ed46869
KH
6
7;; Keywords: mule, multilingual, Chinese
8
9;; This file is part of GNU Emacs.
10
11;; GNU Emacs is free software; you can redistribute it and/or modify
12;; it under the terms of the GNU General Public License as published by
13;; the Free Software Foundation; either version 2, or (at your option)
14;; any later version.
15
16;; GNU Emacs is distributed in the hope that it will be useful,
17;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19;; GNU General Public License for more details.
20
21;; You should have received a copy of the GNU General Public License
369314dc
KH
22;; along with GNU Emacs; see the file COPYING. If not, write to the
23;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24;; Boston, MA 02111-1307, USA.
4ed46869 25
60370d40
PJ
26;;; Commentary:
27
4ed46869
KH
28;;; Code:
29
64b4e1f1 30;; Hz/ZW/EUC-TW encoding stuff
4ed46869
KH
31
32;; HZ is an encoding method for Chinese character set GB2312 used
33;; widely in Internet. It is very similar to 7-bit environment of
34;; ISO-2022. The difference is that HZ uses the sequence "~{" and
35;; "~}" for designating GB2312 and ASCII respectively, hence, it
36;; doesn't uses ESC (0x1B) code.
37
38;; ZW is another encoding method for Chinese character set GB2312. It
39;; encodes Chinese characters line by line by starting each line with
40;; the sequence "zW". It also uses only 7-bit as HZ.
41
64b4e1f1
WL
42;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is
43;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
44;; a single shift escape followed by three bytes: the first gives the
45;; plane, the second and third the character code. Note that characters
46;; of plane 1 are (redundantly) accessible with a single shift escape
47;; also.
48
4ed46869
KH
49;; ISO-2022 escape sequence to designate GB2312.
50(defvar iso2022-gb-designation "\e$A")
51;; HZ escape sequence to designate GB2312.
52(defvar hz-gb-designnation "~{")
53;; ISO-2022 escape sequence to designate ASCII.
54(defvar iso2022-ascii-designation "\e(B")
55;; HZ escape sequence to designate ASCII.
56(defvar hz-ascii-designnation "~}")
57;; Regexp of ZW sequence to start GB2312.
58(defvar zw-start-gb "^zW")
59;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
8f3969f8
KH
60(defvar hz/zw-start-gb
61 (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]"))
4ed46869
KH
62
63(defvar decode-hz-line-continuation nil
64 "Flag to tell if we should care line continuation convention of Hz.")
65
8f3969f8 66(defconst hz-set-msb-table
2254377e
SM
67 (eval-when-compile
68 (let ((chars nil)
69 (i 0))
70 (while (< i 33)
71 (push i chars)
72 (setq i (1+ i)))
73 (while (< i 127)
74 (push (+ i 128) chars)
75 (setq i (1+ i)))
76 (apply 'string (nreverse chars)))))
8f3969f8 77
4ed46869
KH
78;;;###autoload
79(defun decode-hz-region (beg end)
80 "Decode HZ/ZW encoded text in the current region.
81Return the length of resulting text."
82 (interactive "r")
83 (save-excursion
84 (save-restriction
8f3969f8
KH
85 (let (pos ch)
86 (narrow-to-region beg end)
87
88 ;; We, at first, convert HZ/ZW to `euc-china',
89 ;; then decode it.
90
91 ;; "~\n" -> "\n", "~~" -> "~"
92 (goto-char (point-min))
93 (while (search-forward "~" nil t)
94 (setq ch (following-char))
95 (if (or (= ch ?\n) (= ch ?~)) (delete-char -1)))
96
97 ;; "^zW...\n" -> Chinese GB2312
98 ;; "~{...~}" -> Chinese GB2312
99 (goto-char (point-min))
100 (setq beg nil)
4ed46869 101 (while (re-search-forward hz/zw-start-gb nil t)
8f3969f8
KH
102 (setq pos (match-beginning 0)
103 ch (char-after pos))
104 ;; Record the first position to start conversion.
105 (or beg (setq beg pos))
106 (end-of-line)
107 (setq end (point))
108 (if (>= ch 128) ; 8bit GB2312
109 nil
110 (goto-char pos)
111 (delete-char 2)
112 (setq end (- end 2))
113 (if (= ch ?z) ; ZW -> euc-china
114 (progn
115 (translate-region (point) end hz-set-msb-table)
116 (goto-char end))
117 (if (search-forward hz-ascii-designnation
118 (if decode-hz-line-continuation nil end)
119 t)
120 (delete-char -2))
121 (setq end (point))
122 (translate-region pos (point) hz-set-msb-table))))
123 (if beg
124 (decode-coding-region beg end 'euc-china)))
4ed46869
KH
125 (- (point-max) (point-min)))))
126
127;;;###autoload
128(defun decode-hz-buffer ()
129 "Decode HZ/ZW encoded text in the current buffer."
130 (interactive)
131 (decode-hz-region (point-min) (point-max)))
132
133;;;###autoload
134(defun encode-hz-region (beg end)
135 "Encode the text in the current region to HZ.
136Return the length of resulting text."
137 (interactive "r")
138 (save-excursion
139 (save-restriction
140 (narrow-to-region beg end)
141
142 ;; "~" -> "~~"
143 (goto-char (point-min))
144 (while (search-forward "~" nil t) (insert ?~))
145
146 ;; Chinese GB2312 -> "~{...~}"
147 (goto-char (point-min))
148 (if (re-search-forward "\\cc" nil t)
39e0da62 149 (let (pos)
4ed46869 150 (goto-char (setq pos (match-beginning 0)))
5dd921df 151 (encode-coding-region pos (point-max) 'iso-2022-7bit)
4ed46869
KH
152 (goto-char pos)
153 (while (search-forward iso2022-gb-designation nil t)
154 (delete-char -3)
155 (insert hz-gb-designnation))
156 (goto-char pos)
157 (while (search-forward iso2022-ascii-designation nil t)
158 (delete-char -3)
159 (insert hz-ascii-designnation))))
160 (- (point-max) (point-min)))))
161
162;;;###autoload
163(defun encode-hz-buffer ()
164 "Encode the text in the current buffer to HZ."
165 (interactive)
166 (encode-hz-region (point-min) (point-max)))
167
64b4e1f1
WL
168;; The following sets up a translation table (big5-to-cns) from Big 5
169;; to CNS encoding, using some auxiliary functions to make the code
170;; more readable.
171
172;; Many kudos to Himi! The used code has been adapted from his
173;; mule-ucs package.
174
2254377e 175(eval-when-compile
64b4e1f1
WL
176(defun big5-to-flat-code (num)
177 "Convert NUM in Big 5 encoding to a `flat code'.
1780xA140 will be mapped to position 0, 0xA141 to position 1, etc.
179There are no gaps in the flat code."
180
181 (let ((hi (/ num 256))
182 (lo (% num 256)))
183 (+ (* 157 (- hi #xa1))
184 (- lo (if (>= lo #xa1) 98 64)))))
185
186(defun flat-code-to-big5 (num)
187 "Convert NUM from a `flat code' to Big 5 encoding.
188This is the inverse function of `big5-to-flat-code'."
189
190 (let ((hi (/ num 157))
191 (lo (% num 157)))
192 (+ (* 256 (+ hi #xa1))
193 (+ lo (if (< lo 63) 64 98)))))
194
195(defun euc-to-flat-code (num)
196 "Convert NUM in EUC encoding (in GL representation) to a `flat code'.
1970x2121 will be mapped to position 0, 0x2122 to position 1, etc.
198There are no gaps in the flat code."
199
200 (let ((hi (/ num 256))
201 (lo (% num 256)))
202 (+ (* 94 (- hi #x21))
203 (- lo #x21))))
204
205(defun flat-code-to-euc (num)
206 "Convert NUM from a `flat code' to EUC encoding (in GL representation).
207The inverse function of `euc-to-flat-code'. The high and low bytes are
208returned in a list."
209
210 (let ((hi (/ num 94))
211 (lo (% num 94)))
212 (list (+ hi #x21) (+ lo #x21))))
213
214(defun expand-euc-big5-alist (alist)
215 "Create a translation table and fills it with data given in ALIST.
216Elements of ALIST can be either given as
217
218 ((euc-charset . startchar) . (big5-range-begin . big5-range-end))
219
220or as
221
222 (euc-character . big5-charcode)
223
224The former maps a range of glyphs in an EUC charset (where STARTCHAR
225is in GL representation) to a certain range of Big 5 encoded
226characters, the latter maps a single glyph. Glyphs which can't be
227mapped will be represented with the byte 0xFF.
228
229The return value is the filled translation table."
230
2254377e 231 (let ((chartable (make-char-table 'translation-table #xFF))
64b4e1f1
WL
232 char
233 big5
234 i
235 end
236 codepoint
237 charset)
2254377e
SM
238 (dolist (elem alist)
239 (setq char (car elem)
240 big5 (cdr elem))
64b4e1f1
WL
241 (cond ((and (consp char)
242 (consp big5))
2254377e
SM
243 (setq i (big5-to-flat-code (car big5))
244 end (big5-to-flat-code (cdr big5))
245 codepoint (euc-to-flat-code (cdr char))
246 charset (car char))
247 (while (>= end i)
248 (aset chartable
249 (decode-big5-char (flat-code-to-big5 i))
250 (apply (function make-char)
251 charset
252 (flat-code-to-euc codepoint)))
253 (setq i (1+ i)
254 codepoint (1+ codepoint))))
64b4e1f1
WL
255 ((and (char-valid-p char)
256 (numberp big5))
2254377e
SM
257 (setq i (decode-big5-char big5))
258 (aset chartable i char))
64b4e1f1 259 (t
2254377e 260 (error "Unknown slot type: %S" elem))))
64b4e1f1 261 ;; the return value
2254377e 262 chartable)))
64b4e1f1
WL
263
264;; All non-CNS encodings are commented out.
265
266(define-translation-table 'big5-to-cns
2254377e 267 (eval-when-compile
64b4e1f1
WL
268 (expand-euc-big5-alist
269 '(
270 ;; Symbols
271 ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5))
272 (?\e$(G"X\e(B . #xA1F6)
273 (?\e$(G"W\e(B . #xA1F7)
274 ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE))
275 ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF))
276 ;; Control codes (vendor dependent)
277 ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0))
278 ;; Level 1 Ideographs
279 ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD))
280 (?\e$(GWS\e(B . #xACFE)
281 ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF))
282 ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7))
283 ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51))
284 (?\e$(GkP\e(B . #xBE52)
285 ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA))
286 ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA))
287 (?\e$(Gu5\e(B . #xC2CB)
288 ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360))
289 ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8))
290 (?\e$(Gxe\e(B . #xC3B9)
291 (?\e$(Gxd\e(B . #xC3BA)
292 ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455))
293 (?\e$(Gx-\e(B . #xC456)
294 ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E))
295 ;; Symbols
296 ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE))
297 ;; Radicals
298 (?\e$(G'#\e(B . #xC6BF)
299 (?\e$(G'$\e(B . #xC6C0)
300 (?\e$(G'&\e(B . #xC6C1)
301 (?\e$(G'(\e(B . #xC6C2)
302 (?\e$(G'-\e(B . #xC6C3)
303 (?\e$(G'.\e(B . #xC6C4)
304 (?\e$(G'/\e(B . #xC6C5)
305 (?\e$(G'4\e(B . #xC6C6)
306 (?\e$(G'7\e(B . #xC6C7)
307 (?\e$(G':\e(B . #xC6C8)
308 (?\e$(G'<\e(B . #xC6C9)
309 (?\e$(G'B\e(B . #xC6CA)
310 (?\e$(G'G\e(B . #xC6CB)
311 (?\e$(G'N\e(B . #xC6CC)
312 (?\e$(G'S\e(B . #xC6CD)
313 (?\e$(G'T\e(B . #xC6CE)
314 (?\e$(G'U\e(B . #xC6CF)
315 (?\e$(G'Y\e(B . #xC6D0)
316 (?\e$(G'Z\e(B . #xC6D1)
317 (?\e$(G'a\e(B . #xC6D2)
318 (?\e$(G'f\e(B . #xC6D3)
319 (?\e$(G()\e(B . #xC6D4)
320 (?\e$(G(*\e(B . #xC6D5)
321 (?\e$(G(c\e(B . #xC6D6)
322 (?\e$(G(l\e(B . #xC6D7)
323 ;; Diacritical Marks
324 ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9))
325 ;; Japanese Kana Supplement
326 ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3))
327 ;; Japanese Hiragana
328 ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A))
329 ;; Japanese Katakana
330 ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2))
331 ;; Cyrillic Characters
332 ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854))
333 ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875))
334 ;; Special Chinese Characters
335 (?\e$(J!#\e(B . #xC879)
336 (?\e$(J!$\e(B . #xC87B)
337 (?\e$(J!*\e(B . #xC87D)
338 (?\e$(J!R\e(B . #xC8A2)
339
340 ;; JIS X 0208 NOT SIGN (cf. U+00AC)
341 ; (?\e$B"L\e(B . #xC8CD)
342 ;; JIS X 0212 BROKEN BAR (cf. U+00A6)
343 ; (?\e$(D"C\e(B . #xC8CE)
344
345 ;; GB 2312 characters
346 ; (?\e$A!d\e(B . #xC8CF)
347 ; (?\e$A!e\e(B . #xC8D0)
348 ;;;;; C8D1 - Japanese `(\e$B3t\e(B)'
349 ; (?\e$A!m\e(B . #xC8D2)
350 ;;;;; C8D2 - Tel.
351
352 ;; Level 2 Ideographs
353 ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949))
354 (?\e$(GDB\e(B . #xC94A);; a duplicate of #xA461
355 ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B))
356 ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD))
357 (?\e$(H!L\e(B . #xC9BE)
358 ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC))
359 ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6))
360 (?\e$(H"M\e(B . #xCAF7)
361 ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB))
362 (?\e$(H>c\e(B . #xD6CC)
363 ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779))
364 (?\e$(H?j\e(B . #xD77A)
365 ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE))
366 (?\e$(H7o\e(B . #xDADF)
367 ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6))
368 ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB))
369 (?\e$(HAv\e(B . #xDDFC);; a duplicate of #xDCD1
370 ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2))
371 ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975))
372 ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A))
373 ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0))
374 (?\e$(HUK\e(B . #xEBF1)
375 ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD))
376 (?\e$(HW"\e(B . #xECDE)
377 ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9))
378 ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA))
379 (?\e$(Hd/\e(B . #xEEEB)
380 ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055))
381 (?\e$(H]t\e(B . #xF056)
382 ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA))
383 (?\e$(HZ(\e(B . #xF0CB)
384 ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162))
385 ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A))
386 (?\e$(Hga\e(B . #xF16B)
387 ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267))
388 (?\e$(Hi4\e(B . #xF268)
389 ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2))
390 ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374))
391 ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465))
392 ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4))
393 (?\e$(HfM\e(B . #xF4B5)
394 ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC))
395 ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662))
396 (?\e$(HjK\e(B . #xF663)
397 ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976))
398 ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3))
399 (?\e$(Hqf\e(B . #xF9C4)
400 (?\e$(Hr4\e(B . #xF9C5)
401 (?\e$(Hr@\e(B . #xF9C6)
402 ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1))
403 ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5))
404
405 ;; Additional Ideographs
406 (?\e$(IC7\e(B . #xF9D6)
407 (?\e$(IOP\e(B . #xF9D7)
408 (?\e$(IDN\e(B . #xF9D8)
409 (?\e$(IPJ\e(B . #xF9D9)
410 (?\e$(I,]\e(B . #xF9DA)
411 (?\e$(I=~\e(B . #xF9DB)
412 (?\e$(IK\\e(B . #xF9DC)
413 )
2254377e 414 ))
64b4e1f1
WL
415)
416
4ed46869 417;;
650e8505 418(provide 'china-util)
4ed46869 419
ab5796a9 420;;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836
4ed46869 421;;; china-util.el ends here