Add 2012 to FSF copyright years for Emacs files
[bpt/emacs.git] / lisp / language / china-util.el
CommitLineData
64b4e1f1 1;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*-
4ed46869 2
acaf905b 3;; Copyright (C) 1995, 2001-2012 Free Software Foundation, Inc.
7976eda0 4;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 5;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
eaa61218
KH
6;; National Institute of Advanced Industrial Science and Technology (AIST)
7;; Registration Number H14PRO021
8f924df7
KH
8;; Copyright (C) 2003
9;; National Institute of Advanced Industrial Science and Technology (AIST)
10;; Registration Number H13PRO009
4ed46869
KH
11
12;; Keywords: mule, multilingual, Chinese
13
14;; This file is part of GNU Emacs.
15
4936186e 16;; GNU Emacs is free software: you can redistribute it and/or modify
4ed46869 17;; it under the terms of the GNU General Public License as published by
4936186e
GM
18;; the Free Software Foundation, either version 3 of the License, or
19;; (at your option) any later version.
4ed46869
KH
20
21;; GNU Emacs is distributed in the hope that it will be useful,
22;; but WITHOUT ANY WARRANTY; without even the implied warranty of
23;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24;; GNU General Public License for more details.
25
26;; You should have received a copy of the GNU General Public License
4936186e 27;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
4ed46869 28
60370d40
PJ
29;;; Commentary:
30
4ed46869
KH
31;;; Code:
32
64b4e1f1 33;; Hz/ZW/EUC-TW encoding stuff
4ed46869
KH
34
35;; HZ is an encoding method for the Chinese character set GB2312 that is
36;; widely used on the Internet.  It is very similar to the 7-bit environment
37;; of ISO-2022.  The difference is that HZ uses the sequences "~{" and
38;; "~}" to designate GB2312 and ASCII respectively; hence, it
39;; doesn't use the ESC (0x1B) code.
40
41;; ZW is another encoding method for the Chinese character set GB2312.  It
42;; encodes Chinese characters line by line, starting each such line with
43;; the sequence "zW".  Like HZ, it uses only 7-bit codes.
44
64b4e1f1
WL
45;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is
46;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
47;; a single shift escape followed by three bytes: the first gives the
48;; plane, the second and third the character code. Note that characters
49;; of plane 1 are (redundantly) accessible with a single shift escape
50;; also.
51
4ed46869
KH
(defvar iso2022-gb-designation "\e$A"
  "ISO-2022 escape sequence to designate GB2312.")

(defvar hz-gb-designnation "~{"
  "HZ escape sequence to designate GB2312.")

(defvar iso2022-ascii-designation "\e(B"
  "ISO-2022 escape sequence to designate ASCII.")

(defvar hz-ascii-designnation "~}"
  "HZ escape sequence to designate ASCII.")

(defvar zw-start-gb "^zW"
  "Regexp of ZW sequence to start GB2312.")

(defvar hz/zw-start-gb
  (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]")
  "Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
It matches the HZ designation of GB2312, the ZW line prefix, or any
character outside the 7-bit range.")

(defvar decode-hz-line-continuation nil
  "Non-nil means a GB2312 run opened by \"~{\" may span several lines.
When nil, `decode-hz-region' looks for the closing \"~}\" only up to the
end of the line on which the run starts.")
68
(defconst hz-set-msb-table
  (eval-when-compile
    (let (table)
      ;; Codes 0..32 are kept as they are; codes 33..126 are replaced by
      ;; the raw 8-bit byte whose value is 128 higher.
      (dotimes (code 127)
        (push (if (< code 33)
                  code
                (decode-char 'eight-bit (+ code 128)))
              table))
      (apply 'string (nreverse table))))
  "Translation table, as a string, for `translate-region'.
Characters 0 through 32 map to themselves; characters 33 through 126
map to the raw 8-bit byte with the most significant bit set.")
8f3969f8 80
4ed46869
KH
81;;;###autoload
(defun decode-hz-region (beg end)
  "Decode HZ/ZW encoded text in the current region.
BEG and END delimit the region.  The text is first rewritten in place
into 8-bit `euc-china' form and is then decoded with that coding system.
An escaped tilde (\"~~\") yields exactly one literal \"~\"; that tilde is
never taken as the start of another escape or of a \"~{\" designation.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (let (pos ch)
        (narrow-to-region beg end)

        ;; We, at first, convert HZ/ZW to `euc-china',
        ;; then decode it.

        ;; "~\n" -> "\n", "~~" -> "~"
        (goto-char (point-min))
        (while (search-forward "~" nil t)
          (setq ch (following-char))
          (cond ((= ch ?\n)
                 (delete-char -1))
                ((= ch ?~)
                 (delete-char -1)
                 ;; The surviving tilde is literal text.  Tag it so the
                 ;; conversion pass below does not read "~{" into it, and
                 ;; step over it so it is not re-read as an escape lead
                 ;; (otherwise "~~~~" would collapse to a single "~").
                 (with-silent-modifications
                   (put-text-property (point) (1+ (point))
                                      'hz-literal-tilde t))
                 (forward-char 1))))

        ;; "^zW...\n" -> Chinese GB2312
        ;; "~{...~}" -> Chinese GB2312
        (goto-char (point-min))
        ;; From here on BEG is the start of the first converted run
        ;; (nil while nothing has been converted) and END is the end of
        ;; the most recently converted one.
        (setq beg nil)
        (while (re-search-forward hz/zw-start-gb nil t)
          (setq pos (match-beginning 0)
                ch (char-after pos))
          ;; A "~{" whose tilde came from "~~" is plain text: skip it.
          (unless (get-text-property pos 'hz-literal-tilde)
            ;; Record the first position to start conversion.
            (or beg (setq beg pos))
            (end-of-line)
            (setq end (point))
            (if (>= ch 128)             ; 8bit GB2312
                nil
              (goto-char pos)
              ;; Drop the two-character introducer ("~{" or "zW").
              (delete-char 2)
              (setq end (- end 2))
              (if (= ch ?z)             ; ZW -> euc-china
                  (progn
                    (translate-region (point) end hz-set-msb-table)
                    (goto-char end))
                ;; HZ: the run ends at "~}", searched on this line only
                ;; unless line continuation is enabled.
                (if (search-forward hz-ascii-designnation
                                    (if decode-hz-line-continuation nil end)
                                    t)
                    (delete-char -2))
                (setq end (point))
                (translate-region pos (point) hz-set-msb-table)))))
        ;; The temporary tags must not outlive this function.
        (with-silent-modifications
          (remove-text-properties (point-min) (point-max)
                                  '(hz-literal-tilde nil)))
        (if beg
            (decode-coding-region beg end 'euc-china)))
      (- (point-max) (point-min)))))
129
130;;;###autoload
(defun decode-hz-buffer ()
  "Decode HZ/ZW encoded text in the current buffer.
This runs `decode-hz-region' from `point-min' to `point-max' and
returns its value."
  (interactive)
  (let ((start (point-min))
        (stop (point-max)))
    (decode-hz-region start stop)))
135
136;;;###autoload
(defun encode-hz-region (beg end)
  "Encode the text in the current region to HZ.
BEG and END delimit the region.  Every \"~\" is doubled first; then the
text from the first Chinese character onward is encoded with
`iso-2022-7bit' and the ISO-2022 designations of GB2312 and ASCII are
rewritten as their HZ counterparts.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (narrow-to-region beg end)

      ;; Escape literal tildes: "~" -> "~~"
      (goto-char (point-min))
      (while (search-forward "~" nil t)
        (insert ?~))

      ;; Chinese GB2312 -> "~{...~}"
      (goto-char (point-min))
      (when (re-search-forward "\\cc" nil t)
        (let ((start (match-beginning 0)))
          (goto-char start)
          (encode-coding-region start (point-max) 'iso-2022-7bit)
          ;; Swap each three-character ISO-2022 designation for the HZ
          ;; sequence, GB2312 first and ASCII second.
          (dolist (swap (list (cons iso2022-gb-designation
                                    hz-gb-designnation)
                              (cons iso2022-ascii-designation
                                    hz-ascii-designnation)))
            (goto-char start)
            (while (search-forward (car swap) nil t)
              (delete-char -3)
              (insert (cdr swap))))))
      (- (point-max) (point-min)))))
164
165;;;###autoload
(defun encode-hz-buffer ()
  "Encode the text in the current buffer to HZ.
This runs `encode-hz-region' from `point-min' to `point-max' and
returns its value."
  (interactive)
  (let ((start (point-min))
        (stop (point-max)))
    (encode-hz-region start stop)))
170
0374bae4
DL
171;;;###autoload
(defun post-read-decode-hz (len)
  "Decode the LEN characters of HZ/ZW text that start at point.
The buffer's modified flag is put back to what it was on entry, and
`last-coding-system-used' is bound locally around the decoding.
Return the value of `decode-hz-region'."
  (let* ((start (point))
         (was-modified (buffer-modified-p))
         (last-coding-system-used nil)
         (result (decode-hz-region start (+ start len))))
    (set-buffer-modified-p was-modified)
    result))
64b4e1f1 179
0374bae4
DL
180;;;###autoload
(defun pre-write-encode-hz (from to)
  "Put an HZ-encoded copy of the text to write into a new buffer.
FROM is either a string, which is the text itself, or a buffer
position; in the latter case the text between FROM and TO of the
buffer current on entry is used.  The copy is made in a fresh buffer
named \" *temp*\", which is left as the current buffer.
`last-coding-system-used' is bound locally around the encoding.
Return nil."
  (let ((origin (current-buffer))
        (scratch (generate-new-buffer " *temp*")))
    (set-buffer scratch)
    (cond ((stringp from)
           (insert from))
          (t
           (insert-buffer-substring origin from to)))
    (let ((last-coding-system-used nil))
      (encode-hz-region 1 (point-max))))
  nil)
4ed46869 190;;
650e8505 191(provide 'china-util)
4ed46869 192
4ed46869 193;;; china-util.el ends here