Switch to recommended form of GPLv3 permissions notice.
[bpt/emacs.git] / lisp / language / china-util.el
CommitLineData
64b4e1f1 1;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*-
4ed46869 2
38141d20 3;; Copyright (C) 1995, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
eaa61218 4;; Free Software Foundation, Inc.
7976eda0 5;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
38141d20 6;; 2005, 2006, 2007, 2008
eaa61218
KH
7;; National Institute of Advanced Industrial Science and Technology (AIST)
8;; Registration Number H14PRO021
8f924df7
KH
9;; Copyright (C) 2003
10;; National Institute of Advanced Industrial Science and Technology (AIST)
11;; Registration Number H13PRO009
4ed46869
KH
12
13;; Keywords: mule, multilingual, Chinese
14
15;; This file is part of GNU Emacs.
16
17;; GNU Emacs is free software; you can redistribute it and/or modify
18;; it under the terms of the GNU General Public License as published by
d7142f3e 19;; the Free Software Foundation; either version 3, or (at your option)
4ed46869
KH
20;; any later version.
21
22;; GNU Emacs is distributed in the hope that it will be useful,
23;; but WITHOUT ANY WARRANTY; without even the implied warranty of
24;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25;; GNU General Public License for more details.
26
27;; You should have received a copy of the GNU General Public License
369314dc 28;; along with GNU Emacs; see the file COPYING. If not, write to the
3a35cf56
LK
29;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
30;; Boston, MA 02110-1301, USA.
4ed46869 31
60370d40
PJ
32;;; Commentary:
33
4ed46869
KH
34;;; Code:
35
64b4e1f1 36;; Hz/ZW/EUC-TW encoding stuff
4ed46869
KH
37
38;; HZ is an encoding method for the Chinese character set GB2312 that
39;; is widely used on the Internet. It is very similar to the 7-bit
40;; environment of ISO-2022. The difference is that HZ uses the sequences
41;; "~{" and "~}" to designate GB2312 and ASCII respectively; hence, it
42;; doesn't use the ESC (0x1B) code.
43
44;; ZW is another encoding method for the Chinese character set GB2312. It
45;; encodes Chinese characters line by line by starting each line with
46;; the sequence "zW". Like HZ, it uses only 7-bit codes.
47
64b4e1f1
WL
48;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is
49;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
50;; a single shift escape followed by three bytes: the first gives the
51;; plane, the second and third the character code. Note that characters
52;; of plane 1 are (redundantly) accessible with a single shift escape
53;; also.
54
4ed46869
KH
;; Escape sequence with which ISO-2022 designates GB2312.
(defvar iso2022-gb-designation "\e$A")
;; Sequence with which HZ designates GB2312.
(defvar hz-gb-designnation "~{")
;; Escape sequence with which ISO-2022 designates ASCII.
(defvar iso2022-ascii-designation "\e(B")
;; Sequence with which HZ designates ASCII.
(defvar hz-ascii-designnation "~}")
;; Regexp matching the ZW sequence that starts GB2312 text.
(defvar zw-start-gb "^zW")
;; Regexp matching the start of GB2312 text in a mixture of HZ and ZW:
;; the HZ designation, the ZW line prefix, or any non-ASCII character.
(defvar hz/zw-start-gb
  (mapconcat 'identity
             (list hz-gb-designnation zw-start-gb "[^\0-\177]")
             "\\|"))

(defvar decode-hz-line-continuation nil
  "Non-nil means honor the line continuation convention of HZ.
When non-nil, `decode-hz-region' does not limit its search for the
closing \"~}\" to the line on which \"~{\" was found.")
71
;; Translation table (a string indexed by character code) used with
;; `translate-region'.  Codes 0 through 32 map to themselves; codes 33
;; through 126 map to the eight-bit character with code + 128.
(defconst hz-set-msb-table
  (eval-when-compile
    (let ((table nil))
      (dotimes (code 127)
        (push (if (< code 33)
                  code
                (decode-char 'eight-bit (+ code 128)))
              table))
      (apply 'string (nreverse table)))))
8f3969f8 83
4ed46869
KH
;;;###autoload
(defun decode-hz-region (beg end)
  "Decode HZ/ZW encoded text in the current region.
BEG and END delimit the region.  The text is first rewritten in
place as `euc-china' bytes and then decoded with that coding system.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (let (pos ch conv-start conv-end)
        (narrow-to-region beg end)

        ;; We, at first, convert HZ/ZW to `euc-china',
        ;; then decode it.

        ;; "~\n" -> "\n", "~~" -> "~"
        (goto-char (point-min))
        (while (search-forward "~" nil t)
          (setq ch (following-char))
          (cond ((= ch ?\n)
                 (delete-char -1))
                ((= ch ?~)
                 (delete-char -1)
                 ;; Step over the surviving tilde.  Without this the
                 ;; next search would find it again and treat it as a
                 ;; new escape introducer, so that "~~~~" collapsed to
                 ;; "~" and "~~\n" lost its literal tilde.
                 (forward-char 1))))
        ;; NOTE(review): a literal "~{" left behind by unescaping "~~{"
        ;; is still matched by the designation search below.

        ;; "^zW...\n" -> Chinese GB2312
        ;; "~{...~}"  -> Chinese GB2312
        (goto-char (point-min))
        (while (re-search-forward hz/zw-start-gb nil t)
          (setq pos (match-beginning 0)
                ch (char-after pos))
          ;; Record the first position to start conversion.
          (or conv-start (setq conv-start pos))
          (end-of-line)
          (setq conv-end (point))
          ;; A character >= 128 is already 8-bit GB2312; nothing to
          ;; rewrite on this line.
          (when (< ch 128)
            (goto-char pos)
            ;; Remove the two-character "zW" or "~{" introducer.
            (delete-char 2)
            (setq conv-end (- conv-end 2))
            (if (= ch ?z)
                ;; ZW -> euc-china: the rest of the line is GB2312.
                (progn
                  (translate-region (point) conv-end hz-set-msb-table)
                  (goto-char conv-end))
              ;; HZ -> euc-china: GB2312 text runs up to "~}".  The
              ;; search is bounded by the end of the line unless
              ;; `decode-hz-line-continuation' is non-nil.
              (if (search-forward hz-ascii-designnation
                                  (if decode-hz-line-continuation
                                      nil
                                    conv-end)
                                  t)
                  (delete-char -2))
              (setq conv-end (point))
              (translate-region pos (point) hz-set-msb-table))))
        (if conv-start
            (decode-coding-region conv-start conv-end 'euc-china)))
      (- (point-max) (point-min)))))
132
;;;###autoload
(defun decode-hz-buffer ()
  "Decode HZ/ZW encoded text in the current buffer.
This calls `decode-hz-region' on the whole accessible portion of the
buffer and returns its value."
  (interactive)
  (let ((start (point-min))
        (stop (point-max)))
    (decode-hz-region start stop)))
138
;;;###autoload
(defun encode-hz-region (beg end)
  "Encode the text in the current region to HZ.
BEG and END delimit the region.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (narrow-to-region beg end)

      ;; Escape each literal tilde by doubling it: "~" -> "~~".
      (goto-char (point-min))
      (while (search-forward "~" nil t)
        (insert ?~))

      ;; Chinese GB2312 -> "~{...~}".  Starting at the first Chinese
      ;; character, encode to `iso-2022-7bit' and then replace each
      ;; three-character ISO-2022 designation by its HZ counterpart.
      (goto-char (point-min))
      (when (re-search-forward "\\cc" nil t)
        (let ((start (match-beginning 0)))
          (goto-char start)
          (encode-coding-region start (point-max) 'iso-2022-7bit)
          (dolist (pair (list (cons iso2022-gb-designation
                                    hz-gb-designnation)
                              (cons iso2022-ascii-designation
                                    hz-ascii-designnation)))
            (goto-char start)
            (while (search-forward (car pair) nil t)
              (delete-char -3)
              (insert (cdr pair))))))
      (- (point-max) (point-min)))))
167
;;;###autoload
(defun encode-hz-buffer ()
  "Encode the text in the current buffer to HZ.
This calls `encode-hz-region' on the whole accessible portion of the
buffer and returns its value."
  (interactive)
  (let ((start (point-min))
        (stop (point-max)))
    (encode-hz-region start stop)))
173
0374bae4
DL
;;;###autoload
(defun post-read-decode-hz (len)
  "Decode the LEN characters following point as HZ/ZW text.
Return the value of `decode-hz-region'.  The buffer's modified flag
is restored to what it was before decoding."
  (let* ((start (point))
         (was-modified (buffer-modified-p))
         ;; Bound here so that any value set while decoding stays local
         ;; to this call.
         (last-coding-system-used nil))
    (prog1 (decode-hz-region start (+ start len))
      (set-buffer-modified-p was-modified))))
64b4e1f1 182
0374bae4
DL
;;;###autoload
(defun pre-write-encode-hz (from to)
  "Copy the text to be written into a new buffer and encode it to HZ.
If FROM is a string, that string is the text; otherwise FROM and TO
delimit the text in the current buffer.  The newly generated
\" *temp*\" buffer is left current.  Return nil."
  (let ((source (current-buffer)))
    (set-buffer (generate-new-buffer " *temp*"))
    (cond ((stringp from)
           (insert from))
          (t
           (insert-buffer-substring source from to)))
    ;; Bound here so that any value set while encoding stays local to
    ;; this call.
    (let ((last-coding-system-used nil))
      (encode-hz-region 1 (point-max)))
    nil))
4ed46869 193;;
650e8505 194(provide 'china-util)
4ed46869 195
cbee283d 196;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836
4ed46869 197;;; china-util.el ends here