Commit | Line | Data |
---|---|---|
64b4e1f1 | 1 | ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*- |
4ed46869 | 2 | |
38141d20 | 3 | ;; Copyright (C) 1995, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 |
eaa61218 | 4 | ;; Free Software Foundation, Inc. |
7976eda0 | 5 | ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, |
38141d20 | 6 | ;; 2005, 2006, 2007, 2008 |
eaa61218 KH |
7 | ;; National Institute of Advanced Industrial Science and Technology (AIST) |
8 | ;; Registration Number H14PRO021 | |
8f924df7 KH |
9 | ;; Copyright (C) 2003 |
10 | ;; National Institute of Advanced Industrial Science and Technology (AIST) | |
11 | ;; Registration Number H13PRO009 | |
4ed46869 KH |
12 | |
13 | ;; Keywords: mule, multilingual, Chinese | |
14 | ||
15 | ;; This file is part of GNU Emacs. | |
16 | ||
17 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
18 | ;; it under the terms of the GNU General Public License as published by | |
d7142f3e | 19 | ;; the Free Software Foundation; either version 3, or (at your option) |
4ed46869 KH |
20 | ;; any later version. |
21 | ||
22 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
23 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
24 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
25 | ;; GNU General Public License for more details. | |
26 | ||
27 | ;; You should have received a copy of the GNU General Public License | |
369314dc | 28 | ;; along with GNU Emacs; see the file COPYING. If not, write to the |
3a35cf56 LK |
29 | ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
30 | ;; Boston, MA 02110-1301, USA. | |
4ed46869 | 31 | |
60370d40 PJ |
32 | ;;; Commentary: |
33 | ||
4ed46869 KH |
34 | ;;; Code: |
35 | ||
64b4e1f1 | 36 | ;; Hz/ZW/EUC-TW encoding stuff |
4ed46869 KH |
37 | |
38 | ;; HZ is an encoding method for the Chinese character set GB2312 that | |
39 | ;; is widely used on the Internet. It is very similar to the 7-bit | |
40 | ;; environment of ISO-2022. The difference is that HZ uses the sequences | |
41 | ;; "~{" and "~}" to designate GB2312 and ASCII respectively; hence, it | |
42 | ;; doesn't use the ESC (0x1B) code. | |
43 | ||
44 | ;; ZW is another encoding method for the Chinese character set GB2312. It | |
45 | ;; encodes Chinese characters line by line, starting each such line with | |
46 | ;; the sequence "zW". Like HZ, it uses only 7-bit codes. | |
47 | ||
64b4e1f1 WL |
48 | ;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is |
49 | ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with | |
50 | ;; a single shift escape followed by three bytes: the first gives the | |
51 | ;; plane, the second and third the character code. Note that characters | |
52 | ;; of plane 1 are (redundantly) accessible with a single shift escape | |
53 | ;; also. | |
54 | ||
4ed46869 KH |
55 | ;; ISO-2022 escape sequence to designate GB2312. |
56 | (defvar iso2022-gb-designation "\e$A") | |
57 | ;; HZ escape sequence to designate GB2312. | |
58 | (defvar hz-gb-designnation "~{") | |
59 | ;; ISO-2022 escape sequence to designate ASCII. | |
60 | (defvar iso2022-ascii-designation "\e(B") | |
61 | ;; HZ escape sequence to designate ASCII. | |
62 | (defvar hz-ascii-designnation "~}") | |
63 | ;; Regexp of ZW sequence to start GB2312. | |
64 | (defvar zw-start-gb "^zW") | |
65 | ;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW. | |
8f3969f8 KH |
66 | (defvar hz/zw-start-gb |
67 | (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]")) | |
4ed46869 KH |
68 | |
69 | (defvar decode-hz-line-continuation nil | |
70 | "Flag to tell if we should care line continuation convention of Hz.") | |
71 | ||
8f3969f8 | 72 | (defconst hz-set-msb-table |
2254377e SM |
73 | (eval-when-compile |
74 | (let ((chars nil) | |
75 | (i 0)) | |
76 | (while (< i 33) | |
77 | (push i chars) | |
78 | (setq i (1+ i))) | |
79 | (while (< i 127) | |
6bad844c | 80 | (push (decode-char 'eight-bit (+ i 128)) chars) |
2254377e SM |
81 | (setq i (1+ i))) |
82 | (apply 'string (nreverse chars))))) | |
8f3969f8 | 83 | |
4ed46869 KH |
84 | ;;;###autoload |
85 | (defun decode-hz-region (beg end) | |
86 | "Decode HZ/ZW encoded text in the current region. | |
87 | Return the length of resulting text." | |
88 | (interactive "r") | |
89 | (save-excursion | |
90 | (save-restriction | |
8f3969f8 KH |
91 | (let (pos ch) |
92 | (narrow-to-region beg end) | |
93 | ||
94 | ;; We, at first, convert HZ/ZW to `euc-china', | |
95 | ;; then decode it. | |
96 | ||
97 | ;; "~\n" -> "\n", "~~" -> "~" | |
98 | (goto-char (point-min)) | |
99 | (while (search-forward "~" nil t) | |
100 | (setq ch (following-char)) | |
101 | (if (or (= ch ?\n) (= ch ?~)) (delete-char -1))) | |
102 | ||
103 | ;; "^zW...\n" -> Chinese GB2312 | |
104 | ;; "~{...~}" -> Chinese GB2312 | |
105 | (goto-char (point-min)) | |
106 | (setq beg nil) | |
4ed46869 | 107 | (while (re-search-forward hz/zw-start-gb nil t) |
8f3969f8 KH |
108 | (setq pos (match-beginning 0) |
109 | ch (char-after pos)) | |
110 | ;; Record the first position to start conversion. | |
111 | (or beg (setq beg pos)) | |
112 | (end-of-line) | |
113 | (setq end (point)) | |
114 | (if (>= ch 128) ; 8bit GB2312 | |
115 | nil | |
116 | (goto-char pos) | |
117 | (delete-char 2) | |
118 | (setq end (- end 2)) | |
119 | (if (= ch ?z) ; ZW -> euc-china | |
120 | (progn | |
121 | (translate-region (point) end hz-set-msb-table) | |
122 | (goto-char end)) | |
123 | (if (search-forward hz-ascii-designnation | |
124 | (if decode-hz-line-continuation nil end) | |
125 | t) | |
126 | (delete-char -2)) | |
127 | (setq end (point)) | |
128 | (translate-region pos (point) hz-set-msb-table)))) | |
129 | (if beg | |
130 | (decode-coding-region beg end 'euc-china))) | |
4ed46869 KH |
131 | (- (point-max) (point-min))))) |
132 | ||
133 | ;;;###autoload | |
134 | (defun decode-hz-buffer () | |
135 | "Decode HZ/ZW encoded text in the current buffer." | |
136 | (interactive) | |
137 | (decode-hz-region (point-min) (point-max))) | |
138 | ||
139 | ;;;###autoload | |
140 | (defun encode-hz-region (beg end) | |
141 | "Encode the text in the current region to HZ. | |
142 | Return the length of resulting text." | |
143 | (interactive "r") | |
144 | (save-excursion | |
145 | (save-restriction | |
146 | (narrow-to-region beg end) | |
147 | ||
148 | ;; "~" -> "~~" | |
149 | (goto-char (point-min)) | |
150 | (while (search-forward "~" nil t) (insert ?~)) | |
151 | ||
152 | ;; Chinese GB2312 -> "~{...~}" | |
153 | (goto-char (point-min)) | |
154 | (if (re-search-forward "\\cc" nil t) | |
39e0da62 | 155 | (let (pos) |
4ed46869 | 156 | (goto-char (setq pos (match-beginning 0))) |
5dd921df | 157 | (encode-coding-region pos (point-max) 'iso-2022-7bit) |
4ed46869 KH |
158 | (goto-char pos) |
159 | (while (search-forward iso2022-gb-designation nil t) | |
160 | (delete-char -3) | |
161 | (insert hz-gb-designnation)) | |
162 | (goto-char pos) | |
163 | (while (search-forward iso2022-ascii-designation nil t) | |
164 | (delete-char -3) | |
165 | (insert hz-ascii-designnation)))) | |
166 | (- (point-max) (point-min))))) | |
167 | ||
168 | ;;;###autoload | |
169 | (defun encode-hz-buffer () | |
170 | "Encode the text in the current buffer to HZ." | |
171 | (interactive) | |
172 | (encode-hz-region (point-min) (point-max))) | |
173 | ||
0374bae4 DL |
174 | ;;;###autoload |
175 | (defun post-read-decode-hz (len) | |
176 | (let ((pos (point)) | |
177 | (buffer-modified-p (buffer-modified-p)) | |
178 | last-coding-system-used) | |
179 | (prog1 | |
180 | (decode-hz-region pos (+ pos len)) | |
181 | (set-buffer-modified-p buffer-modified-p)))) | |
64b4e1f1 | 182 | |
0374bae4 DL |
183 | ;;;###autoload |
184 | (defun pre-write-encode-hz (from to) | |
185 | (let ((buf (current-buffer))) | |
186 | (set-buffer (generate-new-buffer " *temp*")) | |
187 | (if (stringp from) | |
188 | (insert from) | |
189 | (insert-buffer-substring buf from to)) | |
190 | (let (last-coding-system-used) | |
191 | (encode-hz-region 1 (point-max))) | |
192 | nil)) | |
4ed46869 | 193 | ;; |
650e8505 | 194 | (provide 'china-util) |
4ed46869 | 195 | |
cbee283d | 196 | ;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836 |
4ed46869 | 197 | ;;; china-util.el ends here |