Commit | Line | Data |
---|---|---|
64b4e1f1 | 1 | ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*- |
4ed46869 | 2 | |
5df4f04c | 3 | ;; Copyright (C) 1995, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 |
eaa61218 | 4 | ;; Free Software Foundation, Inc. |
7976eda0 | 5 | ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, |
5df4f04c | 6 | ;; 2005, 2006, 2007, 2008, 2009, 2010, 2011 |
eaa61218 KH |
7 | ;; National Institute of Advanced Industrial Science and Technology (AIST) |
8 | ;; Registration Number H14PRO021 | |
8f924df7 KH |
9 | ;; Copyright (C) 2003 |
10 | ;; National Institute of Advanced Industrial Science and Technology (AIST) | |
11 | ;; Registration Number H13PRO009 | |
4ed46869 KH |
12 | |
13 | ;; Keywords: mule, multilingual, Chinese | |
14 | ||
15 | ;; This file is part of GNU Emacs. | |
16 | ||
4936186e | 17 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
4ed46869 | 18 | ;; it under the terms of the GNU General Public License as published by |
4936186e GM |
19 | ;; the Free Software Foundation, either version 3 of the License, or |
20 | ;; (at your option) any later version. | |
4ed46869 KH |
21 | |
22 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
23 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
24 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
25 | ;; GNU General Public License for more details. | |
26 | ||
27 | ;; You should have received a copy of the GNU General Public License | |
4936186e | 28 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
4ed46869 | 29 | |
60370d40 PJ |
30 | ;;; Commentary: |
31 | ||
4ed46869 KH |
32 | ;;; Code: |
33 | ||
64b4e1f1 | 34 | ;; Hz/ZW/EUC-TW encoding stuff |
4ed46869 KH |
35 | |
36 | ;; HZ is an encoding method for the Chinese character set GB2312 that | |
37 | ;; is widely used on the Internet. It is very similar to the 7-bit | |
38 | ;; environment of ISO-2022. The difference is that HZ uses the sequences | |
39 | ;; "~{" and "~}" to designate GB2312 and ASCII respectively; hence, it | |
40 | ;; doesn't use the ESC (0x1B) code. | |
41 | ||
42 | ;; ZW is another encoding method for the Chinese character set GB2312. It | |
43 | ;; encodes Chinese characters line by line, starting each such line with | |
44 | ;; the sequence "zW". Like HZ, it uses only 7-bit codes. | |
45 | ||
64b4e1f1 WL |
46 | ;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is |
47 | ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with | |
48 | ;; a single shift escape followed by three bytes: the first gives the | |
49 | ;; plane, the second and third the character code. Note that characters | |
50 | ;; of plane 1 are (redundantly) accessible with a single shift escape | |
51 | ;; also. | |
52 | ||
4ed46869 KH |
;; Escape sequences and regexps shared by the HZ/ZW encoder and decoder.
;; The "designnation" spelling of two of the names below is historical;
;; it is kept because other code refers to these variables by name.

(defvar iso2022-gb-designation "\e$A"
  "ISO-2022 escape sequence that designates GB2312.")

(defvar hz-gb-designnation "~{"
  "HZ escape sequence that designates GB2312.")

(defvar iso2022-ascii-designation "\e(B"
  "ISO-2022 escape sequence that designates ASCII.")

(defvar hz-ascii-designnation "~}"
  "HZ escape sequence that designates ASCII.")

(defvar zw-start-gb "^zW"
  "Regexp matching the ZW sequence that starts GB2312 text on a line.")

(defvar hz/zw-start-gb
  (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]")
  "Regexp for the start of GB2312 text in a mixture of HZ and ZW.
It matches the HZ GB2312 designation, the ZW line prefix, or any
single non-ASCII character.")

(defvar decode-hz-line-continuation nil
  "Non-nil means honor the line continuation convention of HZ.
When this is non-nil, `decode-hz-region' looks for the HZ sequence
that designates ASCII beyond the end of the line on which the GB2312
text started; when nil, that search stops at the end of the line.")
69 | ||
;; Translation table for `translate-region' that sets the most
;; significant bit of printable 7-bit codes.  Entries 0..32 map to
;; themselves; entries 33..126 map to the eight-bit character whose
;; code is the index plus 128.  The table has 127 entries.
(defconst hz-set-msb-table
  (eval-when-compile
    (apply #'string
           (mapcar (lambda (code)
                     (if (< code 33)
                         code
                       (decode-char 'eight-bit (+ code 128))))
                   (number-sequence 0 126)))))
8f3969f8 | 81 | |
4ed46869 KH |
;;;###autoload
(defun decode-hz-region (beg end)
  "Decode HZ/ZW encoded text in the current region.
BEG and END delimit the region.  The text is first rewritten in
place as `euc-china' and is then decoded with that coding system.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (let (pos ch)
        (narrow-to-region beg end)

        ;; We, at first, convert HZ/ZW to `euc-china',
        ;; then decode it.

        ;; "~\n" -> "\n", "~~" -> "~"
        (goto-char (point-min))
        (while (search-forward "~" nil t)
          (setq ch (following-char))
          (cond ((= ch ?\n)
                 (delete-char -1))
                ((= ch ?~)
                 (delete-char -1)
                 ;; Step over the tilde that survives.  Without this it
                 ;; would be scanned again as the start of a new escape,
                 ;; so a run of tildes collapsed to a single one
                 ;; ("~~~~" gave "~") and "~~\n" lost its literal tilde.
                 (forward-char 1))))

        ;; "^zW...\n" -> Chinese GB2312
        ;; "~{...~}" -> Chinese GB2312
        (goto-char (point-min))
        ;; From here on BEG and END are reused to track the span that
        ;; has to be handed to `decode-coding-region'.
        (setq beg nil)
        (while (re-search-forward hz/zw-start-gb nil t)
          (setq pos (match-beginning 0)
                ch (char-after pos))
          ;; Record the first position to start conversion.
          (or beg (setq beg pos))
          (end-of-line)
          (setq end (point))
          (if (>= ch 128)               ; 8bit GB2312
              nil
            (goto-char pos)
            ;; Remove the two-character introducer ("~{" or "zW").
            (delete-char 2)
            (setq end (- end 2))
            (if (= ch ?z)               ; ZW -> euc-china
                (progn
                  (translate-region (point) end hz-set-msb-table)
                  (goto-char end))
              ;; HZ: the GB2312 text runs up to "~}".  The search is
              ;; limited to the current line unless
              ;; `decode-hz-line-continuation' is non-nil.
              (if (search-forward hz-ascii-designnation
                                  (if decode-hz-line-continuation nil end)
                                  t)
                  (delete-char -2))
              (setq end (point))
              (translate-region pos (point) hz-set-msb-table))))
        (if beg
            (decode-coding-region beg end 'euc-china)))
      (- (point-max) (point-min)))))
130 | ||
;;;###autoload
(defun decode-hz-buffer ()
  "Decode the HZ/ZW encoded text of the current buffer.
This applies `decode-hz-region' to the span from `point-min' to
`point-max' and returns its value."
  (interactive)
  (let ((start (point-min))
        (finish (point-max)))
    (decode-hz-region start finish)))
136 | ||
;;;###autoload
(defun encode-hz-region (beg end)
  "Encode the text between BEG and END to HZ.
Every tilde is doubled first.  If the region contains a character
of category `c', the text from the first such character to the end
of the region is encoded as `iso-2022-7bit', and the ISO-2022
designations of GB2312 and ASCII are then replaced by their HZ
counterparts.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (narrow-to-region beg end)

      ;; "~" -> "~~"
      (goto-char (point-min))
      (while (search-forward "~" nil t)
        (insert ?~))

      ;; Chinese GB2312 -> "~{...~}"
      (goto-char (point-min))
      (when (re-search-forward "\\cc" nil t)
        (let ((start (match-beginning 0)))
          (goto-char start)
          (encode-coding-region start (point-max) 'iso-2022-7bit)
          ;; Swap each ISO-2022 designation for the HZ one, rescanning
          ;; from START for every pair.
          (dolist (pair (list (cons iso2022-gb-designation
                                    hz-gb-designnation)
                              (cons iso2022-ascii-designation
                                    hz-ascii-designnation)))
            (goto-char start)
            (while (search-forward (car pair) nil t)
              (delete-region (match-beginning 0) (match-end 0))
              (insert (cdr pair))))))
      (- (point-max) (point-min)))))
165 | ||
;;;###autoload
(defun encode-hz-buffer ()
  "Encode the text of the current buffer to HZ.
This applies `encode-hz-region' to the span from `point-min' to
`point-max' and returns its value."
  (interactive)
  (let ((start (point-min))
        (finish (point-max)))
    (encode-hz-region start finish)))
171 | ||
0374bae4 DL |
;;;###autoload
(defun post-read-decode-hz (len)
  "Decode the LEN characters of HZ/ZW text that follow point.
The buffer's modified flag is put back to the state it had on
entry, and `last-coding-system-used' is bound locally so that the
decoding does not change its outer value.
Return the value of `decode-hz-region'."
  ;; NOTE(review): presumably installed as a post-read conversion
  ;; function; the registration is not in this file.
  (let* ((start (point))
         (was-modified (buffer-modified-p))
         (last-coding-system-used nil)
         (result (decode-hz-region start (+ start len))))
    (set-buffer-modified-p was-modified)
    result))
64b4e1f1 | 180 | |
0374bae4 DL |
;;;###autoload
(defun pre-write-encode-hz (from to)
  "Put an HZ encoded copy of some text into a new buffer.
If FROM is a string, that string is the text; otherwise the text
is the part of the current buffer between FROM and TO.  The copy
is made in a freshly generated buffer named \" *temp*\", which is
left as the current buffer, and is encoded there with
`encode-hz-region'.  `last-coding-system-used' is bound locally
during the encoding.
Return nil."
  ;; NOTE(review): presumably installed as a pre-write conversion
  ;; function; the registration is not in this file.
  (let ((source (current-buffer))
        (work (generate-new-buffer " *temp*")))
    (set-buffer work)
    (cond ((stringp from)
           (insert from))
          (t
           (insert-buffer-substring source from to)))
    (let ((last-coding-system-used nil))
      (encode-hz-region 1 (point-max))))
  nil)
4ed46869 | 191 | ;; |
650e8505 | 192 | (provide 'china-util) |
4ed46869 | 193 | |
cbee283d | 194 | ;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836 |
4ed46869 | 195 | ;;; china-util.el ends here |