Commit | Line | Data |
---|---|---|
64b4e1f1 | 1 | ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*- |
4ed46869 | 2 | |
acaf905b | 3 | ;; Copyright (C) 1995, 2001-2012 Free Software Foundation, Inc. |
7976eda0 | 4 | ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, |
5df4f04c | 5 | ;; 2005, 2006, 2007, 2008, 2009, 2010, 2011 |
eaa61218 KH |
6 | ;; National Institute of Advanced Industrial Science and Technology (AIST) |
7 | ;; Registration Number H14PRO021 | |
8f924df7 KH |
8 | ;; Copyright (C) 2003 |
9 | ;; National Institute of Advanced Industrial Science and Technology (AIST) | |
10 | ;; Registration Number H13PRO009 | |
4ed46869 KH |
11 | |
12 | ;; Keywords: mule, multilingual, Chinese | |
13 | ||
14 | ;; This file is part of GNU Emacs. | |
15 | ||
4936186e | 16 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
4ed46869 | 17 | ;; it under the terms of the GNU General Public License as published by |
4936186e GM |
18 | ;; the Free Software Foundation, either version 3 of the License, or |
19 | ;; (at your option) any later version. | |
4ed46869 KH |
20 | |
21 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
22 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
23 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
24 | ;; GNU General Public License for more details. | |
25 | ||
26 | ;; You should have received a copy of the GNU General Public License | |
4936186e | 27 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
4ed46869 | 28 | |
60370d40 PJ |
29 | ;;; Commentary: |
30 | ||
4ed46869 KH |
31 | ;;; Code: |
32 | ||
64b4e1f1 | 33 | ;; Hz/ZW/EUC-TW encoding stuff |
4ed46869 KH |
34 | |
35 | ;; HZ is an encoding method for the Chinese character set GB2312 that is | |
36 | ;; widely used on the Internet. It is very similar to the 7-bit environment of | |
37 | ;; ISO-2022. The difference is that HZ uses the sequences "~{" and | |
38 | ;; "~}" for designating GB2312 and ASCII respectively; hence, it | |
39 | ;; doesn't use the ESC (0x1B) code. | |
40 | ||
41 | ;; ZW is another encoding method for the Chinese character set GB2312. It | |
42 | ;; encodes Chinese characters line by line by starting each line with | |
43 | ;; the sequence "zW". Like HZ, it uses only 7-bit codes. | |
44 | ||
64b4e1f1 WL |
45 | ;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is |
46 | ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with | |
47 | ;; a single shift escape followed by three bytes: the first gives the | |
48 | ;; plane, the second and third the character code. Note that characters | |
49 | ;; of plane 1 are (redundantly) accessible with a single shift escape | |
50 | ;; also. | |
51 | ||
4ed46869 KH |
(defvar iso2022-gb-designation "\e$A"
  "ISO-2022 escape sequence to designate GB2312.")

(defvar hz-gb-designnation "~{"
  "HZ escape sequence to designate GB2312.")

(defvar iso2022-ascii-designation "\e(B"
  "ISO-2022 escape sequence to designate ASCII.")

(defvar hz-ascii-designnation "~}"
  "HZ escape sequence to designate ASCII.")

(defvar zw-start-gb "^zW"
  "Regexp of ZW sequence to start GB2312.")

(defvar hz/zw-start-gb
  (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]")
  "Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
It matches the HZ designation of GB2312, the ZW line prefix, or
any single non-ASCII character.")

(defvar decode-hz-line-continuation nil
  "Non-nil means a GB2312 run opened by \"~{\" may extend past its line.
When nil, `decode-hz-region' looks for the closing \"~}\" only up
to the end of the line that contains the opening \"~{\".")
68 | ||
(defconst hz-set-msb-table
  ;; Translation table, built once at compile time, indexed by the
  ;; character codes 0..126.  Codes 0..32 map to themselves; codes
  ;; 33..126 map to the `eight-bit' character for the same code plus
  ;; 128.
  (eval-when-compile
    (let (entries)
      (dotimes (code 127)
        (push (if (< code 33)
                  code
                (decode-char 'eight-bit (+ code 128)))
              entries))
      (apply 'string (nreverse entries)))))
8f3969f8 | 80 | |
4ed46869 KH |
81 | ;;;###autoload |
(defun decode-hz-region (beg end)
  "Decode HZ/ZW encoded text in the current region.
BEG and END delimit the region.  The text is first rewritten in
place as `euc-china' bytes and then decoded with that coding
system.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (let (pos ch)
        (narrow-to-region beg end)

        ;; We, at first, convert HZ/ZW to `euc-china',
        ;; then decode it.

        ;; "~\n" -> "\n", "~~" -> "~"
        (goto-char (point-min))
        (while (search-forward "~" nil t)
          (setq ch (following-char))
          (cond ((= ch ?~)
                 ;; Drop the escaping tilde, then step over the tilde
                 ;; it protects.  Without the step the surviving
                 ;; tilde would be scanned again as the start of a
                 ;; new escape, so "~~~~" would collapse to "~" and
                 ;; "~~\n" would lose its tilde.
                 (delete-char -1)
                 (forward-char 1))
                ((= ch ?\n)
                 (delete-char -1))))

        ;; NOTE(review): this pass also runs over the inside of
        ;; "~{...~}" runs, so a GB2312 character whose second byte is
        ;; "~" directly before "~}" is still damaged, and a literal
        ;; "~{" obtained from "~~{" is still taken as a designation
        ;; below.  Fixing those needs a single-pass decoder.

        ;; "^zW...\n" -> Chinese GB2312
        ;; "~{...~}"  -> Chinese GB2312
        (goto-char (point-min))
        ;; From here on BEG and END are reused: BEG becomes the first
        ;; position needing decoding (nil if none), END the end of
        ;; the most recently converted stretch.
        (setq beg nil)
        (while (re-search-forward hz/zw-start-gb nil t)
          (setq pos (match-beginning 0)
                ch (char-after pos))
          ;; Record the first position to start conversion.
          (or beg (setq beg pos))
          (end-of-line)
          (setq end (point))
          ;; A match on a non-ASCII character is 8bit GB2312 already:
          ;; nothing to rewrite, the rest of the line is skipped.
          (when (< ch 128)
            (goto-char pos)
            ;; Remove the two-character introducer ("~{" or "zW").
            (delete-char 2)
            (setq end (- end 2))
            (if (= ch ?z)
                ;; ZW -> euc-china: the rest of the line is GB2312.
                (progn
                  (translate-region (point) end hz-set-msb-table)
                  (goto-char end))
              ;; HZ -> euc-china: the run ends at "~}", searched for
              ;; on this line only unless line continuation is on.
              (if (search-forward hz-ascii-designnation
                                  (if decode-hz-line-continuation nil end)
                                  t)
                  (delete-char -2))
              (setq end (point))
              (translate-region pos (point) hz-set-msb-table))))
        (if beg
            (decode-coding-region beg end 'euc-china)))
      (- (point-max) (point-min)))))
129 | ||
130 | ;;;###autoload | |
(defun decode-hz-buffer ()
  "Decode HZ/ZW encoded text in the current buffer.
This applies `decode-hz-region' to the accessible portion of the
buffer and returns its value."
  (interactive)
  (let ((start (point-min))
        (stop (point-max)))
    (decode-hz-region start stop)))
135 | ||
136 | ;;;###autoload | |
(defun encode-hz-region (beg end)
  "Encode the text in the current region to HZ.
BEG and END delimit the region.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (narrow-to-region beg end)

      ;; Escape every literal tilde first: "~" -> "~~".
      (goto-char (point-min))
      (while (search-forward "~" nil t)
        (insert ?~))

      ;; Chinese GB2312 -> "~{...~}"
      ;; Everything from the first Chinese character to the end is
      ;; encoded as `iso-2022-7bit'; the ISO-2022 designations are
      ;; then swapped for their HZ counterparts.
      (goto-char (point-min))
      (when (re-search-forward "\\cc" nil t)
        (let ((start (match-beginning 0)))
          (goto-char start)
          (encode-coding-region start (point-max) 'iso-2022-7bit)
          (dolist (swap (list (cons iso2022-gb-designation
                                    hz-gb-designnation)
                              (cons iso2022-ascii-designation
                                    hz-ascii-designnation)))
            (goto-char start)
            (while (search-forward (car swap) nil t)
              ;; Each ISO-2022 designation is three characters long.
              (delete-char -3)
              (insert (cdr swap))))))
      (- (point-max) (point-min)))))
164 | ||
165 | ;;;###autoload | |
(defun encode-hz-buffer ()
  "Encode the text in the current buffer to HZ.
This applies `encode-hz-region' to the accessible portion of the
buffer and returns its value."
  (interactive)
  (let ((start (point-min))
        (stop (point-max)))
    (encode-hz-region start stop)))
170 | ||
0374bae4 DL |
171 | ;;;###autoload |
(defun post-read-decode-hz (len)
  "Decode the LEN characters of HZ/ZW text that start at point.
The buffer's modified flag is put back to what it was before the
call.  Return the value of `decode-hz-region', the length of the
resulting text."
  ;; NOTE(review): presumably installed as a coding system's
  ;; post-read-conversion function -- confirm against the
  ;; coding-system definition.
  (let ((start (point))
        (was-modified (buffer-modified-p))
        ;; Bound so the decoding cannot change the caller's value.
        last-coding-system-used
        decoded-length)
    (setq decoded-length (decode-hz-region start (+ start len)))
    (set-buffer-modified-p was-modified)
    decoded-length))
64b4e1f1 | 179 | |
0374bae4 DL |
180 | ;;;###autoload |
(defun pre-write-encode-hz (from to)
  "HZ-encode a copy of the text to be written, in a new buffer.
If FROM is a string, it is the text to encode; otherwise FROM and
TO are positions in the current buffer delimiting that text.  The
copy is made in a freshly generated buffer named \" *temp*\",
which is left current on return.  Return nil."
  ;; NOTE(review): presumably installed as a coding system's
  ;; pre-write-conversion function -- confirm against the
  ;; coding-system definition.
  (let ((source (current-buffer))
        (work (generate-new-buffer " *temp*")))
    (set-buffer work)
    (if (stringp from)
        (insert from)
      (insert-buffer-substring source from to))
    ;; Bound so the encoding cannot change the caller's value.
    (let (last-coding-system-used)
      (encode-hz-region 1 (point-max)))
    nil))
4ed46869 | 190 | ;; |
650e8505 | 191 | (provide 'china-util) |
4ed46869 | 192 | |
4ed46869 | 193 | ;;; china-util.el ends here |