Commit | Line | Data |
---|---|---|
8d138742 CE |
1 | /* |
2 | * Shift_JIS <=> Unicode translate functions. | |
3 | * by Yu Kobayashi <mail@yukoba.jp> | |
4 | * Modification for JIS X 0208:1997 Annex 1 implementation | |
5 | * by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org> | |
6 | * | |
7 | */ | |
8 | ||
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "unicode.h"
12 | ||
13 | #define SJIS_DEBUG 0 | |
14 | ||
15 | extern const unicode_char* jisx0208_to_uni_tbls[]; | |
16 | extern const unsigned* uni_to_jisx0208_tbls[]; | |
17 | ||
18 | static unicode_char *c2u(const struct unicode_info *u, | |
19 | const char *sjis_str, int *err) | |
20 | { | |
21 | unicode_char *uc=0; | |
22 | unsigned char hi=0, lo=0; | |
23 | int len=0; | |
24 | int i=0; | |
25 | int pos=0; | |
26 | ||
27 | if(err) *err = -1; | |
28 | ||
29 | len = strlen(sjis_str); | |
30 | uc = (unicode_char*)malloc((len+1) * sizeof(unicode_char) *2); | |
31 | ||
32 | if (!uc) | |
33 | return NULL; | |
34 | ||
35 | for(i=0; i<len;) { | |
36 | /* 2 Characters replaced by JIS X 0201 */ | |
37 | if (sjis_str[i] == 0x5C) /* YEN SIGN */ | |
38 | { | |
39 | uc[pos++] = (unicode_char)0x00A5; | |
40 | i++; | |
41 | } | |
42 | else if (sjis_str[i] == 0x7E) /* OVERLINE */ | |
43 | { | |
44 | uc[pos++] = (unicode_char)0x203E; | |
45 | i++; | |
46 | } | |
47 | /* Other JIS X 0201 GL */ | |
48 | else if ((unsigned)sjis_str[i] < 0x80) | |
49 | { | |
50 | uc[pos++] = (unicode_char)sjis_str[i]; | |
51 | i++; | |
52 | } | |
53 | /* JIS X 0201 GR */ | |
54 | else if ((unsigned char)sjis_str[i] >= 0xa1 | |
55 | && (unsigned char)sjis_str[i] <= 0xdf) | |
56 | { | |
57 | lo = (unsigned char)sjis_str[i]; | |
58 | ||
59 | /* SHIFT_JIS -> JIS */ | |
60 | lo -= 0x80; | |
61 | ||
62 | uc[pos++] = (unicode_char)(lo+(unsigned)0xff40); | |
63 | i++; | |
64 | } | |
65 | /* 2 byte characters */ | |
66 | else if ((((unsigned char)sjis_str[i] >= 0x81 | |
67 | && (unsigned char)sjis_str[i] <= 0x9F) | |
68 | || ((unsigned char)sjis_str[i] >= 0xE0 | |
69 | && (unsigned char)sjis_str[i] <= 0xEF)) | |
70 | && (((unsigned char)sjis_str[i+1] >= 0x40 | |
71 | && (unsigned char)sjis_str[i+1] <= 0x7E) | |
72 | || ((unsigned char)sjis_str[i+1] >= 0x80 | |
73 | && (unsigned char)sjis_str[i+1] <= 0xFC))) | |
74 | { | |
75 | hi = (unsigned char)sjis_str[i]; | |
76 | lo = (unsigned char)sjis_str[i+1]; | |
77 | ||
78 | /* SJIS -> JIS */ | |
79 | if( lo < 0x9f ) { | |
80 | if( hi < 0xa0 ) { | |
81 | hi -= 0x81; | |
82 | hi *= 2; | |
83 | hi += 0x21; | |
84 | } else { | |
85 | hi -= 0xe0; | |
86 | hi *= 2; | |
87 | hi += 0x5f; | |
88 | } | |
89 | if( lo > 0x7f ) | |
90 | --lo; | |
91 | lo -= 0x1f; | |
92 | } else { | |
93 | if( hi < 0xa0 ) { | |
94 | hi -= 0x81; | |
95 | hi *= 2; | |
96 | hi += 0x22; | |
97 | } else { | |
98 | hi -= 0xe0; | |
99 | hi *= 2; | |
100 | hi += 0x60; | |
101 | } | |
102 | lo -= 0x7e; | |
103 | } | |
104 | ||
105 | /* JIS -> Unicode */ | |
106 | if (jisx0208_to_uni_tbls[hi-0x21] != NULL | |
107 | && jisx0208_to_uni_tbls[hi-0x21][lo-0x21] != | |
108 | (unicode_char)0x003F) | |
109 | uc[pos++] = jisx0208_to_uni_tbls[hi-0x21][lo-0x21]; | |
110 | else if (err) | |
111 | { | |
112 | *err = i; | |
113 | free(uc); | |
114 | return NULL; | |
115 | } | |
116 | else | |
117 | uc[pos++] = (unicode_char)0xFFFD; | |
118 | ||
119 | i+=2; | |
120 | } | |
121 | else if (err) | |
122 | { | |
123 | *err = i; | |
124 | free(uc); | |
125 | return NULL; | |
126 | } | |
127 | else | |
128 | { | |
129 | uc[pos++] = (unicode_char)0xFFFD; | |
130 | i++; | |
131 | } | |
132 | } | |
133 | uc[pos++] = 0; | |
134 | ||
135 | return uc; | |
136 | } | |
137 | ||
138 | static char *u2c(const struct unicode_info *u, | |
139 | const unicode_char *str, int *err) | |
140 | { | |
141 | int i=0; | |
142 | int pos=0; | |
143 | int len=0; | |
144 | char* s; | |
145 | ||
146 | if(err) *err = -1; | |
147 | ||
148 | while(str[len]) | |
149 | len++; | |
150 | s = malloc((len+1)*2); | |
151 | ||
152 | if (!s) | |
153 | return NULL; | |
154 | ||
155 | for(i=0; str[i]; i++) { | |
156 | int jis_char = 0; | |
157 | unsigned char hi=0, lo=0; | |
158 | ||
159 | unsigned char str_i_high=str[i] >> 8; | |
160 | ||
161 | /* SHIFT_JIS is mapped inside BMP range */ | |
162 | if (str[i] >= (unicode_char)0x10000) | |
163 | { | |
164 | if (err) | |
165 | { | |
166 | *err = i; | |
167 | free(s); | |
168 | return NULL; | |
169 | } | |
170 | s[pos++] = '?'; | |
171 | } | |
172 | /* JIS X 0201 GL or US-ASCII */ | |
173 | else if (str[i] < (unicode_char)0x0080) | |
174 | s[pos++] = (char)str[i]; | |
175 | /* 2 characters replaced by JIS X 0201 */ | |
176 | else if (str[i] == 0x00A5) /* YEN SIGN */ | |
177 | s[pos++] = (char)0x5C; | |
178 | else if (str[i] == 0x203E) /* OVERLINE */ | |
179 | s[pos++] = (char)0x7E; | |
180 | /* JIS X 0201 GR */ | |
181 | else if (str[i] >= (unicode_char)0xff61 | |
182 | && str[i] <= (unicode_char)0xff9f) | |
183 | { | |
184 | lo = (unsigned char)(str[i] - (unsigned)0xff40); | |
185 | /* JIS -> SHIFT_JIS */ | |
186 | lo += 0x80; | |
187 | s[pos++] = lo; | |
188 | } | |
189 | /* Not found */ | |
190 | else if (uni_to_jisx0208_tbls[str_i_high] == NULL | |
191 | || uni_to_jisx0208_tbls[str_i_high][str[i] & 0xff] == '?') | |
192 | { | |
193 | if (err) | |
194 | { | |
195 | *err = i; | |
196 | free(s); | |
197 | return NULL; | |
198 | } | |
199 | s[pos++] = '?'; | |
200 | } | |
201 | /* 2 byte characters */ | |
202 | else | |
203 | { | |
204 | jis_char = uni_to_jisx0208_tbls[str_i_high][str[i] & 0xff]; | |
205 | hi = jis_char >> 8; | |
206 | lo = jis_char & 0xff; | |
207 | ||
208 | /* JIS -> SJIS */ | |
209 | if( ( hi % 2 ) == 0 ) | |
210 | lo += 0x7d; | |
211 | else | |
212 | lo += 0x1f; | |
213 | ||
214 | if( lo > 0x7e ) | |
215 | ++ lo; | |
216 | ||
217 | if( hi < 0x5f ) { | |
218 | ++hi; | |
219 | hi /= 2; | |
220 | hi += 0x70; | |
221 | } else { | |
222 | ++hi; | |
223 | hi /= 2; | |
224 | hi += 0xb0; | |
225 | } | |
226 | s[pos++] = hi; | |
227 | s[pos++] = lo; | |
228 | } | |
229 | } | |
230 | s[pos] = 0; | |
231 | ||
232 | return s; | |
233 | } | |
234 | ||
235 | static char *toupper_func(const struct unicode_info *u, | |
236 | const char *cp, int *ip) | |
237 | { | |
238 | unicode_char *uc = c2u(u, cp, ip); | |
239 | char *s; | |
240 | size_t i; | |
241 | ||
242 | if (!uc) | |
243 | return (NULL); | |
244 | ||
245 | for (i=0; uc[i] && i<10000; i++) { | |
246 | if ((unicode_char)'a' <= uc[i] && uc[i] <= (unicode_char)'z') | |
247 | uc[i] = uc[i] - ((unicode_char)'a' - (unicode_char)'A'); | |
248 | } | |
249 | ||
250 | s = u2c(u, uc, NULL); | |
251 | free(uc); | |
252 | return (s); | |
253 | } | |
254 | ||
255 | static char *tolower_func(const struct unicode_info *u, | |
256 | const char *cp, int *ip) | |
257 | { | |
258 | unicode_char *uc = c2u(u, cp, ip); | |
259 | char *s; | |
260 | size_t i; | |
261 | ||
262 | if (!uc) | |
263 | return (NULL); | |
264 | ||
265 | for (i=0; uc[i]; i++) { | |
266 | if ((unicode_char)'A' <= uc[i] && uc[i] <= (unicode_char)'Z') | |
267 | uc[i] = uc[i] + ((unicode_char)'a' - (unicode_char)'A'); | |
268 | } | |
269 | ||
270 | s = u2c(u, uc, NULL); | |
271 | free(uc); | |
272 | ||
273 | return (s); | |
274 | } | |
275 | ||
276 | ||
277 | static char *totitle_func(const struct unicode_info *u, | |
278 | const char *cp, int *ip) | |
279 | { | |
280 | unicode_char *uc = c2u(u, cp, ip); | |
281 | char *s; | |
282 | ||
283 | if (!uc) | |
284 | return (NULL); | |
285 | ||
286 | /* Uh, sorry, what's "title" char? */ | |
287 | /* | |
288 | * for (i=0; uc[i]; i++) | |
289 | * uc[i] = unicode_tc(uc[i]); | |
290 | */ | |
291 | ||
292 | s = u2c(u, uc, NULL); | |
293 | free(uc); | |
294 | return (s); | |
295 | } | |
296 | ||
297 | extern const struct unicode_info unicode_UTF8; | |
298 | ||
299 | const struct unicode_info unicode_SHIFT_JIS = { | |
300 | "SHIFT_JIS", | |
301 | UNICODE_MB | UNICODE_REPLACEABLE | | |
302 | UNICODE_HEADER_BASE64 | UNICODE_BODY_BASE64, | |
303 | c2u, | |
304 | u2c, | |
305 | toupper_func, | |
306 | tolower_func, | |
307 | totitle_func, | |
308 | &unicode_UTF8 | |
309 | }; | |
310 |