Merge branch 'master' into boehm-demers-weiser-gc
[bpt/guile.git] / lib / localcharset.c
CommitLineData
7f8e40b7
NJ
1/* Determine a canonical name for the current locale's character encoding.
2
f240aacb 3 Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc.
7f8e40b7
NJ
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public License along
16 with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19/* Written by Bruno Haible <bruno@clisp.org>. */
20
21#include <config.h>
22
23/* Specification. */
24#include "localcharset.h"
25
26#include <stddef.h>
27#include <stdio.h>
28#include <string.h>
29#include <stdlib.h>
30
f240aacb
LC
31#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
32# define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
33#endif
34
7f8e40b7
NJ
35#if defined _WIN32 || defined __WIN32__
36# define WIN32_NATIVE
37#endif
38
39#if defined __EMX__
40/* Assume EMX program runs on OS/2, even if compiled under DOS. */
41# ifndef OS2
42# define OS2
43# endif
44#endif
45
46#if !defined WIN32_NATIVE
47# if HAVE_LANGINFO_CODESET
48# include <langinfo.h>
49# else
50# if 0 /* see comment below */
51# include <locale.h>
52# endif
53# endif
54# ifdef __CYGWIN__
55# define WIN32_LEAN_AND_MEAN
56# include <windows.h>
57# endif
58#elif defined WIN32_NATIVE
59# define WIN32_LEAN_AND_MEAN
60# include <windows.h>
61#endif
62#if defined OS2
63# define INCL_DOS
64# include <os2.h>
65#endif
66
67#if ENABLE_RELOCATABLE
68# include "relocatable.h"
69#else
70# define relocate(pathname) (pathname)
71#endif
72
73/* Get LIBDIR. */
74#ifndef LIBDIR
75# include "configmake.h"
76#endif
77
78#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
79 /* Win32, Cygwin, OS/2, DOS */
80# define ISSLASH(C) ((C) == '/' || (C) == '\\')
81#endif
82
83#ifndef DIRECTORY_SEPARATOR
84# define DIRECTORY_SEPARATOR '/'
85#endif
86
87#ifndef ISSLASH
88# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
89#endif
90
91#if HAVE_DECL_GETC_UNLOCKED
92# undef getc
93# define getc getc_unlocked
94#endif
95
96/* The following static variable is declared 'volatile' to avoid a
97 possible multithread problem in the function get_charset_aliases. If we
98 are running in a threaded environment, and if two threads initialize
99 'charset_aliases' simultaneously, both will produce the same value,
100 and everything will be ok if the two assignments to 'charset_aliases'
101 are atomic. But I don't know what will happen if the two assignments mix. */
102#if __STDC__ != 1
103# define volatile /* empty */
104#endif
105/* Pointer to the contents of the charset.alias file, if it has already been
106 read, else NULL. Its format is:
107 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
108static const char * volatile charset_aliases;
109
110/* Return a pointer to the contents of the charset.alias file. */
111static const char *
112get_charset_aliases (void)
113{
114 const char *cp;
115
116 cp = charset_aliases;
117 if (cp == NULL)
118 {
f240aacb 119#if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
7f8e40b7
NJ
120 FILE *fp;
121 const char *dir;
122 const char *base = "charset.alias";
123 char *file_name;
124
125 /* Make it possible to override the charset.alias location. This is
126 necessary for running the testsuite before "make install". */
127 dir = getenv ("CHARSETALIASDIR");
128 if (dir == NULL || dir[0] == '\0')
129 dir = relocate (LIBDIR);
130
131 /* Concatenate dir and base into freshly allocated file_name. */
132 {
133 size_t dir_len = strlen (dir);
134 size_t base_len = strlen (base);
135 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
136 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
137 if (file_name != NULL)
138 {
139 memcpy (file_name, dir, dir_len);
140 if (add_slash)
141 file_name[dir_len] = DIRECTORY_SEPARATOR;
142 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
143 }
144 }
145
146 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
147 /* Out of memory or file not found, treat it as empty. */
148 cp = "";
149 else
150 {
151 /* Parse the file's contents. */
152 char *res_ptr = NULL;
153 size_t res_size = 0;
154
155 for (;;)
156 {
157 int c;
158 char buf1[50+1];
159 char buf2[50+1];
160 size_t l1, l2;
161 char *old_res_ptr;
162
163 c = getc (fp);
164 if (c == EOF)
165 break;
166 if (c == '\n' || c == ' ' || c == '\t')
167 continue;
168 if (c == '#')
169 {
170 /* Skip comment, to end of line. */
171 do
172 c = getc (fp);
173 while (!(c == EOF || c == '\n'));
174 if (c == EOF)
175 break;
176 continue;
177 }
178 ungetc (c, fp);
179 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
180 break;
181 l1 = strlen (buf1);
182 l2 = strlen (buf2);
183 old_res_ptr = res_ptr;
184 if (res_size == 0)
185 {
186 res_size = l1 + 1 + l2 + 1;
187 res_ptr = (char *) malloc (res_size + 1);
188 }
189 else
190 {
191 res_size += l1 + 1 + l2 + 1;
192 res_ptr = (char *) realloc (res_ptr, res_size + 1);
193 }
194 if (res_ptr == NULL)
195 {
196 /* Out of memory. */
197 res_size = 0;
198 if (old_res_ptr != NULL)
199 free (old_res_ptr);
200 break;
201 }
202 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
203 strcpy (res_ptr + res_size - (l2 + 1), buf2);
204 }
205 fclose (fp);
206 if (res_size == 0)
207 cp = "";
208 else
209 {
210 *(res_ptr + res_size) = '\0';
211 cp = res_ptr;
212 }
213 }
214
215 if (file_name != NULL)
216 free (file_name);
217
218#else
219
f240aacb
LC
220# if defined DARWIN7
221 /* To avoid the trouble of installing a file that is shared by many
222 GNU packages -- many packaging systems have problems with this --,
223 simply inline the aliases here. */
224 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
225 "ISO8859-2" "\0" "ISO-8859-2" "\0"
226 "ISO8859-4" "\0" "ISO-8859-4" "\0"
227 "ISO8859-5" "\0" "ISO-8859-5" "\0"
228 "ISO8859-7" "\0" "ISO-8859-7" "\0"
229 "ISO8859-9" "\0" "ISO-8859-9" "\0"
230 "ISO8859-13" "\0" "ISO-8859-13" "\0"
231 "ISO8859-15" "\0" "ISO-8859-15" "\0"
232 "KOI8-R" "\0" "KOI8-R" "\0"
233 "KOI8-U" "\0" "KOI8-U" "\0"
234 "CP866" "\0" "CP866" "\0"
235 "CP949" "\0" "CP949" "\0"
236 "CP1131" "\0" "CP1131" "\0"
237 "CP1251" "\0" "CP1251" "\0"
238 "eucCN" "\0" "GB2312" "\0"
239 "GB2312" "\0" "GB2312" "\0"
240 "eucJP" "\0" "EUC-JP" "\0"
241 "eucKR" "\0" "EUC-KR" "\0"
242 "Big5" "\0" "BIG5" "\0"
243 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
244 "GBK" "\0" "GBK" "\0"
245 "GB18030" "\0" "GB18030" "\0"
246 "SJIS" "\0" "SHIFT_JIS" "\0"
247 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
248 "PT154" "\0" "PT154" "\0"
249 /*"ISCII-DEV" "\0" "?" "\0"*/
250 "*" "\0" "UTF-8" "\0";
251# endif
252
7f8e40b7
NJ
253# if defined VMS
254 /* To avoid the troubles of an extra file charset.alias_vms in the
255 sources of many GNU packages, simply inline the aliases here. */
256 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
257 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
258 section 10.7 "Handling Different Character Sets". */
259 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
260 "ISO8859-2" "\0" "ISO-8859-2" "\0"
261 "ISO8859-5" "\0" "ISO-8859-5" "\0"
262 "ISO8859-7" "\0" "ISO-8859-7" "\0"
263 "ISO8859-8" "\0" "ISO-8859-8" "\0"
264 "ISO8859-9" "\0" "ISO-8859-9" "\0"
265 /* Japanese */
266 "eucJP" "\0" "EUC-JP" "\0"
267 "SJIS" "\0" "SHIFT_JIS" "\0"
268 "DECKANJI" "\0" "DEC-KANJI" "\0"
269 "SDECKANJI" "\0" "EUC-JP" "\0"
270 /* Chinese */
271 "eucTW" "\0" "EUC-TW" "\0"
272 "DECHANYU" "\0" "DEC-HANYU" "\0"
273 "DECHANZI" "\0" "GB2312" "\0"
274 /* Korean */
275 "DECKOREAN" "\0" "EUC-KR" "\0";
276# endif
277
278# if defined WIN32_NATIVE || defined __CYGWIN__
279 /* To avoid the troubles of installing a separate file in the same
280 directory as the DLL and of retrieving the DLL's directory at
281 runtime, simply inline the aliases here. */
282
283 cp = "CP936" "\0" "GBK" "\0"
284 "CP1361" "\0" "JOHAB" "\0"
285 "CP20127" "\0" "ASCII" "\0"
286 "CP20866" "\0" "KOI8-R" "\0"
287 "CP20936" "\0" "GB2312" "\0"
288 "CP21866" "\0" "KOI8-RU" "\0"
289 "CP28591" "\0" "ISO-8859-1" "\0"
290 "CP28592" "\0" "ISO-8859-2" "\0"
291 "CP28593" "\0" "ISO-8859-3" "\0"
292 "CP28594" "\0" "ISO-8859-4" "\0"
293 "CP28595" "\0" "ISO-8859-5" "\0"
294 "CP28596" "\0" "ISO-8859-6" "\0"
295 "CP28597" "\0" "ISO-8859-7" "\0"
296 "CP28598" "\0" "ISO-8859-8" "\0"
297 "CP28599" "\0" "ISO-8859-9" "\0"
298 "CP28605" "\0" "ISO-8859-15" "\0"
299 "CP38598" "\0" "ISO-8859-8" "\0"
300 "CP51932" "\0" "EUC-JP" "\0"
301 "CP51936" "\0" "GB2312" "\0"
302 "CP51949" "\0" "EUC-KR" "\0"
303 "CP51950" "\0" "EUC-TW" "\0"
304 "CP54936" "\0" "GB18030" "\0"
305 "CP65001" "\0" "UTF-8" "\0";
306# endif
307#endif
308
309 charset_aliases = cp;
310 }
311
312 return cp;
313}
314
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   The result must not be freed.  Depending on the platform and code path
   it points to a string literal, to the alias table, to a function-local
   static buffer (which later calls overwrite), to the string returned by
   nl_langinfo, or into an environment variable's value.

   If the canonical name cannot be determined, the result is a non-canonical
   name.  The result is never NULL and never the empty string: when no name
   is found at all, "ASCII" is returned.

   Note that on Cygwin and OS/2 an encoding found after the '.' of the
   locale name is returned directly, without going through the alias
   table.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin 2006 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  As long as this is not fixed, return the suffix
     of the locale name from the environment variables (if present) or
     the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      /* Room for "CP", up to 10 decimal digits, and the NUL.  */
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* modifier >= dot here, so the difference is non-negative.
                 Names too long for buf fall through to the codepage.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* Woe32 has a function returning the locale's codepage as a number.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32_NATIVE

  /* Room for "CP", up to 10 decimal digits, and the NUL.  */
  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* Room for "CP", up to 10 decimal digits, and the NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier >= dot here, so the difference is non-negative.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it explicitly so that the argument
             type matches the %u conversion and the output stays within
             the 10 digits that buf provides for.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a list of (alias, canonical) string pairs
     terminated by an empty string, so each step skips two strings.  An
     alias of "*" matches any codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name is the string right after the alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}
499}