Use Gnulib's `sys_stat' module; update Gnulib.
[bpt/guile.git] / lib / localcharset.c
CommitLineData
7f8e40b7
NJ
1/* Determine a canonical name for the current locale's character encoding.
2
f240aacb 3 Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc.
7f8e40b7
NJ
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public License along
16 with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19/* Written by Bruno Haible <bruno@clisp.org>. */
20
21#include <config.h>
22
23/* Specification. */
24#include "localcharset.h"
25
8912421c 26#include <fcntl.h>
7f8e40b7
NJ
27#include <stddef.h>
28#include <stdio.h>
29#include <string.h>
30#include <stdlib.h>
31
f240aacb
LC
32#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
33# define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
34#endif
35
7f8e40b7
NJ
36#if defined _WIN32 || defined __WIN32__
37# define WIN32_NATIVE
38#endif
39
40#if defined __EMX__
41/* Assume EMX program runs on OS/2, even if compiled under DOS. */
42# ifndef OS2
43# define OS2
44# endif
45#endif
46
47#if !defined WIN32_NATIVE
8912421c 48# include <unistd.h>
7f8e40b7
NJ
49# if HAVE_LANGINFO_CODESET
50# include <langinfo.h>
51# else
52# if 0 /* see comment below */
53# include <locale.h>
54# endif
55# endif
56# ifdef __CYGWIN__
57# define WIN32_LEAN_AND_MEAN
58# include <windows.h>
59# endif
60#elif defined WIN32_NATIVE
61# define WIN32_LEAN_AND_MEAN
62# include <windows.h>
63#endif
64#if defined OS2
65# define INCL_DOS
66# include <os2.h>
67#endif
68
69#if ENABLE_RELOCATABLE
70# include "relocatable.h"
71#else
72# define relocate(pathname) (pathname)
73#endif
74
75/* Get LIBDIR. */
76#ifndef LIBDIR
77# include "configmake.h"
78#endif
79
8912421c
LC
80/* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
81#ifndef O_NOFOLLOW
82# define O_NOFOLLOW 0
83#endif
84
7f8e40b7
NJ
85#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
86 /* Win32, Cygwin, OS/2, DOS */
87# define ISSLASH(C) ((C) == '/' || (C) == '\\')
88#endif
89
90#ifndef DIRECTORY_SEPARATOR
91# define DIRECTORY_SEPARATOR '/'
92#endif
93
94#ifndef ISSLASH
95# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
96#endif
97
98#if HAVE_DECL_GETC_UNLOCKED
99# undef getc
100# define getc getc_unlocked
101#endif
102
103/* The following static variable is declared 'volatile' to avoid a
104 possible multithread problem in the function get_charset_aliases. If we
105 are running in a threaded environment, and if two threads initialize
106 'charset_aliases' simultaneously, both will produce the same value,
107 and everything will be ok if the two assignments to 'charset_aliases'
108 are atomic. But I don't know what will happen if the two assignments mix. */
109#if __STDC__ != 1
110# define volatile /* empty */
111#endif
112/* Pointer to the contents of the charset.alias file, if it has already been
113 read, else NULL. Its format is:
114 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
115static const char * volatile charset_aliases;
116
117/* Return a pointer to the contents of the charset.alias file. */
118static const char *
119get_charset_aliases (void)
120{
121 const char *cp;
122
123 cp = charset_aliases;
124 if (cp == NULL)
125 {
f240aacb 126#if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
7f8e40b7
NJ
127 const char *dir;
128 const char *base = "charset.alias";
129 char *file_name;
130
131 /* Make it possible to override the charset.alias location. This is
1cd4fffc 132 necessary for running the testsuite before "make install". */
7f8e40b7
NJ
133 dir = getenv ("CHARSETALIASDIR");
134 if (dir == NULL || dir[0] == '\0')
1cd4fffc 135 dir = relocate (LIBDIR);
7f8e40b7
NJ
136
137 /* Concatenate dir and base into freshly allocated file_name. */
138 {
1cd4fffc
LC
139 size_t dir_len = strlen (dir);
140 size_t base_len = strlen (base);
141 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
142 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
143 if (file_name != NULL)
144 {
145 memcpy (file_name, dir, dir_len);
146 if (add_slash)
147 file_name[dir_len] = DIRECTORY_SEPARATOR;
148 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
149 }
7f8e40b7
NJ
150 }
151
8912421c 152 if (file_name == NULL)
1cd4fffc
LC
153 /* Out of memory. Treat the file as empty. */
154 cp = "";
7f8e40b7 155 else
1cd4fffc
LC
156 {
157 int fd;
158
159 /* Open the file. Reject symbolic links on platforms that support
160 O_NOFOLLOW. This is a security feature. Without it, an attacker
161 could retrieve parts of the contents (namely, the tail of the
162 first line that starts with "* ") of an arbitrary file by placing
163 a symbolic link to that file under the name "charset.alias" in
164 some writable directory and defining the environment variable
165 CHARSETALIASDIR to point to that directory. */
166 fd = open (file_name,
167 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
168 if (fd < 0)
169 /* File not found. Treat it as empty. */
170 cp = "";
171 else
172 {
173 FILE *fp;
174
175 fp = fdopen (fd, "r");
176 if (fp == NULL)
177 {
178 /* Out of memory. Treat the file as empty. */
179 close (fd);
180 cp = "";
181 }
182 else
183 {
184 /* Parse the file's contents. */
185 char *res_ptr = NULL;
186 size_t res_size = 0;
187
188 for (;;)
189 {
190 int c;
191 char buf1[50+1];
192 char buf2[50+1];
193 size_t l1, l2;
194 char *old_res_ptr;
195
196 c = getc (fp);
197 if (c == EOF)
198 break;
199 if (c == '\n' || c == ' ' || c == '\t')
200 continue;
201 if (c == '#')
202 {
203 /* Skip comment, to end of line. */
204 do
205 c = getc (fp);
206 while (!(c == EOF || c == '\n'));
207 if (c == EOF)
208 break;
209 continue;
210 }
211 ungetc (c, fp);
212 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
213 break;
214 l1 = strlen (buf1);
215 l2 = strlen (buf2);
216 old_res_ptr = res_ptr;
217 if (res_size == 0)
218 {
219 res_size = l1 + 1 + l2 + 1;
220 res_ptr = (char *) malloc (res_size + 1);
221 }
222 else
223 {
224 res_size += l1 + 1 + l2 + 1;
225 res_ptr = (char *) realloc (res_ptr, res_size + 1);
226 }
227 if (res_ptr == NULL)
228 {
229 /* Out of memory. */
230 res_size = 0;
231 if (old_res_ptr != NULL)
232 free (old_res_ptr);
233 break;
234 }
235 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
236 strcpy (res_ptr + res_size - (l2 + 1), buf2);
237 }
238 fclose (fp);
239 if (res_size == 0)
240 cp = "";
241 else
242 {
243 *(res_ptr + res_size) = '\0';
244 cp = res_ptr;
245 }
246 }
247 }
248
249 free (file_name);
250 }
7f8e40b7
NJ
251
252#else
253
f240aacb
LC
254# if defined DARWIN7
255 /* To avoid the trouble of installing a file that is shared by many
1cd4fffc
LC
256 GNU packages -- many packaging systems have problems with this --,
257 simply inline the aliases here. */
f240aacb 258 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
1cd4fffc
LC
259 "ISO8859-2" "\0" "ISO-8859-2" "\0"
260 "ISO8859-4" "\0" "ISO-8859-4" "\0"
261 "ISO8859-5" "\0" "ISO-8859-5" "\0"
262 "ISO8859-7" "\0" "ISO-8859-7" "\0"
263 "ISO8859-9" "\0" "ISO-8859-9" "\0"
264 "ISO8859-13" "\0" "ISO-8859-13" "\0"
265 "ISO8859-15" "\0" "ISO-8859-15" "\0"
266 "KOI8-R" "\0" "KOI8-R" "\0"
267 "KOI8-U" "\0" "KOI8-U" "\0"
268 "CP866" "\0" "CP866" "\0"
269 "CP949" "\0" "CP949" "\0"
270 "CP1131" "\0" "CP1131" "\0"
271 "CP1251" "\0" "CP1251" "\0"
272 "eucCN" "\0" "GB2312" "\0"
273 "GB2312" "\0" "GB2312" "\0"
274 "eucJP" "\0" "EUC-JP" "\0"
275 "eucKR" "\0" "EUC-KR" "\0"
276 "Big5" "\0" "BIG5" "\0"
277 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
278 "GBK" "\0" "GBK" "\0"
279 "GB18030" "\0" "GB18030" "\0"
280 "SJIS" "\0" "SHIFT_JIS" "\0"
281 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
282 "PT154" "\0" "PT154" "\0"
283 /*"ISCII-DEV" "\0" "?" "\0"*/
284 "*" "\0" "UTF-8" "\0";
f240aacb
LC
285# endif
286
7f8e40b7
NJ
287# if defined VMS
288 /* To avoid the troubles of an extra file charset.alias_vms in the
1cd4fffc 289 sources of many GNU packages, simply inline the aliases here. */
7f8e40b7 290 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
1cd4fffc
LC
291 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
292 section 10.7 "Handling Different Character Sets". */
7f8e40b7 293 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
1cd4fffc
LC
294 "ISO8859-2" "\0" "ISO-8859-2" "\0"
295 "ISO8859-5" "\0" "ISO-8859-5" "\0"
296 "ISO8859-7" "\0" "ISO-8859-7" "\0"
297 "ISO8859-8" "\0" "ISO-8859-8" "\0"
298 "ISO8859-9" "\0" "ISO-8859-9" "\0"
299 /* Japanese */
300 "eucJP" "\0" "EUC-JP" "\0"
301 "SJIS" "\0" "SHIFT_JIS" "\0"
302 "DECKANJI" "\0" "DEC-KANJI" "\0"
303 "SDECKANJI" "\0" "EUC-JP" "\0"
304 /* Chinese */
305 "eucTW" "\0" "EUC-TW" "\0"
306 "DECHANYU" "\0" "DEC-HANYU" "\0"
307 "DECHANZI" "\0" "GB2312" "\0"
308 /* Korean */
309 "DECKOREAN" "\0" "EUC-KR" "\0";
7f8e40b7
NJ
310# endif
311
312# if defined WIN32_NATIVE || defined __CYGWIN__
313 /* To avoid the troubles of installing a separate file in the same
1cd4fffc
LC
314 directory as the DLL and of retrieving the DLL's directory at
315 runtime, simply inline the aliases here. */
7f8e40b7
NJ
316
317 cp = "CP936" "\0" "GBK" "\0"
1cd4fffc
LC
318 "CP1361" "\0" "JOHAB" "\0"
319 "CP20127" "\0" "ASCII" "\0"
320 "CP20866" "\0" "KOI8-R" "\0"
321 "CP20936" "\0" "GB2312" "\0"
322 "CP21866" "\0" "KOI8-RU" "\0"
323 "CP28591" "\0" "ISO-8859-1" "\0"
324 "CP28592" "\0" "ISO-8859-2" "\0"
325 "CP28593" "\0" "ISO-8859-3" "\0"
326 "CP28594" "\0" "ISO-8859-4" "\0"
327 "CP28595" "\0" "ISO-8859-5" "\0"
328 "CP28596" "\0" "ISO-8859-6" "\0"
329 "CP28597" "\0" "ISO-8859-7" "\0"
330 "CP28598" "\0" "ISO-8859-8" "\0"
331 "CP28599" "\0" "ISO-8859-9" "\0"
332 "CP28605" "\0" "ISO-8859-15" "\0"
333 "CP38598" "\0" "ISO-8859-8" "\0"
334 "CP51932" "\0" "EUC-JP" "\0"
335 "CP51936" "\0" "GB2312" "\0"
336 "CP51949" "\0" "EUC-KR" "\0"
337 "CP51950" "\0" "EUC-TW" "\0"
338 "CP54936" "\0" "GB18030" "\0"
339 "CP65001" "\0" "UTF-8" "\0";
7f8e40b7
NJ
340# endif
341#endif
342
343 charset_aliases = cp;
344 }
345
346 return cp;
347}
348
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed; it is statically allocated.
   If the canonical name cannot be determined, the result is a non-canonical
   name.

   The raw codeset name is obtained in a platform specific way
   (nl_langinfo, the LC_ALL / LC_CTYPE / LANG environment variables, or the
   system codepage) and then looked up in the table returned by
   get_charset_aliases.  On Cygwin and OS/2, an encoding that is spelled out
   after the '.' of the locale name is returned directly, without going
   through the alias table.  The result is never the empty string; "ASCII"
   is returned instead.

   Some code paths return a pointer into a function-local static buffer, so
   a later call may overwrite the text returned by an earlier one.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin 1.5.x does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  As long as this is not fixed, return the suffix
     of the locale name from the environment variables (if present) or
     the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* modifier >= dot here, so the difference is non-negative;
                 the cast makes the unsigned comparison explicit.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* Woe32 has a function returning the locale's codepage as a number:
         GetACP().  This encoding is used by Cygwin, unless the user has set
         the environment variable CYGWIN=codepage:oem (which very few people
         do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32_NATIVE

  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number:
     GetACP().
     When the output goes to a console window, it needs to be provided in
     GetOEMCP() encoding if the console is using a raster font, or in
     GetConsoleOutputCP() encoding if it is using a TrueType font.
     But in GUI programs and for output sent to files and pipes, GetACP()
     encoding is the best bet.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier >= dot here, so the difference is non-negative;
             the cast makes the unsigned comparison explicit.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it explicitly so that the argument
             type matches the "%u" conversion and the output stays within
             the 10 digits that buf has room for.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of ALIAS '\0' CANONICAL '\0'
     pairs ended by an empty string, so each step skips two strings.  An
     alias of "*" matches every codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}