Use Gnulib's `isnan' and `isinf' modules.
[bpt/guile.git] / lib / localcharset.c
CommitLineData
7f8e40b7
NJ
1/* Determine a canonical name for the current locale's character encoding.
2
61cd9dc9 3 Copyright (C) 2000-2006, 2008-2010 Free Software Foundation, Inc.
7f8e40b7
NJ
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public License along
16 with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19/* Written by Bruno Haible <bruno@clisp.org>. */
20
21#include <config.h>
22
23/* Specification. */
24#include "localcharset.h"
25
8912421c 26#include <fcntl.h>
7f8e40b7
NJ
27#include <stddef.h>
28#include <stdio.h>
29#include <string.h>
30#include <stdlib.h>
31
f240aacb
LC
32#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
33# define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
34#endif
35
7f8e40b7
NJ
36#if defined _WIN32 || defined __WIN32__
37# define WIN32_NATIVE
38#endif
39
40#if defined __EMX__
41/* Assume EMX program runs on OS/2, even if compiled under DOS. */
42# ifndef OS2
43# define OS2
44# endif
45#endif
46
47#if !defined WIN32_NATIVE
8912421c 48# include <unistd.h>
7f8e40b7
NJ
49# if HAVE_LANGINFO_CODESET
50# include <langinfo.h>
51# else
52# if 0 /* see comment below */
53# include <locale.h>
54# endif
55# endif
56# ifdef __CYGWIN__
57# define WIN32_LEAN_AND_MEAN
58# include <windows.h>
59# endif
60#elif defined WIN32_NATIVE
61# define WIN32_LEAN_AND_MEAN
62# include <windows.h>
63#endif
64#if defined OS2
65# define INCL_DOS
66# include <os2.h>
67#endif
68
69#if ENABLE_RELOCATABLE
70# include "relocatable.h"
71#else
72# define relocate(pathname) (pathname)
73#endif
74
75/* Get LIBDIR. */
76#ifndef LIBDIR
77# include "configmake.h"
78#endif
79
8912421c
LC
/* O_NOFOLLOW is not available everywhere.  Where it is missing, make it a
   no-op flag so that it can still be OR-ed into the flags given to open().  */
#if !defined O_NOFOLLOW
# define O_NOFOLLOW 0
#endif

/* Win32, Cygwin, OS/2 and DOS accept both the forward slash and the
   backslash as directory separators.  */
#if (defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ \
     || defined __EMX__ || defined __DJGPP__)
# define ISSLASH(C) ((C) == '/' || (C) == '\\')
#endif

/* Character inserted between a directory name and a file name.  */
#if !defined DIRECTORY_SEPARATOR
# define DIRECTORY_SEPARATOR '/'
#endif

/* Everywhere else only DIRECTORY_SEPARATOR counts as a slash.  */
#if !defined ISSLASH
# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
#endif

/* Use the unlocked variant of getc when it is declared.  */
#if HAVE_DECL_GETC_UNLOCKED
# undef getc
# define getc getc_unlocked
#endif
102
/* Pointer to the contents of the charset.alias file, if it has already been
   read, else NULL.  Its format is:
   ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'

   The variable is declared 'volatile' to avoid a possible multithread
   problem in the function get_charset_aliases.  If we are running in a
   threaded environment, and if two threads initialize 'charset_aliases'
   simultaneously, both will produce the same value, and everything will be
   ok if the two assignments to 'charset_aliases' are atomic.  It is unknown
   what happens if the two assignments mix.

   The 'volatile' keyword is deliberately not defined away when
   __STDC__ != 1: several ANSI C compilers leave __STDC__ undefined or set
   it to 0 in their default mode, and hiding the qualifier there would
   silently drop the protection described above.  This file requires ANSI C
   anyway (it uses 'const' and prototypes unconditionally).  */
static const char * volatile charset_aliases;
116
117/* Return a pointer to the contents of the charset.alias file. */
118static const char *
119get_charset_aliases (void)
120{
121 const char *cp;
122
123 cp = charset_aliases;
124 if (cp == NULL)
125 {
f240aacb 126#if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
7f8e40b7
NJ
127 const char *dir;
128 const char *base = "charset.alias";
129 char *file_name;
130
131 /* Make it possible to override the charset.alias location. This is
1cd4fffc 132 necessary for running the testsuite before "make install". */
7f8e40b7
NJ
133 dir = getenv ("CHARSETALIASDIR");
134 if (dir == NULL || dir[0] == '\0')
1cd4fffc 135 dir = relocate (LIBDIR);
7f8e40b7
NJ
136
137 /* Concatenate dir and base into freshly allocated file_name. */
138 {
1cd4fffc
LC
139 size_t dir_len = strlen (dir);
140 size_t base_len = strlen (base);
141 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
142 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
143 if (file_name != NULL)
144 {
145 memcpy (file_name, dir, dir_len);
146 if (add_slash)
147 file_name[dir_len] = DIRECTORY_SEPARATOR;
148 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
149 }
7f8e40b7
NJ
150 }
151
8912421c 152 if (file_name == NULL)
1cd4fffc
LC
153 /* Out of memory. Treat the file as empty. */
154 cp = "";
7f8e40b7 155 else
1cd4fffc
LC
156 {
157 int fd;
158
159 /* Open the file. Reject symbolic links on platforms that support
160 O_NOFOLLOW. This is a security feature. Without it, an attacker
161 could retrieve parts of the contents (namely, the tail of the
162 first line that starts with "* ") of an arbitrary file by placing
163 a symbolic link to that file under the name "charset.alias" in
164 some writable directory and defining the environment variable
165 CHARSETALIASDIR to point to that directory. */
166 fd = open (file_name,
167 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
168 if (fd < 0)
169 /* File not found. Treat it as empty. */
170 cp = "";
171 else
172 {
173 FILE *fp;
174
175 fp = fdopen (fd, "r");
176 if (fp == NULL)
177 {
178 /* Out of memory. Treat the file as empty. */
179 close (fd);
180 cp = "";
181 }
182 else
183 {
184 /* Parse the file's contents. */
185 char *res_ptr = NULL;
186 size_t res_size = 0;
187
188 for (;;)
189 {
190 int c;
191 char buf1[50+1];
192 char buf2[50+1];
193 size_t l1, l2;
194 char *old_res_ptr;
195
196 c = getc (fp);
197 if (c == EOF)
198 break;
199 if (c == '\n' || c == ' ' || c == '\t')
200 continue;
201 if (c == '#')
202 {
203 /* Skip comment, to end of line. */
204 do
205 c = getc (fp);
206 while (!(c == EOF || c == '\n'));
207 if (c == EOF)
208 break;
209 continue;
210 }
211 ungetc (c, fp);
212 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
213 break;
214 l1 = strlen (buf1);
215 l2 = strlen (buf2);
216 old_res_ptr = res_ptr;
217 if (res_size == 0)
218 {
219 res_size = l1 + 1 + l2 + 1;
220 res_ptr = (char *) malloc (res_size + 1);
221 }
222 else
223 {
224 res_size += l1 + 1 + l2 + 1;
225 res_ptr = (char *) realloc (res_ptr, res_size + 1);
226 }
227 if (res_ptr == NULL)
228 {
229 /* Out of memory. */
230 res_size = 0;
231 if (old_res_ptr != NULL)
232 free (old_res_ptr);
233 break;
234 }
235 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
236 strcpy (res_ptr + res_size - (l2 + 1), buf2);
237 }
238 fclose (fp);
239 if (res_size == 0)
240 cp = "";
241 else
242 {
243 *(res_ptr + res_size) = '\0';
244 cp = res_ptr;
245 }
246 }
247 }
248
249 free (file_name);
250 }
7f8e40b7
NJ
251
252#else
253
f240aacb
LC
254# if defined DARWIN7
255 /* To avoid the trouble of installing a file that is shared by many
1cd4fffc
LC
256 GNU packages -- many packaging systems have problems with this --,
257 simply inline the aliases here. */
f240aacb 258 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
1cd4fffc
LC
259 "ISO8859-2" "\0" "ISO-8859-2" "\0"
260 "ISO8859-4" "\0" "ISO-8859-4" "\0"
261 "ISO8859-5" "\0" "ISO-8859-5" "\0"
262 "ISO8859-7" "\0" "ISO-8859-7" "\0"
263 "ISO8859-9" "\0" "ISO-8859-9" "\0"
264 "ISO8859-13" "\0" "ISO-8859-13" "\0"
265 "ISO8859-15" "\0" "ISO-8859-15" "\0"
266 "KOI8-R" "\0" "KOI8-R" "\0"
267 "KOI8-U" "\0" "KOI8-U" "\0"
268 "CP866" "\0" "CP866" "\0"
269 "CP949" "\0" "CP949" "\0"
270 "CP1131" "\0" "CP1131" "\0"
271 "CP1251" "\0" "CP1251" "\0"
272 "eucCN" "\0" "GB2312" "\0"
273 "GB2312" "\0" "GB2312" "\0"
274 "eucJP" "\0" "EUC-JP" "\0"
275 "eucKR" "\0" "EUC-KR" "\0"
276 "Big5" "\0" "BIG5" "\0"
277 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
278 "GBK" "\0" "GBK" "\0"
279 "GB18030" "\0" "GB18030" "\0"
280 "SJIS" "\0" "SHIFT_JIS" "\0"
281 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
282 "PT154" "\0" "PT154" "\0"
283 /*"ISCII-DEV" "\0" "?" "\0"*/
284 "*" "\0" "UTF-8" "\0";
f240aacb
LC
285# endif
286
7f8e40b7
NJ
287# if defined VMS
288 /* To avoid the troubles of an extra file charset.alias_vms in the
1cd4fffc 289 sources of many GNU packages, simply inline the aliases here. */
7f8e40b7 290 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
1cd4fffc
LC
291 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
292 section 10.7 "Handling Different Character Sets". */
7f8e40b7 293 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
1cd4fffc
LC
294 "ISO8859-2" "\0" "ISO-8859-2" "\0"
295 "ISO8859-5" "\0" "ISO-8859-5" "\0"
296 "ISO8859-7" "\0" "ISO-8859-7" "\0"
297 "ISO8859-8" "\0" "ISO-8859-8" "\0"
298 "ISO8859-9" "\0" "ISO-8859-9" "\0"
299 /* Japanese */
300 "eucJP" "\0" "EUC-JP" "\0"
301 "SJIS" "\0" "SHIFT_JIS" "\0"
302 "DECKANJI" "\0" "DEC-KANJI" "\0"
303 "SDECKANJI" "\0" "EUC-JP" "\0"
304 /* Chinese */
305 "eucTW" "\0" "EUC-TW" "\0"
306 "DECHANYU" "\0" "DEC-HANYU" "\0"
307 "DECHANZI" "\0" "GB2312" "\0"
308 /* Korean */
309 "DECKOREAN" "\0" "EUC-KR" "\0";
7f8e40b7
NJ
310# endif
311
312# if defined WIN32_NATIVE || defined __CYGWIN__
313 /* To avoid the troubles of installing a separate file in the same
1cd4fffc
LC
314 directory as the DLL and of retrieving the DLL's directory at
315 runtime, simply inline the aliases here. */
7f8e40b7
NJ
316
317 cp = "CP936" "\0" "GBK" "\0"
1cd4fffc
LC
318 "CP1361" "\0" "JOHAB" "\0"
319 "CP20127" "\0" "ASCII" "\0"
320 "CP20866" "\0" "KOI8-R" "\0"
321 "CP20936" "\0" "GB2312" "\0"
322 "CP21866" "\0" "KOI8-RU" "\0"
323 "CP28591" "\0" "ISO-8859-1" "\0"
324 "CP28592" "\0" "ISO-8859-2" "\0"
325 "CP28593" "\0" "ISO-8859-3" "\0"
326 "CP28594" "\0" "ISO-8859-4" "\0"
327 "CP28595" "\0" "ISO-8859-5" "\0"
328 "CP28596" "\0" "ISO-8859-6" "\0"
329 "CP28597" "\0" "ISO-8859-7" "\0"
330 "CP28598" "\0" "ISO-8859-8" "\0"
331 "CP28599" "\0" "ISO-8859-9" "\0"
332 "CP28605" "\0" "ISO-8859-15" "\0"
333 "CP38598" "\0" "ISO-8859-8" "\0"
334 "CP51932" "\0" "EUC-JP" "\0"
335 "CP51936" "\0" "GB2312" "\0"
336 "CP51949" "\0" "EUC-KR" "\0"
337 "CP51950" "\0" "EUC-TW" "\0"
338 "CP54936" "\0" "GB18030" "\0"
339 "CP65001" "\0" "UTF-8" "\0";
7f8e40b7
NJ
340# endif
341#endif
342
343 charset_aliases = cp;
344 }
345
346 return cp;
347}
348
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed.  Depending on the platform it points to a
   string literal, to an entry of the alias table, to a static buffer that is
   overwritten by the next call, to the string returned by nl_langinfo, or
   into the value of one of the environment variables LC_ALL, LC_CTYPE, LANG.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  The result is never an empty string.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                {
                  /* A trailing dot names no encoding.  Returning "" here
                     would bypass the empty-string check at the end of this
                     function, so fall through to the codepage instead.  */
                  if (dot[0] != '\0')
                    return dot;
                }
              else if (modifier > dot
                       && (size_t) (modifier - dot) < sizeof (buf))
                {
                  /* Non-empty encoding that fits into buf.  */
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* Woe32 has a function returning the locale's codepage as a number:
         GetACP().  This encoding is used by Cygwin, unless the user has set
         the environment variable CYGWIN=codepage:oem (which very few people
         do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32_NATIVE

  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number:
     GetACP().
     When the output goes to a console window, it needs to be provided in
     GetOEMCP() encoding if the console is using a raster font, or in
     GetConsoleOutputCP() encoding if it is using a TrueType font.
     But in GUI programs and for output sent to files and pipes, GetACP()
     encoding is the best bet.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            {
              /* A trailing dot names no encoding.  Returning "" here would
                 bypass the empty-string check at the end of this function,
                 so fall through to the alias lookup instead.  */
              if (dot[0] != '\0')
                return dot;
            }
          else if (modifier > dot
                   && (size_t) (modifier - dot) < sizeof (buf))
            {
              /* Non-empty encoding that fits into buf.  */
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it so that the argument type matches
             the %u conversion.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of ALIAS '\0' CANONICAL '\0'
     pairs ended by an empty string; an alias of "*" matches anything.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}