Don't fail when locale env. vars specify a dot-less locale name.
[bpt/guile.git] / lib / localcharset.c
1 /* Determine a canonical name for the current locale's character encoding.
2
3 Copyright (C) 2000-2006, 2008-2012 Free Software Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public License along
16 with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19 /* Written by Bruno Haible <bruno@clisp.org>. */
20
21 #include <config.h>
22
23 /* Specification. */
24 #include "localcharset.h"
25
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <stdlib.h>
31
32 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
33 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
34 #endif
35
36 #if defined _WIN32 || defined __WIN32__
37 # define WINDOWS_NATIVE
38 #endif
39
40 #if defined __EMX__
41 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
42 # ifndef OS2
43 # define OS2
44 # endif
45 #endif
46
47 #if !defined WINDOWS_NATIVE
48 # include <unistd.h>
49 # if HAVE_LANGINFO_CODESET
50 # include <langinfo.h>
51 # else
52 # if 0 /* see comment below */
53 # include <locale.h>
54 # endif
55 # endif
56 # ifdef __CYGWIN__
57 # define WIN32_LEAN_AND_MEAN
58 # include <windows.h>
59 # endif
60 #elif defined WINDOWS_NATIVE
61 # define WIN32_LEAN_AND_MEAN
62 # include <windows.h>
63 #endif
64 #if defined OS2
65 # define INCL_DOS
66 # include <os2.h>
67 #endif
68
69 #if ENABLE_RELOCATABLE
70 # include "relocatable.h"
71 #else
72 # define relocate(pathname) (pathname)
73 #endif
74
75 /* Get LIBDIR. */
76 #ifndef LIBDIR
77 # include "configmake.h"
78 #endif
79
80 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
81 #ifndef O_NOFOLLOW
82 # define O_NOFOLLOW 0
83 #endif
84
85 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
86 /* Native Windows, Cygwin, OS/2, DOS */
87 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
88 #endif
89
90 #ifndef DIRECTORY_SEPARATOR
91 # define DIRECTORY_SEPARATOR '/'
92 #endif
93
94 #ifndef ISSLASH
95 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
96 #endif
97
98 #if HAVE_DECL_GETC_UNLOCKED
99 # undef getc
100 # define getc getc_unlocked
101 #endif
102
/* The static variable below is declared 'volatile' to avoid a possible
   multithread problem in get_charset_aliases.  If two threads initialize
   'charset_aliases' at the same time, both compute the same value, and all
   is well provided the two assignments to 'charset_aliases' are atomic.
   What happens if the two assignments mix is unknown.  */
#if __STDC__ != 1
# define volatile /* empty */
#endif
/* Cached contents of the charset.alias file once it has been read,
   NULL before that.  The format is:
     ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
static const char * volatile charset_aliases;
116
117 /* Return a pointer to the contents of the charset.alias file. */
118 static const char *
119 get_charset_aliases (void)
120 {
121 const char *cp;
122
123 cp = charset_aliases;
124 if (cp == NULL)
125 {
126 #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
127 const char *dir;
128 const char *base = "charset.alias";
129 char *file_name;
130
131 /* Make it possible to override the charset.alias location. This is
132 necessary for running the testsuite before "make install". */
133 dir = getenv ("CHARSETALIASDIR");
134 if (dir == NULL || dir[0] == '\0')
135 dir = relocate (LIBDIR);
136
137 /* Concatenate dir and base into freshly allocated file_name. */
138 {
139 size_t dir_len = strlen (dir);
140 size_t base_len = strlen (base);
141 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
142 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
143 if (file_name != NULL)
144 {
145 memcpy (file_name, dir, dir_len);
146 if (add_slash)
147 file_name[dir_len] = DIRECTORY_SEPARATOR;
148 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
149 }
150 }
151
152 if (file_name == NULL)
153 /* Out of memory. Treat the file as empty. */
154 cp = "";
155 else
156 {
157 int fd;
158
159 /* Open the file. Reject symbolic links on platforms that support
160 O_NOFOLLOW. This is a security feature. Without it, an attacker
161 could retrieve parts of the contents (namely, the tail of the
162 first line that starts with "* ") of an arbitrary file by placing
163 a symbolic link to that file under the name "charset.alias" in
164 some writable directory and defining the environment variable
165 CHARSETALIASDIR to point to that directory. */
166 fd = open (file_name,
167 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
168 if (fd < 0)
169 /* File not found. Treat it as empty. */
170 cp = "";
171 else
172 {
173 FILE *fp;
174
175 fp = fdopen (fd, "r");
176 if (fp == NULL)
177 {
178 /* Out of memory. Treat the file as empty. */
179 close (fd);
180 cp = "";
181 }
182 else
183 {
184 /* Parse the file's contents. */
185 char *res_ptr = NULL;
186 size_t res_size = 0;
187
188 for (;;)
189 {
190 int c;
191 char buf1[50+1];
192 char buf2[50+1];
193 size_t l1, l2;
194 char *old_res_ptr;
195
196 c = getc (fp);
197 if (c == EOF)
198 break;
199 if (c == '\n' || c == ' ' || c == '\t')
200 continue;
201 if (c == '#')
202 {
203 /* Skip comment, to end of line. */
204 do
205 c = getc (fp);
206 while (!(c == EOF || c == '\n'));
207 if (c == EOF)
208 break;
209 continue;
210 }
211 ungetc (c, fp);
212 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
213 break;
214 l1 = strlen (buf1);
215 l2 = strlen (buf2);
216 old_res_ptr = res_ptr;
217 if (res_size == 0)
218 {
219 res_size = l1 + 1 + l2 + 1;
220 res_ptr = (char *) malloc (res_size + 1);
221 }
222 else
223 {
224 res_size += l1 + 1 + l2 + 1;
225 res_ptr = (char *) realloc (res_ptr, res_size + 1);
226 }
227 if (res_ptr == NULL)
228 {
229 /* Out of memory. */
230 res_size = 0;
231 free (old_res_ptr);
232 break;
233 }
234 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
235 strcpy (res_ptr + res_size - (l2 + 1), buf2);
236 }
237 fclose (fp);
238 if (res_size == 0)
239 cp = "";
240 else
241 {
242 *(res_ptr + res_size) = '\0';
243 cp = res_ptr;
244 }
245 }
246 }
247
248 free (file_name);
249 }
250
251 #else
252
253 # if defined DARWIN7
254 /* To avoid the trouble of installing a file that is shared by many
255 GNU packages -- many packaging systems have problems with this --,
256 simply inline the aliases here. */
257 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
258 "ISO8859-2" "\0" "ISO-8859-2" "\0"
259 "ISO8859-4" "\0" "ISO-8859-4" "\0"
260 "ISO8859-5" "\0" "ISO-8859-5" "\0"
261 "ISO8859-7" "\0" "ISO-8859-7" "\0"
262 "ISO8859-9" "\0" "ISO-8859-9" "\0"
263 "ISO8859-13" "\0" "ISO-8859-13" "\0"
264 "ISO8859-15" "\0" "ISO-8859-15" "\0"
265 "KOI8-R" "\0" "KOI8-R" "\0"
266 "KOI8-U" "\0" "KOI8-U" "\0"
267 "CP866" "\0" "CP866" "\0"
268 "CP949" "\0" "CP949" "\0"
269 "CP1131" "\0" "CP1131" "\0"
270 "CP1251" "\0" "CP1251" "\0"
271 "eucCN" "\0" "GB2312" "\0"
272 "GB2312" "\0" "GB2312" "\0"
273 "eucJP" "\0" "EUC-JP" "\0"
274 "eucKR" "\0" "EUC-KR" "\0"
275 "Big5" "\0" "BIG5" "\0"
276 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
277 "GBK" "\0" "GBK" "\0"
278 "GB18030" "\0" "GB18030" "\0"
279 "SJIS" "\0" "SHIFT_JIS" "\0"
280 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
281 "PT154" "\0" "PT154" "\0"
282 /*"ISCII-DEV" "\0" "?" "\0"*/
283 "*" "\0" "UTF-8" "\0";
284 # endif
285
286 # if defined VMS
287 /* To avoid the troubles of an extra file charset.alias_vms in the
288 sources of many GNU packages, simply inline the aliases here. */
289 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
290 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
291 section 10.7 "Handling Different Character Sets". */
292 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
293 "ISO8859-2" "\0" "ISO-8859-2" "\0"
294 "ISO8859-5" "\0" "ISO-8859-5" "\0"
295 "ISO8859-7" "\0" "ISO-8859-7" "\0"
296 "ISO8859-8" "\0" "ISO-8859-8" "\0"
297 "ISO8859-9" "\0" "ISO-8859-9" "\0"
298 /* Japanese */
299 "eucJP" "\0" "EUC-JP" "\0"
300 "SJIS" "\0" "SHIFT_JIS" "\0"
301 "DECKANJI" "\0" "DEC-KANJI" "\0"
302 "SDECKANJI" "\0" "EUC-JP" "\0"
303 /* Chinese */
304 "eucTW" "\0" "EUC-TW" "\0"
305 "DECHANYU" "\0" "DEC-HANYU" "\0"
306 "DECHANZI" "\0" "GB2312" "\0"
307 /* Korean */
308 "DECKOREAN" "\0" "EUC-KR" "\0";
309 # endif
310
311 # if defined WINDOWS_NATIVE || defined __CYGWIN__
312 /* To avoid the troubles of installing a separate file in the same
313 directory as the DLL and of retrieving the DLL's directory at
314 runtime, simply inline the aliases here. */
315
316 cp = "CP936" "\0" "GBK" "\0"
317 "CP1361" "\0" "JOHAB" "\0"
318 "CP20127" "\0" "ASCII" "\0"
319 "CP20866" "\0" "KOI8-R" "\0"
320 "CP20936" "\0" "GB2312" "\0"
321 "CP21866" "\0" "KOI8-RU" "\0"
322 "CP28591" "\0" "ISO-8859-1" "\0"
323 "CP28592" "\0" "ISO-8859-2" "\0"
324 "CP28593" "\0" "ISO-8859-3" "\0"
325 "CP28594" "\0" "ISO-8859-4" "\0"
326 "CP28595" "\0" "ISO-8859-5" "\0"
327 "CP28596" "\0" "ISO-8859-6" "\0"
328 "CP28597" "\0" "ISO-8859-7" "\0"
329 "CP28598" "\0" "ISO-8859-8" "\0"
330 "CP28599" "\0" "ISO-8859-9" "\0"
331 "CP28605" "\0" "ISO-8859-15" "\0"
332 "CP38598" "\0" "ISO-8859-8" "\0"
333 "CP51932" "\0" "EUC-JP" "\0"
334 "CP51936" "\0" "GB2312" "\0"
335 "CP51949" "\0" "EUC-KR" "\0"
336 "CP51950" "\0" "EUC-TW" "\0"
337 "CP54936" "\0" "GB18030" "\0"
338 "CP65001" "\0" "UTF-8" "\0";
339 # endif
340 #endif
341
342 charset_aliases = cp;
343 }
344
345 return cp;
346 }
347
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   The raw name comes from nl_langinfo (CODESET) where available, from the
   LC_ALL / LC_CTYPE / LANG environment variables on old systems and OS/2,
   or from the active code page on native Windows, OS/2 and old Cygwin.
   Except where an encoding is taken directly from the ".encoding" part of
   a locale name (Cygwin fallback, OS/2), the raw name is then mapped
   through the alias table of get_charset_aliases.

   The result must not be freed; it points to static storage, to the alias
   table, or into the environment.  It is never the empty string.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WINDOWS_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* modifier lies after dot, so the difference is
                 non-negative.  An encoding too long for buf falls through
                 to the code page lookup below.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WINDOWS_NATIVE

  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as a
     number: GetACP().
     When the output goes to a console window, it needs to be provided in
     GetOEMCP() encoding if the console is using a raster font, or in
     GetConsoleOutputCP() encoding if it is using a TrueType font.
     But in GUI programs and for output sent to files and pipes, GetACP()
     encoding is the best bet.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier lies after dot, so the difference is non-negative.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it explicitly so that the argument
             type matches the %u conversion.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a list of ALIAS '\0' CANONICAL '\0'
     pairs, so each step skips two strings.  An alias of "*" matches any
     codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}
548
/* A variant of locale_charset, without calls to 'setlocale',
   'nl_langinfo', etc.: the encoding is derived solely from the LC_ALL,
   LC_CTYPE and LANG environment variables (first non-empty one wins).

   - If the locale name has the form "xx.ENCODING", ENCODING is returned
     (a pointer into the environment string).
   - If it has the form "xx.ENCODING@modifier" and ENCODING fits into the
     internal static buffer, ENCODING is returned from that buffer.
   - If it is exactly "C", "ASCII" is returned.
   - Otherwise (no variable set, a dot-less name, or an ENCODING too long
     for the buffer) the codeset is treated as unknown: the "*" entry of
     the alias table is used if there is one, else "ISO-8859-1".

   The result must not be freed, and is never the empty string.  */
const char *
environ_locale_charset (void)
{
  static char buf[2 + 10 + 1];
  /* Start out as "unknown", so that every path which neither returns nor
     assigns below reaches the alias lookup with a valid string.  */
  const char *codeset = "";
  const char *aliases;
  const char *locale;

  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }

  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier lies after dot, so the difference is non-negative.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
          /* The encoding does not fit into buf: fall through and treat
             the codeset as unknown.  */
        }
      else if (strcmp (locale, "C") == 0)
        {
          strcpy (buf, "ASCII");
          return buf;
        }
    }

  /* Resolve alias.  The table is a list of ALIAS '\0' CANONICAL '\0'
     pairs, so each step skips two strings.  An alias of "*" matches any
     codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    /* Default to Latin-1, for backward compatibility with Guile 1.8.  */
    codeset = "ISO-8859-1";

  return codeset;
}