Use Gnulib's `inet_ntop' and `inet_pton' modules.
[bpt/guile.git] / lib / localcharset.c
1 /* Determine a canonical name for the current locale's character encoding.
2
3 Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public License along
16 with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19 /* Written by Bruno Haible <bruno@clisp.org>. */
20
21 #include <config.h>
22
23 /* Specification. */
24 #include "localcharset.h"
25
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <stdlib.h>
31
32 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
33 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
34 #endif
35
36 #if defined _WIN32 || defined __WIN32__
37 # define WIN32_NATIVE
38 #endif
39
40 #if defined __EMX__
41 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
42 # ifndef OS2
43 # define OS2
44 # endif
45 #endif
46
47 #if !defined WIN32_NATIVE
48 # include <unistd.h>
49 # if HAVE_LANGINFO_CODESET
50 # include <langinfo.h>
51 # else
52 # if 0 /* see comment below */
53 # include <locale.h>
54 # endif
55 # endif
56 # ifdef __CYGWIN__
57 # define WIN32_LEAN_AND_MEAN
58 # include <windows.h>
59 # endif
60 #elif defined WIN32_NATIVE
61 # define WIN32_LEAN_AND_MEAN
62 # include <windows.h>
63 #endif
64 #if defined OS2
65 # define INCL_DOS
66 # include <os2.h>
67 #endif
68
69 #if ENABLE_RELOCATABLE
70 # include "relocatable.h"
71 #else
72 # define relocate(pathname) (pathname)
73 #endif
74
75 /* Get LIBDIR. */
76 #ifndef LIBDIR
77 # include "configmake.h"
78 #endif
79
80 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
81 #ifndef O_NOFOLLOW
82 # define O_NOFOLLOW 0
83 #endif
84
85 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
86 /* Win32, Cygwin, OS/2, DOS */
87 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
88 #endif
89
90 #ifndef DIRECTORY_SEPARATOR
91 # define DIRECTORY_SEPARATOR '/'
92 #endif
93
94 #ifndef ISSLASH
95 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
96 #endif
97
98 #if HAVE_DECL_GETC_UNLOCKED
99 # undef getc
100 # define getc getc_unlocked
101 #endif
102
103 /* The following static variable is declared 'volatile' to avoid a
104 possible multithread problem in the function get_charset_aliases. If we
105 are running in a threaded environment, and if two threads initialize
106 'charset_aliases' simultaneously, both will produce the same value,
107 and everything will be ok if the two assignments to 'charset_aliases'
108 are atomic. But I don't know what will happen if the two assignments mix. */
109 #if __STDC__ != 1
110 # define volatile /* empty */
111 #endif
112 /* Pointer to the contents of the charset.alias file, if it has already been
113 read, else NULL. Its format is:
114 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
115 static const char * volatile charset_aliases;
116
117 /* Return a pointer to the contents of the charset.alias file. */
118 static const char *
119 get_charset_aliases (void)
120 {
121 const char *cp;
122
123 cp = charset_aliases;
124 if (cp == NULL)
125 {
126 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
127 const char *dir;
128 const char *base = "charset.alias";
129 char *file_name;
130
131 /* Make it possible to override the charset.alias location. This is
132 necessary for running the testsuite before "make install". */
133 dir = getenv ("CHARSETALIASDIR");
134 if (dir == NULL || dir[0] == '\0')
135 dir = relocate (LIBDIR);
136
137 /* Concatenate dir and base into freshly allocated file_name. */
138 {
139 size_t dir_len = strlen (dir);
140 size_t base_len = strlen (base);
141 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
142 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
143 if (file_name != NULL)
144 {
145 memcpy (file_name, dir, dir_len);
146 if (add_slash)
147 file_name[dir_len] = DIRECTORY_SEPARATOR;
148 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
149 }
150 }
151
152 if (file_name == NULL)
153 /* Out of memory. Treat the file as empty. */
154 cp = "";
155 else
156 {
157 int fd;
158
159 /* Open the file. Reject symbolic links on platforms that support
160 O_NOFOLLOW. This is a security feature. Without it, an attacker
161 could retrieve parts of the contents (namely, the tail of the
162 first line that starts with "* ") of an arbitrary file by placing
163 a symbolic link to that file under the name "charset.alias" in
164 some writable directory and defining the environment variable
165 CHARSETALIASDIR to point to that directory. */
166 fd = open (file_name,
167 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
168 if (fd < 0)
169 /* File not found. Treat it as empty. */
170 cp = "";
171 else
172 {
173 FILE *fp;
174
175 fp = fdopen (fd, "r");
176 if (fp == NULL)
177 {
178 /* Out of memory. Treat the file as empty. */
179 close (fd);
180 cp = "";
181 }
182 else
183 {
184 /* Parse the file's contents. */
185 char *res_ptr = NULL;
186 size_t res_size = 0;
187
188 for (;;)
189 {
190 int c;
191 char buf1[50+1];
192 char buf2[50+1];
193 size_t l1, l2;
194 char *old_res_ptr;
195
196 c = getc (fp);
197 if (c == EOF)
198 break;
199 if (c == '\n' || c == ' ' || c == '\t')
200 continue;
201 if (c == '#')
202 {
203 /* Skip comment, to end of line. */
204 do
205 c = getc (fp);
206 while (!(c == EOF || c == '\n'));
207 if (c == EOF)
208 break;
209 continue;
210 }
211 ungetc (c, fp);
212 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
213 break;
214 l1 = strlen (buf1);
215 l2 = strlen (buf2);
216 old_res_ptr = res_ptr;
217 if (res_size == 0)
218 {
219 res_size = l1 + 1 + l2 + 1;
220 res_ptr = (char *) malloc (res_size + 1);
221 }
222 else
223 {
224 res_size += l1 + 1 + l2 + 1;
225 res_ptr = (char *) realloc (res_ptr, res_size + 1);
226 }
227 if (res_ptr == NULL)
228 {
229 /* Out of memory. */
230 res_size = 0;
231 if (old_res_ptr != NULL)
232 free (old_res_ptr);
233 break;
234 }
235 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
236 strcpy (res_ptr + res_size - (l2 + 1), buf2);
237 }
238 fclose (fp);
239 if (res_size == 0)
240 cp = "";
241 else
242 {
243 *(res_ptr + res_size) = '\0';
244 cp = res_ptr;
245 }
246 }
247 }
248
249 free (file_name);
250 }
251
252 #else
253
254 # if defined DARWIN7
255 /* To avoid the trouble of installing a file that is shared by many
256 GNU packages -- many packaging systems have problems with this --,
257 simply inline the aliases here. */
258 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
259 "ISO8859-2" "\0" "ISO-8859-2" "\0"
260 "ISO8859-4" "\0" "ISO-8859-4" "\0"
261 "ISO8859-5" "\0" "ISO-8859-5" "\0"
262 "ISO8859-7" "\0" "ISO-8859-7" "\0"
263 "ISO8859-9" "\0" "ISO-8859-9" "\0"
264 "ISO8859-13" "\0" "ISO-8859-13" "\0"
265 "ISO8859-15" "\0" "ISO-8859-15" "\0"
266 "KOI8-R" "\0" "KOI8-R" "\0"
267 "KOI8-U" "\0" "KOI8-U" "\0"
268 "CP866" "\0" "CP866" "\0"
269 "CP949" "\0" "CP949" "\0"
270 "CP1131" "\0" "CP1131" "\0"
271 "CP1251" "\0" "CP1251" "\0"
272 "eucCN" "\0" "GB2312" "\0"
273 "GB2312" "\0" "GB2312" "\0"
274 "eucJP" "\0" "EUC-JP" "\0"
275 "eucKR" "\0" "EUC-KR" "\0"
276 "Big5" "\0" "BIG5" "\0"
277 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
278 "GBK" "\0" "GBK" "\0"
279 "GB18030" "\0" "GB18030" "\0"
280 "SJIS" "\0" "SHIFT_JIS" "\0"
281 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
282 "PT154" "\0" "PT154" "\0"
283 /*"ISCII-DEV" "\0" "?" "\0"*/
284 "*" "\0" "UTF-8" "\0";
285 # endif
286
287 # if defined VMS
288 /* To avoid the troubles of an extra file charset.alias_vms in the
289 sources of many GNU packages, simply inline the aliases here. */
290 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
291 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
292 section 10.7 "Handling Different Character Sets". */
293 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
294 "ISO8859-2" "\0" "ISO-8859-2" "\0"
295 "ISO8859-5" "\0" "ISO-8859-5" "\0"
296 "ISO8859-7" "\0" "ISO-8859-7" "\0"
297 "ISO8859-8" "\0" "ISO-8859-8" "\0"
298 "ISO8859-9" "\0" "ISO-8859-9" "\0"
299 /* Japanese */
300 "eucJP" "\0" "EUC-JP" "\0"
301 "SJIS" "\0" "SHIFT_JIS" "\0"
302 "DECKANJI" "\0" "DEC-KANJI" "\0"
303 "SDECKANJI" "\0" "EUC-JP" "\0"
304 /* Chinese */
305 "eucTW" "\0" "EUC-TW" "\0"
306 "DECHANYU" "\0" "DEC-HANYU" "\0"
307 "DECHANZI" "\0" "GB2312" "\0"
308 /* Korean */
309 "DECKOREAN" "\0" "EUC-KR" "\0";
310 # endif
311
312 # if defined WIN32_NATIVE || defined __CYGWIN__
313 /* To avoid the troubles of installing a separate file in the same
314 directory as the DLL and of retrieving the DLL's directory at
315 runtime, simply inline the aliases here. */
316
317 cp = "CP936" "\0" "GBK" "\0"
318 "CP1361" "\0" "JOHAB" "\0"
319 "CP20127" "\0" "ASCII" "\0"
320 "CP20866" "\0" "KOI8-R" "\0"
321 "CP20936" "\0" "GB2312" "\0"
322 "CP21866" "\0" "KOI8-RU" "\0"
323 "CP28591" "\0" "ISO-8859-1" "\0"
324 "CP28592" "\0" "ISO-8859-2" "\0"
325 "CP28593" "\0" "ISO-8859-3" "\0"
326 "CP28594" "\0" "ISO-8859-4" "\0"
327 "CP28595" "\0" "ISO-8859-5" "\0"
328 "CP28596" "\0" "ISO-8859-6" "\0"
329 "CP28597" "\0" "ISO-8859-7" "\0"
330 "CP28598" "\0" "ISO-8859-8" "\0"
331 "CP28599" "\0" "ISO-8859-9" "\0"
332 "CP28605" "\0" "ISO-8859-15" "\0"
333 "CP38598" "\0" "ISO-8859-8" "\0"
334 "CP51932" "\0" "EUC-JP" "\0"
335 "CP51936" "\0" "GB2312" "\0"
336 "CP51949" "\0" "EUC-KR" "\0"
337 "CP51950" "\0" "EUC-TW" "\0"
338 "CP54936" "\0" "GB18030" "\0"
339 "CP65001" "\0" "UTF-8" "\0";
340 # endif
341 #endif
342
343 charset_aliases = cp;
344 }
345
346 return cp;
347 }
348
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed; it is statically allocated.
   If the canonical name cannot be determined, the result is a non-canonical
   name.

   How the raw codeset name is obtained depends on the platform:
     - with nl_langinfo (CODESET) where HAVE_LANGINFO_CODESET is set (on
       Cygwin a "US-ASCII" answer is replaced by the encoding suffix of the
       locale environment variables, or by "CP<n>" from GetACP);
     - from the LC_ALL, LC_CTYPE, LANG environment variables (first one that
       is set and non-empty) on systems without nl_langinfo (CODESET);
     - as "CP<n>" from GetACP on native Windows;
     - from the environment variables or DosQueryCp on OS/2.
   The raw name is then looked up in the table returned by
   get_charset_aliases; an alias entry named "*" matches any name.  An empty
   final result is replaced by "ASCII".

   Note that on Cygwin and OS/2 an encoding found after the '.' in the locale
   environment variable is returned directly, without alias resolution.
   Depending on the path taken, the result may point into a static buffer
   that a later call overwrites, or into the environment string.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin 2006 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  As long as this is not fixed, return the suffix
     of the locale name from the environment variables (if present) or
     the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      /* Room for "CP", up to 10 decimal digits, and the terminating NUL.  */
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* MODIFIER was found by searching forward from DOT, so the
                 difference is non-negative and the cast is safe.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* Woe32 has a function returning the locale's codepage as a number.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32_NATIVE

  /* Room for "CP", up to 10 decimal digits, and the terminating NUL.  */
  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* Room for "CP", up to 10 decimal digits, and the terminating NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* MODIFIER was found by searching forward from DOT, so the
             difference is non-negative and the cast is safe.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it explicitly so that the argument
             type matches the "%u" conversion and the output is bounded by
             the number of digits of an unsigned int.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table consists of (alias, canonical) string pairs;
     each iteration steps over both strings of a pair.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name directly follows the alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}