HCoop Git - bpt/guile.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* Determine a canonical name for the current locale's character encoding.
	2
	3	Copyright (C) 2000-2006, 2008-2012 Free Software Foundation, Inc.
	4
	5	This program is free software; you can redistribute it and/or modify
	6	it under the terms of the GNU Lesser General Public License as published by
	7	the Free Software Foundation; either version 2, or (at your option)
	8	any later version.
	9
	10	This program is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	13	GNU Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public License along
	16	with this program; if not, see <http://www.gnu.org/licenses/>. */
	17
	18	/* Written by Bruno Haible <bruno@clisp.org>. */
	19
	20	#include <config.h>
	21
	22	/* Specification. */
	23	#include "localcharset.h"
	24
	25	#include <fcntl.h>
	26	#include <stddef.h>
	27	#include <stdio.h>
	28	#include <string.h>
	29	#include <stdlib.h>
	30
	31	#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
	32	# define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
	33	#endif
	34
	35	#if defined _WIN32 \|\| defined __WIN32__
	36	# define WINDOWS_NATIVE
	37	#endif
	38
	39	#if defined __EMX__
	40	/* Assume EMX program runs on OS/2, even if compiled under DOS. */
	41	# ifndef OS2
	42	# define OS2
	43	# endif
	44	#endif
	45
	46	#if !defined WINDOWS_NATIVE
	47	# include <unistd.h>
	48	# if HAVE_LANGINFO_CODESET
	49	# include <langinfo.h>
	50	# else
	51	# if 0 /* see comment below */
	52	# include <locale.h>
	53	# endif
	54	# endif
	55	# ifdef __CYGWIN__
	56	# define WIN32_LEAN_AND_MEAN
	57	# include <windows.h>
	58	# endif
	59	#elif defined WINDOWS_NATIVE
	60	# define WIN32_LEAN_AND_MEAN
	61	# include <windows.h>
	62	#endif
	63	#if defined OS2
	64	# define INCL_DOS
	65	# include <os2.h>
	66	#endif
	67
	68	#if ENABLE_RELOCATABLE
	69	# include "relocatable.h"
	70	#else
	71	# define relocate(pathname) (pathname)
	72	#endif
	73
	74	/* Get LIBDIR. */
	75	#ifndef LIBDIR
	76	# include "configmake.h"
	77	#endif
	78
	79	/* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
	80	#ifndef O_NOFOLLOW
	81	# define O_NOFOLLOW 0
	82	#endif
	83
	84	#if defined _WIN32 \|\| defined __WIN32__ \|\| defined __CYGWIN__ \|\| defined __EMX__ \|\| defined __DJGPP__
	85	/* Native Windows, Cygwin, OS/2, DOS */
	86	# define ISSLASH(C) ((C) == '/' \|\| (C) == '\\')
	87	#endif
	88
	89	#ifndef DIRECTORY_SEPARATOR
	90	# define DIRECTORY_SEPARATOR '/'
	91	#endif
	92
	93	#ifndef ISSLASH
	94	# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
	95	#endif
	96
	97	#if HAVE_DECL_GETC_UNLOCKED
	98	# undef getc
	99	# define getc getc_unlocked
	100	#endif
	101
	102	/* The following static variable is declared 'volatile' to avoid a
	103	possible multithread problem in the function get_charset_aliases. If we
	104	are running in a threaded environment, and if two threads initialize
	105	'charset_aliases' simultaneously, both will produce the same value,
	106	and everything will be ok if the two assignments to 'charset_aliases'
	107	are atomic. But I don't know what will happen if the two assignments mix. */
	108	#if __STDC__ != 1
	109	# define volatile /* empty */
	110	#endif
	111	/* Pointer to the contents of the charset.alias file, if it has already been
	112	read, else NULL. Its format is:
	113	ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
	114	static const char * volatile charset_aliases;
	115
	116	/* Return a pointer to the contents of the charset.alias file. */
	117	static const char *
	118	get_charset_aliases (void)
	119	{
	120	const char *cp;
	121
	122	cp = charset_aliases;
	123	if (cp == NULL)
	124	{
	125	#if !(defined DARWIN7 \|\| defined VMS \|\| defined WINDOWS_NATIVE \|\| defined __CYGWIN__)
	126	const char *dir;
	127	const char *base = "charset.alias";
	128	char *file_name;
	129
	130	/* Make it possible to override the charset.alias location. This is
	131	necessary for running the testsuite before "make install". */
	132	dir = getenv ("CHARSETALIASDIR");
	133	if (dir == NULL \|\| dir[0] == '\0')
	134	dir = relocate (LIBDIR);
	135
	136	/* Concatenate dir and base into freshly allocated file_name. */
	137	{
	138	size_t dir_len = strlen (dir);
	139	size_t base_len = strlen (base);
	140	int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
	141	file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
	142	if (file_name != NULL)
	143	{
	144	memcpy (file_name, dir, dir_len);
	145	if (add_slash)
	146	file_name[dir_len] = DIRECTORY_SEPARATOR;
	147	memcpy (file_name + dir_len + add_slash, base, base_len + 1);
	148	}
	149	}
	150
	151	if (file_name == NULL)
	152	/* Out of memory. Treat the file as empty. */
	153	cp = "";
	154	else
	155	{
	156	int fd;
	157
	158	/* Open the file. Reject symbolic links on platforms that support
	159	O_NOFOLLOW. This is a security feature. Without it, an attacker
	160	could retrieve parts of the contents (namely, the tail of the
	161	first line that starts with "* ") of an arbitrary file by placing
	162	a symbolic link to that file under the name "charset.alias" in
	163	some writable directory and defining the environment variable
	164	CHARSETALIASDIR to point to that directory. */
	165	fd = open (file_name,
	166	O_RDONLY \| (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
	167	if (fd < 0)
	168	/* File not found. Treat it as empty. */
	169	cp = "";
	170	else
	171	{
	172	FILE *fp;
	173
	174	fp = fdopen (fd, "r");
	175	if (fp == NULL)
	176	{
	177	/* Out of memory. Treat the file as empty. */
	178	close (fd);
	179	cp = "";
	180	}
	181	else
	182	{
	183	/* Parse the file's contents. */
	184	char *res_ptr = NULL;
	185	size_t res_size = 0;
	186
	187	for (;;)
	188	{
	189	int c;
	190	char buf1[50+1];
	191	char buf2[50+1];
	192	size_t l1, l2;
	193	char *old_res_ptr;
	194
	195	c = getc (fp);
	196	if (c == EOF)
	197	break;
	198	if (c == '\n' \|\| c == ' ' \|\| c == '\t')
	199	continue;
	200	if (c == '#')
	201	{
	202	/* Skip comment, to end of line. */
	203	do
	204	c = getc (fp);
	205	while (!(c == EOF \|\| c == '\n'));
	206	if (c == EOF)
	207	break;
	208	continue;
	209	}
	210	ungetc (c, fp);
	211	if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
	212	break;
	213	l1 = strlen (buf1);
	214	l2 = strlen (buf2);
	215	old_res_ptr = res_ptr;
	216	if (res_size == 0)
	217	{
	218	res_size = l1 + 1 + l2 + 1;
	219	res_ptr = (char *) malloc (res_size + 1);
	220	}
	221	else
	222	{
	223	res_size += l1 + 1 + l2 + 1;
	224	res_ptr = (char *) realloc (res_ptr, res_size + 1);
	225	}
	226	if (res_ptr == NULL)
	227	{
	228	/* Out of memory. */
	229	res_size = 0;
	230	free (old_res_ptr);
	231	break;
	232	}
	233	strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
	234	strcpy (res_ptr + res_size - (l2 + 1), buf2);
	235	}
	236	fclose (fp);
	237	if (res_size == 0)
	238	cp = "";
	239	else
	240	{
	241	*(res_ptr + res_size) = '\0';
	242	cp = res_ptr;
	243	}
	244	}
	245	}
	246
	247	free (file_name);
	248	}
	249
	250	#else
	251
	252	# if defined DARWIN7
	253	/* To avoid the trouble of installing a file that is shared by many
	254	GNU packages -- many packaging systems have problems with this --,
	255	simply inline the aliases here. */
	256	cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
	257	"ISO8859-2" "\0" "ISO-8859-2" "\0"
	258	"ISO8859-4" "\0" "ISO-8859-4" "\0"
	259	"ISO8859-5" "\0" "ISO-8859-5" "\0"
	260	"ISO8859-7" "\0" "ISO-8859-7" "\0"
	261	"ISO8859-9" "\0" "ISO-8859-9" "\0"
	262	"ISO8859-13" "\0" "ISO-8859-13" "\0"
	263	"ISO8859-15" "\0" "ISO-8859-15" "\0"
	264	"KOI8-R" "\0" "KOI8-R" "\0"
	265	"KOI8-U" "\0" "KOI8-U" "\0"
	266	"CP866" "\0" "CP866" "\0"
	267	"CP949" "\0" "CP949" "\0"
	268	"CP1131" "\0" "CP1131" "\0"
	269	"CP1251" "\0" "CP1251" "\0"
	270	"eucCN" "\0" "GB2312" "\0"
	271	"GB2312" "\0" "GB2312" "\0"
	272	"eucJP" "\0" "EUC-JP" "\0"
	273	"eucKR" "\0" "EUC-KR" "\0"
	274	"Big5" "\0" "BIG5" "\0"
	275	"Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
	276	"GBK" "\0" "GBK" "\0"
	277	"GB18030" "\0" "GB18030" "\0"
	278	"SJIS" "\0" "SHIFT_JIS" "\0"
	279	"ARMSCII-8" "\0" "ARMSCII-8" "\0"
	280	"PT154" "\0" "PT154" "\0"
	281	/"ISCII-DEV" "\0" "?" "\0"/
	282	"*" "\0" "UTF-8" "\0";
	283	# endif
	284
	285	# if defined VMS
	286	/* To avoid the troubles of an extra file charset.alias_vms in the
	287	sources of many GNU packages, simply inline the aliases here. */
	288	/* The list of encodings is taken from the OpenVMS 7.3-1 documentation
	289	"Compaq C Run-Time Library Reference Manual for OpenVMS systems"
	290	section 10.7 "Handling Different Character Sets". */
	291	cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
	292	"ISO8859-2" "\0" "ISO-8859-2" "\0"
	293	"ISO8859-5" "\0" "ISO-8859-5" "\0"
	294	"ISO8859-7" "\0" "ISO-8859-7" "\0"
	295	"ISO8859-8" "\0" "ISO-8859-8" "\0"
	296	"ISO8859-9" "\0" "ISO-8859-9" "\0"
	297	/* Japanese */
	298	"eucJP" "\0" "EUC-JP" "\0"
	299	"SJIS" "\0" "SHIFT_JIS" "\0"
	300	"DECKANJI" "\0" "DEC-KANJI" "\0"
	301	"SDECKANJI" "\0" "EUC-JP" "\0"
	302	/* Chinese */
	303	"eucTW" "\0" "EUC-TW" "\0"
	304	"DECHANYU" "\0" "DEC-HANYU" "\0"
	305	"DECHANZI" "\0" "GB2312" "\0"
	306	/* Korean */
	307	"DECKOREAN" "\0" "EUC-KR" "\0";
	308	# endif
	309
	310	# if defined WINDOWS_NATIVE \|\| defined __CYGWIN__
	311	/* To avoid the troubles of installing a separate file in the same
	312	directory as the DLL and of retrieving the DLL's directory at
	313	runtime, simply inline the aliases here. */
	314
	315	cp = "CP936" "\0" "GBK" "\0"
	316	"CP1361" "\0" "JOHAB" "\0"
	317	"CP20127" "\0" "ASCII" "\0"
	318	"CP20866" "\0" "KOI8-R" "\0"
	319	"CP20936" "\0" "GB2312" "\0"
	320	"CP21866" "\0" "KOI8-RU" "\0"
	321	"CP28591" "\0" "ISO-8859-1" "\0"
	322	"CP28592" "\0" "ISO-8859-2" "\0"
	323	"CP28593" "\0" "ISO-8859-3" "\0"
	324	"CP28594" "\0" "ISO-8859-4" "\0"
	325	"CP28595" "\0" "ISO-8859-5" "\0"
	326	"CP28596" "\0" "ISO-8859-6" "\0"
	327	"CP28597" "\0" "ISO-8859-7" "\0"
	328	"CP28598" "\0" "ISO-8859-8" "\0"
	329	"CP28599" "\0" "ISO-8859-9" "\0"
	330	"CP28605" "\0" "ISO-8859-15" "\0"
	331	"CP38598" "\0" "ISO-8859-8" "\0"
	332	"CP51932" "\0" "EUC-JP" "\0"
	333	"CP51936" "\0" "GB2312" "\0"
	334	"CP51949" "\0" "EUC-KR" "\0"
	335	"CP51950" "\0" "EUC-TW" "\0"
	336	"CP54936" "\0" "GB18030" "\0"
	337	"CP65001" "\0" "UTF-8" "\0";
	338	# endif
	339	#endif
	340
	341	charset_aliases = cp;
	342	}
	343
	344	return cp;
	345	}
	346
	347	/* Determine the current locale's character encoding, and canonicalize it
	348	into one of the canonical names listed in config.charset.
	349	The result must not be freed; it is statically allocated.
	350	If the canonical name cannot be determined, the result is a non-canonical
	351	name. */
	352
	353	#ifdef STATIC
	354	STATIC
	355	#endif
	356	const char *
	357	locale_charset (void)
	358	{
	359	const char *codeset;
	360	const char *aliases;
	361
	362	#if !(defined WINDOWS_NATIVE \|\| defined OS2)
	363
	364	# if HAVE_LANGINFO_CODESET
	365
	366	/* Most systems support nl_langinfo (CODESET) nowadays. */
	367	codeset = nl_langinfo (CODESET);
	368
	369	# ifdef __CYGWIN__
	370	/* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always
	371	returns "US-ASCII". Return the suffix of the locale name from the
	372	environment variables (if present) or the codepage as a number. */
	373	if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
	374	{
	375	const char *locale;
	376	static char buf[2 + 10 + 1];
	377
	378	locale = getenv ("LC_ALL");
	379	if (locale == NULL \|\| locale[0] == '\0')
	380	{
	381	locale = getenv ("LC_CTYPE");
	382	if (locale == NULL \|\| locale[0] == '\0')
	383	locale = getenv ("LANG");
	384	}
	385	if (locale != NULL && locale[0] != '\0')
	386	{
	387	/* If the locale name contains an encoding after the dot, return
	388	it. */
	389	const char *dot = strchr (locale, '.');
	390
	391	if (dot != NULL)
	392	{
	393	const char *modifier;
	394
	395	dot++;
	396	/* Look for the possible @... trailer and remove it, if any. */
	397	modifier = strchr (dot, '@');
	398	if (modifier == NULL)
	399	return dot;
	400	if (modifier - dot < sizeof (buf))
	401	{
	402	memcpy (buf, dot, modifier - dot);
	403	buf [modifier - dot] = '\0';
	404	return buf;
	405	}
	406	}
	407	}
	408
	409	/* The Windows API has a function returning the locale's codepage as a
	410	number: GetACP(). This encoding is used by Cygwin, unless the user
	411	has set the environment variable CYGWIN=codepage:oem (which very few
	412	people do).
	413	Output directed to console windows needs to be converted (to
	414	GetOEMCP() if the console is using a raster font, or to
	415	GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
	416	this conversion transparently (see winsup/cygwin/fhandler_console.cc),
	417	converting to GetConsoleOutputCP(). This leads to correct results,
	418	except when SetConsoleOutputCP has been called and a raster font is
	419	in use. */
	420	sprintf (buf, "CP%u", GetACP ());
	421	codeset = buf;
	422	}
	423	# endif
	424
	425	# else
	426
	427	/* On old systems which lack it, use setlocale or getenv. */
	428	const char *locale = NULL;
	429
	430	/* But most old systems don't have a complete set of locales. Some
	431	(like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
	432	use setlocale here; it would return "C" when it doesn't support the
	433	locale name the user has set. */
	434	# if 0
	435	locale = setlocale (LC_CTYPE, NULL);
	436	# endif
	437	if (locale == NULL \|\| locale[0] == '\0')
	438	{
	439	locale = getenv ("LC_ALL");
	440	if (locale == NULL \|\| locale[0] == '\0')
	441	{
	442	locale = getenv ("LC_CTYPE");
	443	if (locale == NULL \|\| locale[0] == '\0')
	444	locale = getenv ("LANG");
	445	}
	446	}
	447
	448	/* On some old systems, one used to set locale = "iso8859_1". On others,
	449	you set it to "language_COUNTRY.charset". In any case, we resolve it
	450	through the charset.alias file. */
	451	codeset = locale;
	452
	453	# endif
	454
	455	#elif defined WINDOWS_NATIVE
	456
	457	static char buf[2 + 10 + 1];
	458
	459	/* The Windows API has a function returning the locale's codepage as a
	460	number: GetACP().
	461	When the output goes to a console window, it needs to be provided in
	462	GetOEMCP() encoding if the console is using a raster font, or in
	463	GetConsoleOutputCP() encoding if it is using a TrueType font.
	464	But in GUI programs and for output sent to files and pipes, GetACP()
	465	encoding is the best bet. */
	466	sprintf (buf, "CP%u", GetACP ());
	467	codeset = buf;
	468
	469	#elif defined OS2
	470
	471	const char *locale;
	472	static char buf[2 + 10 + 1];
	473	ULONG cp[3];
	474	ULONG cplen;
	475
	476	/* Allow user to override the codeset, as set in the operating system,
	477	with standard language environment variables. */
	478	locale = getenv ("LC_ALL");
	479	if (locale == NULL \|\| locale[0] == '\0')
	480	{
	481	locale = getenv ("LC_CTYPE");
	482	if (locale == NULL \|\| locale[0] == '\0')
	483	locale = getenv ("LANG");
	484	}
	485	if (locale != NULL && locale[0] != '\0')
	486	{
	487	/* If the locale name contains an encoding after the dot, return it. */
	488	const char *dot = strchr (locale, '.');
	489
	490	if (dot != NULL)
	491	{
	492	const char *modifier;
	493
	494	dot++;
	495	/* Look for the possible @... trailer and remove it, if any. */
	496	modifier = strchr (dot, '@');
	497	if (modifier == NULL)
	498	return dot;
	499	if (modifier - dot < sizeof (buf))
	500	{
	501	memcpy (buf, dot, modifier - dot);
	502	buf [modifier - dot] = '\0';
	503	return buf;
	504	}
	505	}
	506
	507	/* Resolve through the charset.alias file. */
	508	codeset = locale;
	509	}
	510	else
	511	{
	512	/* OS/2 has a function returning the locale's codepage as a number. */
	513	if (DosQueryCp (sizeof (cp), cp, &cplen))
	514	codeset = "";
	515	else
	516	{
	517	sprintf (buf, "CP%u", cp[0]);
	518	codeset = buf;
	519	}
	520	}
	521
	522	#endif
	523
	524	if (codeset == NULL)
	525	/* The canonical name cannot be determined. */
	526	codeset = "";
	527
	528	/* Resolve alias. */
	529	for (aliases = get_charset_aliases ();
	530	*aliases != '\0';
	531	aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
	532	if (strcmp (codeset, aliases) == 0
	533	\|\| (aliases[0] == '*' && aliases[1] == '\0'))
	534	{
	535	codeset = aliases + strlen (aliases) + 1;
	536	break;
	537	}
	538
	539	/* Don't return an empty string. GNU libc and GNU libiconv interpret
	540	the empty string as denoting "the locale's character encoding",
	541	thus GNU libiconv would call this function a second time. */
	542	if (codeset[0] == '\0')
	543	codeset = "ASCII";
	544
	545	return codeset;
	546	}
	547
	548	/* A variant of the above, without calls to `setlocale', `nl_langinfo',
	549	etc. */
	550	const char *
	551	environ_locale_charset (void)
	552	{
	553	static char buf[2 + 10 + 1];
	554	const char codeset, aliases;
	555	const char *locale = NULL;
	556
	557	locale = getenv ("LC_ALL");
	558	if (locale == NULL \|\| locale[0] == '\0')
	559	{
	560	locale = getenv ("LC_CTYPE");
	561	if (locale == NULL \|\| locale[0] == '\0')
	562	locale = getenv ("LANG");
	563	}
	564
	565	if (locale != NULL && locale[0] != '\0')
	566	{
	567	/* If the locale name contains an encoding after the dot, return it. */
	568	const char *dot = strchr (locale, '.');
	569
	570	if (dot != NULL)
	571	{
	572	const char *modifier;
	573
	574	dot++;
	575	/* Look for the possible @... trailer and remove it, if any. */
	576	modifier = strchr (dot, '@');
	577	if (modifier == NULL)
	578	return dot;
	579	if (modifier - dot < sizeof (buf))
	580	{
	581	memcpy (buf, dot, modifier - dot);
	582	buf [modifier - dot] = '\0';
	583	return buf;
	584	}
	585	}
	586	else if (strcmp (locale, "C") == 0)
	587	{
	588	strcpy (buf, "ASCII");
	589	return buf;
	590	}
	591	else
	592	codeset = "";
	593	}
	594	else
	595	codeset = "";
	596
	597	/* Resolve alias. */
	598	for (aliases = get_charset_aliases ();
	599	*aliases != '\0';
	600	aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
	601	if (strcmp (codeset, aliases) == 0
	602	\|\| (aliases[0] == '*' && aliases[1] == '\0'))
	603	{
	604	codeset = aliases + strlen (aliases) + 1;
	605	break;
	606	}
	607
	608	/* Don't return an empty string. GNU libc and GNU libiconv interpret
	609	the empty string as denoting "the locale's character encoding",
	610	thus GNU libiconv would call this function a second time. */
	611	if (codeset[0] == '\0')
	612	/* Default to Latin-1, for backward compatibility with Guile 1.8. */
	613	codeset = "ISO-8859-1";
	614
	615	return codeset;
	616	}