Revert "wrap iconv_open / iconv_close with a lock to help in thread/fork issues"
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <alloca.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <uninorm.h>
30 #include <unistr.h>
31 #include <uniconv.h>
32
33 #include "striconveh.h"
34
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
44
45 \f
46
47 /* {Strings}
48 */
49
50
51 /* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
68 *
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
72 */
73
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
76
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
81
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
85
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
91
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
93
94 #define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
103
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist[1001];
106 #endif
107
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
110 static SCM
111 make_stringbuf (size_t len)
112 {
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
120 SCM buf;
121
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127 #endif
128
129 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
138 }
139
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
142 static SCM
143 make_wide_stringbuf (size_t len)
144 {
145 SCM buf;
146 size_t raw_len;
147
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153 #endif
154
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
165 }
166
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169 static SCM
170 wide_stringbuf (SCM buf)
171 {
172 SCM new_buf;
173
174 if (STRINGBUF_WIDE (buf))
175 new_buf = buf;
176 else
177 {
178 size_t i, len;
179 scm_t_wchar *mem;
180
181 len = STRINGBUF_LENGTH (buf);
182
183 new_buf = make_wide_stringbuf (len);
184
185 mem = STRINGBUF_WIDE_CHARS (new_buf);
186 for (i = 0; i < len; i++)
187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
188 mem[len] = 0;
189 }
190
191 return new_buf;
192 }
193
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196 static SCM
197 narrow_stringbuf (SCM buf)
198 {
199 SCM new_buf;
200
201 if (!STRINGBUF_WIDE (buf))
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
208
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
211
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
216
217 new_buf = make_stringbuf (len);
218
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
226 }
227
228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
229 SCM_PTHREAD_ATFORK_LOCK_STATIC_MUTEX (stringbuf_write_mutex);
230
231 \f
232 /* Copy-on-write strings.
233 */
234
235 #define STRING_TAG scm_tc7_string
236
237 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
238 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
239 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
240
241 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
242 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
243
244 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
245
246 /* Read-only strings.
247 */
248
249 #define RO_STRING_TAG scm_tc7_ro_string
250 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
251
252 /* Mutation-sharing substrings
253 */
254
255 #define SH_STRING_TAG (scm_tc7_string + 0x100)
256
257 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
258 /* START and LENGTH as for STRINGs. */
259
260 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
261
262 SCM scm_nullstr;
263
264 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
265 characters. CHARSP, if not NULL, will be set to location of the
266 char array. If READ_ONLY_P, the returned string is read-only;
267 otherwise it is writable. */
268 SCM
269 scm_i_make_string (size_t len, char **charsp, int read_only_p)
270 {
271 static SCM null_stringbuf = SCM_BOOL_F;
272 SCM buf;
273 SCM res;
274
275 if (len == 0)
276 {
277 if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
278 {
279 null_stringbuf = make_stringbuf (0);
280 SET_STRINGBUF_SHARED (null_stringbuf);
281 }
282 buf = null_stringbuf;
283 }
284 else
285 buf = make_stringbuf (len);
286
287 if (charsp)
288 *charsp = (char *) STRINGBUF_CHARS (buf);
289 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
290 SCM_UNPACK (buf),
291 (scm_t_bits) 0, (scm_t_bits) len);
292 return res;
293 }
294
295 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
296 characters. CHARSP, if not NULL, will be set to location of the
297 character array. If READ_ONLY_P, the returned string is read-only;
298 otherwise it is writable. */
299 SCM
300 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
301 {
302 SCM buf = make_wide_stringbuf (len);
303 SCM res;
304 if (charsp)
305 *charsp = STRINGBUF_WIDE_CHARS (buf);
306 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
307 SCM_UNPACK (buf),
308 (scm_t_bits) 0, (scm_t_bits) len);
309 return res;
310 }
311
312 static void
313 validate_substring_args (SCM str, size_t start, size_t end)
314 {
315 if (!IS_STRING (str))
316 scm_wrong_type_arg_msg (NULL, 0, str, "string");
317 if (start > STRING_LENGTH (str))
318 scm_out_of_range (NULL, scm_from_size_t (start));
319 if (end > STRING_LENGTH (str) || end < start)
320 scm_out_of_range (NULL, scm_from_size_t (end));
321 }
322
323 static inline void
324 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
325 {
326 *start = STRING_START (*str);
327 if (IS_SH_STRING (*str))
328 {
329 *str = SH_STRING_STRING (*str);
330 *start += STRING_START (*str);
331 }
332 *buf = STRING_STRINGBUF (*str);
333 }
334
335 SCM
336 scm_i_substring (SCM str, size_t start, size_t end)
337 {
338 if (start == end)
339 return scm_i_make_string (0, NULL, 0);
340 else
341 {
342 SCM buf;
343 size_t str_start;
344 get_str_buf_start (&str, &buf, &str_start);
345 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
346 SET_STRINGBUF_SHARED (buf);
347 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
348 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
349 (scm_t_bits)str_start + start,
350 (scm_t_bits) end - start);
351 }
352 }
353
354 SCM
355 scm_i_substring_read_only (SCM str, size_t start, size_t end)
356 {
357 if (start == end)
358 return scm_i_make_string (0, NULL, 1);
359 else
360 {
361 SCM buf;
362 size_t str_start;
363 get_str_buf_start (&str, &buf, &str_start);
364 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
365 SET_STRINGBUF_SHARED (buf);
366 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
367 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
368 (scm_t_bits)str_start + start,
369 (scm_t_bits) end - start);
370 }
371 }
372
373 SCM
374 scm_i_substring_copy (SCM str, size_t start, size_t end)
375 {
376 if (start == end)
377 return scm_i_make_string (0, NULL, 0);
378 else
379 {
380 size_t len = end - start;
381 SCM buf, my_buf, substr;
382 size_t str_start;
383 int wide = 0;
384 get_str_buf_start (&str, &buf, &str_start);
385 if (scm_i_is_narrow_string (str))
386 {
387 my_buf = make_stringbuf (len);
388 memcpy (STRINGBUF_CHARS (my_buf),
389 STRINGBUF_CHARS (buf) + str_start + start, len);
390 }
391 else
392 {
393 my_buf = make_wide_stringbuf (len);
394 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
395 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
396 + start), len);
397 wide = 1;
398 }
399 scm_remember_upto_here_1 (buf);
400 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
401 (scm_t_bits) 0, (scm_t_bits) len);
402 if (wide)
403 scm_i_try_narrow_string (substr);
404 return substr;
405 }
406 }
407
408 SCM
409 scm_i_substring_shared (SCM str, size_t start, size_t end)
410 {
411 if (start == 0 && end == STRING_LENGTH (str))
412 return str;
413 else if (start == end)
414 return scm_i_make_string (0, NULL, 0);
415 else
416 {
417 size_t len = end - start;
418 if (IS_SH_STRING (str))
419 {
420 start += STRING_START (str);
421 str = SH_STRING_STRING (str);
422 }
423 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
424 (scm_t_bits)start, (scm_t_bits) len);
425 }
426 }
427
428 SCM
429 scm_c_substring (SCM str, size_t start, size_t end)
430 {
431 validate_substring_args (str, start, end);
432 return scm_i_substring (str, start, end);
433 }
434
435 SCM
436 scm_c_substring_read_only (SCM str, size_t start, size_t end)
437 {
438 validate_substring_args (str, start, end);
439 return scm_i_substring_read_only (str, start, end);
440 }
441
442 SCM
443 scm_c_substring_copy (SCM str, size_t start, size_t end)
444 {
445 validate_substring_args (str, start, end);
446 return scm_i_substring_copy (str, start, end);
447 }
448
449 SCM
450 scm_c_substring_shared (SCM str, size_t start, size_t end)
451 {
452 validate_substring_args (str, start, end);
453 return scm_i_substring_shared (str, start, end);
454 }
455
456 \f
457 /* Internal accessors
458 */
459
460 /* Returns the number of characters in STR. This may be different
461 than the memory size of the string storage. */
462 size_t
463 scm_i_string_length (SCM str)
464 {
465 return STRING_LENGTH (str);
466 }
467
468 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
469 encoding. False if it is 'wide', having a 32-bit UCS-4
470 encoding. */
471 int
472 scm_i_is_narrow_string (SCM str)
473 {
474 if (IS_SH_STRING (str))
475 str = SH_STRING_STRING (str);
476
477 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
478 }
479
480 /* Try to coerce a string to be narrow. It if is narrow already, do
481 nothing. If it is wide, shrink it to narrow if none of its
482 characters are above 0xFF. Return true if the string is narrow or
483 was made to be narrow. */
484 int
485 scm_i_try_narrow_string (SCM str)
486 {
487 if (IS_SH_STRING (str))
488 str = SH_STRING_STRING (str);
489
490 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
491
492 return scm_i_is_narrow_string (str);
493 }
494
495 /* Return a pointer to the raw data of the string, which can be either Latin-1
496 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
497 const void *
498 scm_i_string_data (SCM str)
499 {
500 SCM buf;
501 size_t start;
502 const char *data;
503
504 get_str_buf_start (&str, &buf, &start);
505
506 data = STRINGBUF_CONTENTS (buf);
507 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
508
509 return data;
510 }
511
512 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
513 STR. */
514 const char *
515 scm_i_string_chars (SCM str)
516 {
517 SCM buf;
518 size_t start;
519 get_str_buf_start (&str, &buf, &start);
520 if (scm_i_is_narrow_string (str))
521 return (const char *) STRINGBUF_CHARS (buf) + start;
522 else
523 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
524 scm_list_1 (str));
525 return NULL;
526 }
527
528 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
529 STR. */
530 const scm_t_wchar *
531 scm_i_string_wide_chars (SCM str)
532 {
533 SCM buf;
534 size_t start;
535
536 get_str_buf_start (&str, &buf, &start);
537 if (!scm_i_is_narrow_string (str))
538 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
539 else
540 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
541 scm_list_1 (str));
542 }
543
544 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
545 a new string buffer, so that it can be modified without modifying
546 other strings. Also, lock the string mutex. Later, one must call
547 scm_i_string_stop_writing to unlock the mutex. */
548 SCM
549 scm_i_string_start_writing (SCM orig_str)
550 {
551 SCM buf, str = orig_str;
552 size_t start;
553
554 get_str_buf_start (&str, &buf, &start);
555 if (IS_RO_STRING (str))
556 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
557
558 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
559 if (STRINGBUF_SHARED (buf))
560 {
561 /* Clone the stringbuf. */
562 size_t len = STRING_LENGTH (str);
563 SCM new_buf;
564
565 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
566
567 if (scm_i_is_narrow_string (str))
568 {
569 new_buf = make_stringbuf (len);
570 memcpy (STRINGBUF_CHARS (new_buf),
571 STRINGBUF_CHARS (buf) + STRING_START (str), len);
572
573 }
574 else
575 {
576 new_buf = make_wide_stringbuf (len);
577 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
578 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
579 + STRING_START (str)), len);
580 }
581
582 SET_STRING_STRINGBUF (str, new_buf);
583 start -= STRING_START (str);
584
585 /* FIXME: The following operations are not atomic, so other threads
586 looking at STR may see an inconsistent state. Nevertheless it can't
587 hurt much since (i) accessing STR while it is being mutated can't
588 yield a crash, and (ii) concurrent accesses to STR should be
589 protected by a mutex at the application level. The latter may not
590 apply when STR != ORIG_STR, though. */
591 SET_STRING_START (str, 0);
592 SET_STRING_STRINGBUF (str, new_buf);
593
594 buf = new_buf;
595
596 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
597 }
598 return orig_str;
599 }
600
601 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
602 char *
603 scm_i_string_writable_chars (SCM str)
604 {
605 SCM buf;
606 size_t start;
607
608 get_str_buf_start (&str, &buf, &start);
609 if (scm_i_is_narrow_string (str))
610 return (char *) STRINGBUF_CHARS (buf) + start;
611 else
612 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
613 scm_list_1 (str));
614 return NULL;
615 }
616
617 /* Return a pointer to the UCS-4 codepoints of a string. */
618 static scm_t_wchar *
619 scm_i_string_writable_wide_chars (SCM str)
620 {
621 SCM buf;
622 size_t start;
623
624 get_str_buf_start (&str, &buf, &start);
625 if (!scm_i_is_narrow_string (str))
626 return STRINGBUF_WIDE_CHARS (buf) + start;
627 else
628 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
629 scm_list_1 (str));
630 }
631
632 /* Unlock the string mutex that was locked when
633 scm_i_string_start_writing was called. */
634 void
635 scm_i_string_stop_writing (void)
636 {
637 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
638 }
639
640 /* Return the Xth character of STR as a UCS-4 codepoint. */
641 scm_t_wchar
642 scm_i_string_ref (SCM str, size_t x)
643 {
644 if (scm_i_is_narrow_string (str))
645 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
646 else
647 return scm_i_string_wide_chars (str)[x];
648 }
649
650 /* Returns index+1 of the first char in STR that matches C, or
651 0 if the char is not found. */
652 int
653 scm_i_string_contains_char (SCM str, char ch)
654 {
655 size_t i;
656 size_t len = scm_i_string_length (str);
657
658 i = 0;
659 if (scm_i_is_narrow_string (str))
660 {
661 while (i < len)
662 {
663 if (scm_i_string_chars (str)[i] == ch)
664 return i+1;
665 i++;
666 }
667 }
668 else
669 {
670 while (i < len)
671 {
672 if (scm_i_string_wide_chars (str)[i]
673 == (unsigned char) ch)
674 return i+1;
675 i++;
676 }
677 }
678 return 0;
679 }
680
681 int
682 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
683 {
684 if (scm_i_is_narrow_string (sstr))
685 {
686 const char *a = scm_i_string_chars (sstr) + start_x;
687 const char *b = cstr;
688 return strncmp (a, b, strlen(b));
689 }
690 else
691 {
692 size_t i;
693 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
694 const char *b = cstr;
695 for (i = 0; i < strlen (b); i++)
696 {
697 if (a[i] != (unsigned char) b[i])
698 return 1;
699 }
700 }
701 return 0;
702 }
703
704 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
705 void
706 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
707 {
708 if (IS_SH_STRING (str))
709 {
710 p += STRING_START (str);
711 str = SH_STRING_STRING (str);
712 }
713
714 if (chr > 0xFF && scm_i_is_narrow_string (str))
715 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
716
717 if (scm_i_is_narrow_string (str))
718 {
719 char *dst = scm_i_string_writable_chars (str);
720 dst[p] = chr;
721 }
722 else
723 {
724 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
725 dst[p] = chr;
726 }
727 }
728
729 \f
730 /* Symbols.
731
732 Basic symbol creation and accessing is done here, the rest is in
733 symbols.[hc]. This has been done to keep stringbufs and the
734 internals of strings and string-like objects confined to this file.
735 */
736
737 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
738
739 SCM
740 scm_i_make_symbol (SCM name, scm_t_bits flags,
741 unsigned long hash, SCM props)
742 {
743 SCM buf;
744 size_t start = STRING_START (name);
745 size_t length = STRING_LENGTH (name);
746
747 if (IS_SH_STRING (name))
748 {
749 name = SH_STRING_STRING (name);
750 start += STRING_START (name);
751 }
752 buf = SYMBOL_STRINGBUF (name);
753
754 if (start == 0 && length == STRINGBUF_LENGTH (buf))
755 {
756 /* reuse buf. */
757 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
758 SET_STRINGBUF_SHARED (buf);
759 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
760 }
761 else
762 {
763 /* make new buf. */
764 if (scm_i_is_narrow_string (name))
765 {
766 SCM new_buf = make_stringbuf (length);
767 memcpy (STRINGBUF_CHARS (new_buf),
768 STRINGBUF_CHARS (buf) + start, length);
769 buf = new_buf;
770 }
771 else
772 {
773 SCM new_buf = make_wide_stringbuf (length);
774 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
775 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
776 length);
777 buf = new_buf;
778 }
779 }
780 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
781 (scm_t_bits) hash, SCM_UNPACK (props));
782 }
783
784 SCM
785 scm_i_c_make_symbol (const char *name, size_t len,
786 scm_t_bits flags, unsigned long hash, SCM props)
787 {
788 SCM buf = make_stringbuf (len);
789 memcpy (STRINGBUF_CHARS (buf), name, len);
790
791 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
792 (scm_t_bits) hash, SCM_UNPACK (props));
793 }
794
795 /* Returns the number of characters in SYM. This may be different
796 from the memory size of SYM. */
797 size_t
798 scm_i_symbol_length (SCM sym)
799 {
800 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
801 }
802
803 size_t
804 scm_c_symbol_length (SCM sym)
805 #define FUNC_NAME "scm_c_symbol_length"
806 {
807 SCM_VALIDATE_SYMBOL (1, sym);
808
809 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
810 }
811 #undef FUNC_NAME
812
813 /* True if the name of SYM is stored as a Latin-1 encoded string.
814 False if it is stored as a 32-bit UCS-4-encoded string. */
815 int
816 scm_i_is_narrow_symbol (SCM sym)
817 {
818 SCM buf;
819
820 buf = SYMBOL_STRINGBUF (sym);
821 return !STRINGBUF_WIDE (buf);
822 }
823
824 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
825 contains the name of SYM. */
826 const char *
827 scm_i_symbol_chars (SCM sym)
828 {
829 SCM buf;
830
831 buf = SYMBOL_STRINGBUF (sym);
832 if (!STRINGBUF_WIDE (buf))
833 return (const char *) STRINGBUF_CHARS (buf);
834 else
835 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
836 scm_list_1 (sym));
837 }
838
839 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
840 symbol's name. */
841 const scm_t_wchar *
842 scm_i_symbol_wide_chars (SCM sym)
843 {
844 SCM buf;
845
846 buf = SYMBOL_STRINGBUF (sym);
847 if (STRINGBUF_WIDE (buf))
848 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
849 else
850 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
851 scm_list_1 (sym));
852 }
853
854 SCM
855 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
856 {
857 SCM buf = SYMBOL_STRINGBUF (sym);
858 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
859 SET_STRINGBUF_SHARED (buf);
860 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
861 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
862 (scm_t_bits)start, (scm_t_bits) end - start);
863 }
864
865 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
866 scm_t_wchar
867 scm_i_symbol_ref (SCM sym, size_t x)
868 {
869 if (scm_i_is_narrow_symbol (sym))
870 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
871 else
872 return scm_i_symbol_wide_chars (sym)[x];
873 }
874
875 /* Debugging
876 */
877
878 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
879 "Returns an association list containing debugging information\n"
880 "for @var{str}. The association list has the following entries."
881 "@table @code\n"
882 "@item string\n"
883 "The string itself.\n"
884 "@item start\n"
885 "The start index of the string into its stringbuf\n"
886 "@item length\n"
887 "The length of the string\n"
888 "@item shared\n"
889 "If this string is a substring, it returns its parent string.\n"
890 "Otherwise, it returns @code{#f}\n"
891 "@item read-only\n"
892 "@code{#t} if the string is read-only\n"
893 "@item stringbuf-chars\n"
894 "A new string containing this string's stringbuf's characters\n"
895 "@item stringbuf-length\n"
896 "The number of characters in this stringbuf\n"
897 "@item stringbuf-shared\n"
898 "@code{#t} if this stringbuf is shared\n"
899 "@item stringbuf-wide\n"
900 "@code{#t} if this stringbuf's characters are stored in a\n"
901 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
902 "buffer\n"
903 "@end table")
904 #define FUNC_NAME s_scm_sys_string_dump
905 {
906 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
907 SCM buf;
908 SCM_VALIDATE_STRING (1, str);
909
910 /* String info */
911 e1 = scm_cons (scm_from_latin1_symbol ("string"),
912 str);
913 e2 = scm_cons (scm_from_latin1_symbol ("start"),
914 scm_from_size_t (STRING_START (str)));
915 e3 = scm_cons (scm_from_latin1_symbol ("length"),
916 scm_from_size_t (STRING_LENGTH (str)));
917
918 if (IS_SH_STRING (str))
919 {
920 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
921 SH_STRING_STRING (str));
922 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
923 }
924 else
925 {
926 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
927 SCM_BOOL_F);
928 buf = STRING_STRINGBUF (str);
929 }
930
931 if (IS_RO_STRING (str))
932 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
933 SCM_BOOL_T);
934 else
935 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
936 SCM_BOOL_F);
937
938 /* Stringbuf info */
939 if (!STRINGBUF_WIDE (buf))
940 {
941 size_t len = STRINGBUF_LENGTH (buf);
942 char *cbuf;
943 SCM sbc = scm_i_make_string (len, &cbuf, 0);
944 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
945 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
946 sbc);
947 }
948 else
949 {
950 size_t len = STRINGBUF_LENGTH (buf);
951 scm_t_wchar *cbuf;
952 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
953 u32_cpy ((scm_t_uint32 *) cbuf,
954 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
955 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
956 sbc);
957 }
958 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
959 scm_from_size_t (STRINGBUF_LENGTH (buf)));
960 if (STRINGBUF_SHARED (buf))
961 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
962 SCM_BOOL_T);
963 else
964 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
965 SCM_BOOL_F);
966 if (STRINGBUF_WIDE (buf))
967 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
968 SCM_BOOL_T);
969 else
970 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
971 SCM_BOOL_F);
972
973 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
974 }
975 #undef FUNC_NAME
976
977 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
978 "Returns an association list containing debugging information\n"
979 "for @var{sym}. The association list has the following entries."
980 "@table @code\n"
981 "@item symbol\n"
982 "The symbol itself\n"
983 "@item hash\n"
984 "Its hash value\n"
985 "@item interned\n"
986 "@code{#t} if it is an interned symbol\n"
987 "@item stringbuf-chars\n"
988 "A new string containing this symbols's stringbuf's characters\n"
989 "@item stringbuf-length\n"
990 "The number of characters in this stringbuf\n"
991 "@item stringbuf-shared\n"
992 "@code{#t} if this stringbuf is shared\n"
993 "@item stringbuf-wide\n"
994 "@code{#t} if this stringbuf's characters are stored in a\n"
995 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
996 "buffer\n"
997 "@end table")
998 #define FUNC_NAME s_scm_sys_symbol_dump
999 {
1000 SCM e1, e2, e3, e4, e5, e6, e7;
1001 SCM buf;
1002 SCM_VALIDATE_SYMBOL (1, sym);
1003 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1004 sym);
1005 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1006 scm_from_ulong (scm_i_symbol_hash (sym)));
1007 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1008 scm_symbol_interned_p (sym));
1009 buf = SYMBOL_STRINGBUF (sym);
1010
1011 /* Stringbuf info */
1012 if (!STRINGBUF_WIDE (buf))
1013 {
1014 size_t len = STRINGBUF_LENGTH (buf);
1015 char *cbuf;
1016 SCM sbc = scm_i_make_string (len, &cbuf, 0);
1017 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1018 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1019 sbc);
1020 }
1021 else
1022 {
1023 size_t len = STRINGBUF_LENGTH (buf);
1024 scm_t_wchar *cbuf;
1025 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1026 u32_cpy ((scm_t_uint32 *) cbuf,
1027 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1028 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1029 sbc);
1030 }
1031 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1032 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1033 if (STRINGBUF_SHARED (buf))
1034 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1035 SCM_BOOL_T);
1036 else
1037 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1038 SCM_BOOL_F);
1039 if (STRINGBUF_WIDE (buf))
1040 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1041 SCM_BOOL_T);
1042 else
1043 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1044 SCM_BOOL_F);
1045 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1046
1047 }
1048 #undef FUNC_NAME
1049
1050 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1051
1052 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1053 #define FUNC_NAME s_scm_sys_stringbuf_hist
1054 {
1055 int i;
1056 for (i = 0; i < 1000; i++)
1057 if (lenhist[i])
1058 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1059 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1060 return SCM_UNSPECIFIED;
1061 }
1062 #undef FUNC_NAME
1063
1064 #endif
1065
1066 \f
1067
1068 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1069 (SCM obj),
1070 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1071 #define FUNC_NAME s_scm_string_p
1072 {
1073 return scm_from_bool (IS_STRING (obj));
1074 }
1075 #undef FUNC_NAME
1076
1077
1078 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1079
1080 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1081 (SCM chrs),
1082 "@deffnx {Scheme Procedure} list->string chrs\n"
1083 "Return a newly allocated string composed of the arguments,\n"
1084 "@var{chrs}.")
1085 #define FUNC_NAME s_scm_string
1086 {
1087 SCM result = SCM_BOOL_F;
1088 SCM rest;
1089 size_t len;
1090 size_t p = 0;
1091 long i;
1092 int wide = 0;
1093
1094 /* Verify that this is a list of chars. */
1095 i = scm_ilength (chrs);
1096 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1097
1098 len = (size_t) i;
1099 rest = chrs;
1100
1101 while (len > 0 && scm_is_pair (rest))
1102 {
1103 SCM elt = SCM_CAR (rest);
1104 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1105 if (SCM_CHAR (elt) > 0xFF)
1106 wide = 1;
1107 rest = SCM_CDR (rest);
1108 len--;
1109 scm_remember_upto_here_1 (elt);
1110 }
1111
1112 /* Construct a string containing this list of chars. */
1113 len = (size_t) i;
1114 rest = chrs;
1115
1116 if (wide == 0)
1117 {
1118 char *buf;
1119
1120 result = scm_i_make_string (len, NULL, 0);
1121 result = scm_i_string_start_writing (result);
1122 buf = scm_i_string_writable_chars (result);
1123 while (len > 0 && scm_is_pair (rest))
1124 {
1125 SCM elt = SCM_CAR (rest);
1126 buf[p] = (unsigned char) SCM_CHAR (elt);
1127 p++;
1128 rest = SCM_CDR (rest);
1129 len--;
1130 scm_remember_upto_here_1 (elt);
1131 }
1132 }
1133 else
1134 {
1135 scm_t_wchar *buf;
1136
1137 result = scm_i_make_wide_string (len, NULL, 0);
1138 result = scm_i_string_start_writing (result);
1139 buf = scm_i_string_writable_wide_chars (result);
1140 while (len > 0 && scm_is_pair (rest))
1141 {
1142 SCM elt = SCM_CAR (rest);
1143 buf[p] = SCM_CHAR (elt);
1144 p++;
1145 rest = SCM_CDR (rest);
1146 len--;
1147 scm_remember_upto_here_1 (elt);
1148 }
1149 }
1150 scm_i_string_stop_writing ();
1151
1152 if (len > 0)
1153 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1154 if (!scm_is_null (rest))
1155 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1156
1157 return result;
1158 }
1159 #undef FUNC_NAME
1160
1161 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1162 (SCM k, SCM chr),
1163 "Return a newly allocated string of\n"
1164 "length @var{k}. If @var{chr} is given, then all elements of\n"
1165 "the string are initialized to @var{chr}, otherwise the contents\n"
1166 "of the string are all set to @code{#\nul}.")
1167 #define FUNC_NAME s_scm_make_string
1168 {
1169 return scm_c_make_string (scm_to_size_t (k), chr);
1170 }
1171 #undef FUNC_NAME
1172
1173 SCM
1174 scm_c_make_string (size_t len, SCM chr)
1175 #define FUNC_NAME NULL
1176 {
1177 size_t p;
1178 char *contents = NULL;
1179 SCM res = scm_i_make_string (len, &contents, 0);
1180
1181 /* If no char is given, initialize string contents to NULL. */
1182 if (SCM_UNBNDP (chr))
1183 memset (contents, 0, len);
1184 else
1185 {
1186 SCM_VALIDATE_CHAR (0, chr);
1187 res = scm_i_string_start_writing (res);
1188 for (p = 0; p < len; p++)
1189 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1190 scm_i_string_stop_writing ();
1191 }
1192
1193 return res;
1194 }
1195 #undef FUNC_NAME
1196
1197 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1198 (SCM string),
1199 "Return the number of characters in @var{string}.")
1200 #define FUNC_NAME s_scm_string_length
1201 {
1202 SCM_VALIDATE_STRING (1, string);
1203 return scm_from_size_t (STRING_LENGTH (string));
1204 }
1205 #undef FUNC_NAME
1206
1207 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1208 (SCM string),
1209 "Return the bytes used to represent a character in @var{string}."
1210 "This will return 1 or 4.")
1211 #define FUNC_NAME s_scm_string_bytes_per_char
1212 {
1213 SCM_VALIDATE_STRING (1, string);
1214 if (!scm_i_is_narrow_string (string))
1215 return scm_from_int (4);
1216
1217 return scm_from_int (1);
1218 }
1219 #undef FUNC_NAME
1220
1221 size_t
1222 scm_c_string_length (SCM string)
1223 {
1224 if (!IS_STRING (string))
1225 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1226 return STRING_LENGTH (string);
1227 }
1228
1229 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1230 (SCM str, SCM k),
1231 "Return character @var{k} of @var{str} using zero-origin\n"
1232 "indexing. @var{k} must be a valid index of @var{str}.")
1233 #define FUNC_NAME s_scm_string_ref
1234 {
1235 size_t len;
1236 unsigned long idx;
1237
1238 SCM_VALIDATE_STRING (1, str);
1239
1240 len = scm_i_string_length (str);
1241 if (SCM_LIKELY (len > 0))
1242 idx = scm_to_unsigned_integer (k, 0, len - 1);
1243 else
1244 scm_out_of_range (NULL, k);
1245
1246 if (scm_i_is_narrow_string (str))
1247 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1248 else
1249 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1250 }
1251 #undef FUNC_NAME
1252
1253 SCM
1254 scm_c_string_ref (SCM str, size_t p)
1255 {
1256 if (p >= scm_i_string_length (str))
1257 scm_out_of_range (NULL, scm_from_size_t (p));
1258 if (scm_i_is_narrow_string (str))
1259 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1260 else
1261 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1262
1263 }
1264
1265 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1266 (SCM str, SCM k, SCM chr),
1267 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1268 "an unspecified value. @var{k} must be a valid index of\n"
1269 "@var{str}.")
1270 #define FUNC_NAME s_scm_string_set_x
1271 {
1272 size_t len;
1273 unsigned long idx;
1274
1275 SCM_VALIDATE_STRING (1, str);
1276
1277 len = scm_i_string_length (str);
1278 if (SCM_LIKELY (len > 0))
1279 idx = scm_to_unsigned_integer (k, 0, len - 1);
1280 else
1281 scm_out_of_range (NULL, k);
1282
1283 SCM_VALIDATE_CHAR (3, chr);
1284 str = scm_i_string_start_writing (str);
1285 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1286 scm_i_string_stop_writing ();
1287
1288 return SCM_UNSPECIFIED;
1289 }
1290 #undef FUNC_NAME
1291
1292 void
1293 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1294 {
1295 if (p >= scm_i_string_length (str))
1296 scm_out_of_range (NULL, scm_from_size_t (p));
1297 str = scm_i_string_start_writing (str);
1298 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1299 scm_i_string_stop_writing ();
1300 }
1301
1302 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1303 (SCM str, SCM start, SCM end),
1304 "Return a newly allocated string formed from the characters\n"
1305 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1306 "ending with index @var{end} (exclusive).\n"
1307 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1308 "exact integers satisfying:\n\n"
1309 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1310 #define FUNC_NAME s_scm_substring
1311 {
1312 size_t len, from, to;
1313
1314 SCM_VALIDATE_STRING (1, str);
1315 len = scm_i_string_length (str);
1316 from = scm_to_unsigned_integer (start, 0, len);
1317 if (SCM_UNBNDP (end))
1318 to = len;
1319 else
1320 to = scm_to_unsigned_integer (end, from, len);
1321 return scm_i_substring (str, from, to);
1322 }
1323 #undef FUNC_NAME
1324
1325 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1326 (SCM str, SCM start, SCM end),
1327 "Return a newly allocated string formed from the characters\n"
1328 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1329 "ending with index @var{end} (exclusive).\n"
1330 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1331 "exact integers satisfying:\n"
1332 "\n"
1333 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1334 "\n"
1335 "The returned string is read-only.\n")
1336 #define FUNC_NAME s_scm_substring_read_only
1337 {
1338 size_t len, from, to;
1339
1340 SCM_VALIDATE_STRING (1, str);
1341 len = scm_i_string_length (str);
1342 from = scm_to_unsigned_integer (start, 0, len);
1343 if (SCM_UNBNDP (end))
1344 to = len;
1345 else
1346 to = scm_to_unsigned_integer (end, from, len);
1347 return scm_i_substring_read_only (str, from, to);
1348 }
1349 #undef FUNC_NAME
1350
1351 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1352 (SCM str, SCM start, SCM end),
1353 "Return a newly allocated string formed from the characters\n"
1354 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1355 "ending with index @var{end} (exclusive).\n"
1356 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1357 "exact integers satisfying:\n\n"
1358 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1359 #define FUNC_NAME s_scm_substring_copy
1360 {
1361 /* For the Scheme version, START is mandatory, but for the C
1362 version, it is optional. See scm_string_copy in srfi-13.c for a
1363 rationale.
1364 */
1365
1366 size_t from, to;
1367
1368 SCM_VALIDATE_STRING (1, str);
1369 scm_i_get_substring_spec (scm_i_string_length (str),
1370 start, &from, end, &to);
1371 return scm_i_substring_copy (str, from, to);
1372 }
1373 #undef FUNC_NAME
1374
1375 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1376 (SCM str, SCM start, SCM end),
1377 "Return string that indirectly refers to the characters\n"
1378 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1379 "ending with index @var{end} (exclusive).\n"
1380 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1381 "exact integers satisfying:\n\n"
1382 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1383 #define FUNC_NAME s_scm_substring_shared
1384 {
1385 size_t len, from, to;
1386
1387 SCM_VALIDATE_STRING (1, str);
1388 len = scm_i_string_length (str);
1389 from = scm_to_unsigned_integer (start, 0, len);
1390 if (SCM_UNBNDP (end))
1391 to = len;
1392 else
1393 to = scm_to_unsigned_integer (end, from, len);
1394 return scm_i_substring_shared (str, from, to);
1395 }
1396 #undef FUNC_NAME
1397
1398 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1399 (SCM args),
1400 "Return a newly allocated string whose characters form the\n"
1401 "concatenation of the given strings, @var{args}.")
1402 #define FUNC_NAME s_scm_string_append
1403 {
1404 SCM res;
1405 size_t len = 0;
1406 int wide = 0;
1407 SCM l, s;
1408 size_t i;
1409 union
1410 {
1411 char *narrow;
1412 scm_t_wchar *wide;
1413 } data;
1414
1415 SCM_VALIDATE_REST_ARGUMENT (args);
1416 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1417 {
1418 s = SCM_CAR (l);
1419 SCM_VALIDATE_STRING (SCM_ARGn, s);
1420 len += scm_i_string_length (s);
1421 if (!scm_i_is_narrow_string (s))
1422 wide = 1;
1423 }
1424 data.narrow = NULL;
1425 if (!wide)
1426 res = scm_i_make_string (len, &data.narrow, 0);
1427 else
1428 res = scm_i_make_wide_string (len, &data.wide, 0);
1429
1430 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1431 {
1432 size_t len;
1433 s = SCM_CAR (l);
1434 SCM_VALIDATE_STRING (SCM_ARGn, s);
1435 len = scm_i_string_length (s);
1436 if (!wide)
1437 {
1438 memcpy (data.narrow, scm_i_string_chars (s), len);
1439 data.narrow += len;
1440 }
1441 else
1442 {
1443 if (scm_i_is_narrow_string (s))
1444 {
1445 for (i = 0; i < scm_i_string_length (s); i++)
1446 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1447 }
1448 else
1449 u32_cpy ((scm_t_uint32 *) data.wide,
1450 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1451 data.wide += len;
1452 }
1453 scm_remember_upto_here_1 (s);
1454 }
1455 return res;
1456 }
1457 #undef FUNC_NAME
1458
1459
1460 \f
1461 /* Charset conversion error handling. */
1462
1463 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1464 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1465
1466 /* Raise an exception informing that character CHR could not be written
1467 to PORT in its current encoding. */
1468 void
1469 scm_encoding_error (const char *subr, int err, const char *message,
1470 SCM port, SCM chr)
1471 {
1472 scm_throw (scm_encoding_error_key,
1473 scm_list_n (scm_from_latin1_string (subr),
1474 scm_from_latin1_string (message),
1475 scm_from_int (err),
1476 port, chr,
1477 SCM_UNDEFINED));
1478 }
1479
1480 /* Raise an exception informing of an encoding error on PORT. This
1481 means that a character could not be written in PORT's encoding. */
1482 void
1483 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1484 {
1485 scm_throw (scm_decoding_error_key,
1486 scm_list_n (scm_from_latin1_string (subr),
1487 scm_from_latin1_string (message),
1488 scm_from_int (err),
1489 port,
1490 SCM_UNDEFINED));
1491 }
1492
1493 \f
1494 /* String conversion to/from C. */
1495
1496 static void
1497 decoding_error (const char *func_name, int errno_save,
1498 const char *str, size_t len)
1499 {
1500 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1501 handler. */
1502 SCM bv;
1503 signed char *buf;
1504
1505 buf = scm_gc_malloc_pointerless (len, "bytevector");
1506 memcpy (buf, str, len);
1507 bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1508
1509 scm_decoding_error (func_name, errno_save,
1510 "input locale conversion error", bv);
1511 }
1512
1513 SCM
1514 scm_from_stringn (const char *str, size_t len, const char *encoding,
1515 scm_t_string_failed_conversion_handler handler)
1516 {
1517 size_t u32len, i;
1518 scm_t_wchar *u32;
1519 int wide = 0;
1520 SCM res;
1521
1522 /* The order of these checks is important. */
1523 if (!str && len != 0)
1524 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1525 if (len == (size_t) -1)
1526 len = strlen (str);
1527
1528 if (encoding == NULL || len == 0)
1529 return scm_from_latin1_stringn (str, len);
1530 else if (strcmp (encoding, "UTF-8") == 0)
1531 return scm_from_utf8_stringn (str, len);
1532
1533 u32len = 0;
1534 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1535 (enum iconv_ilseq_handler)
1536 handler,
1537 str, len,
1538 NULL,
1539 NULL, &u32len);
1540
1541 if (SCM_UNLIKELY (u32 == NULL))
1542 decoding_error (__func__, errno, str, len);
1543
1544 i = 0;
1545 while (i < u32len)
1546 if (u32[i++] > 0xFF)
1547 {
1548 wide = 1;
1549 break;
1550 }
1551
1552 if (!wide)
1553 {
1554 char *dst;
1555 res = scm_i_make_string (u32len, &dst, 0);
1556 for (i = 0; i < u32len; i ++)
1557 dst[i] = (unsigned char) u32[i];
1558 dst[u32len] = '\0';
1559 }
1560 else
1561 {
1562 scm_t_wchar *wdst;
1563 res = scm_i_make_wide_string (u32len, &wdst, 0);
1564 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1565 wdst[u32len] = 0;
1566 }
1567
1568 free (u32);
1569 return res;
1570 }
1571
1572 SCM
1573 scm_from_locale_string (const char *str)
1574 {
1575 return scm_from_locale_stringn (str, -1);
1576 }
1577
1578 SCM
1579 scm_from_locale_stringn (const char *str, size_t len)
1580 {
1581 return scm_from_stringn (str, len, locale_charset (),
1582 scm_i_get_conversion_strategy (SCM_BOOL_F));
1583 }
1584
1585 SCM
1586 scm_from_latin1_string (const char *str)
1587 {
1588 return scm_from_latin1_stringn (str, -1);
1589 }
1590
1591 SCM
1592 scm_from_latin1_stringn (const char *str, size_t len)
1593 {
1594 char *buf;
1595 SCM result;
1596
1597 if (len == (size_t) -1)
1598 len = strlen (str);
1599
1600 /* Make a narrow string and copy STR as is. */
1601 result = scm_i_make_string (len, &buf, 0);
1602 memcpy (buf, str, len);
1603
1604 return result;
1605 }
1606
1607 SCM
1608 scm_from_utf8_string (const char *str)
1609 {
1610 return scm_from_utf8_stringn (str, -1);
1611 }
1612
1613 SCM
1614 scm_from_utf8_stringn (const char *str, size_t len)
1615 {
1616 size_t i, char_len;
1617 const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1618 int ascii = 1, narrow = 1;
1619 SCM res;
1620
1621 if (len == (size_t) -1)
1622 len = strlen (str);
1623
1624 i = 0;
1625 char_len = 0;
1626
1627 while (i < len)
1628 {
1629 if (ustr[i] <= 127)
1630 {
1631 char_len++;
1632 i++;
1633 }
1634 else
1635 {
1636 ucs4_t c;
1637 int nbytes;
1638
1639 ascii = 0;
1640
1641 nbytes = u8_mbtouc (&c, ustr + i, len - i);
1642
1643 if (nbytes < 0)
1644 /* Bad UTF-8. */
1645 decoding_error (__func__, errno, str, len);
1646
1647 if (c > 255)
1648 narrow = 0;
1649
1650 char_len++;
1651 i += nbytes;
1652 }
1653 }
1654
1655 if (ascii)
1656 {
1657 char *dst;
1658 res = scm_i_make_string (char_len, &dst, 0);
1659 memcpy (dst, str, len);
1660 }
1661 else if (narrow)
1662 {
1663 char *dst;
1664 size_t j;
1665 ucs4_t c;
1666
1667 res = scm_i_make_string (char_len, &dst, 0);
1668
1669 for (i = 0, j = 0; i < len; j++)
1670 {
1671 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1672 dst[j] = (signed char) c;
1673 }
1674 }
1675 else
1676 {
1677 scm_t_wchar *dst;
1678 size_t j;
1679 ucs4_t c;
1680
1681 res = scm_i_make_wide_string (char_len, &dst, 0);
1682
1683 for (i = 0, j = 0; i < len; j++)
1684 {
1685 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1686 dst[j] = c;
1687 }
1688 }
1689
1690 return res;
1691 }
1692
1693 SCM
1694 scm_from_utf32_string (const scm_t_wchar *str)
1695 {
1696 return scm_from_utf32_stringn (str, -1);
1697 }
1698
1699 SCM
1700 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1701 {
1702 SCM result;
1703 scm_t_wchar *buf;
1704
1705 if (len == (size_t) -1)
1706 len = u32_strlen ((uint32_t *) str);
1707
1708 result = scm_i_make_wide_string (len, &buf, 0);
1709 memcpy (buf, str, len * sizeof (scm_t_wchar));
1710 scm_i_try_narrow_string (result);
1711
1712 return result;
1713 }
1714
1715 /* Create a new scheme string from the C string STR. The memory of
1716 STR may be used directly as storage for the new string. */
1717 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1718 would be to register a finalizer to eventually free(3) STR, which isn't
1719 worth it. Should we just deprecate the `scm_take_' functions? */
1720 SCM
1721 scm_take_locale_stringn (char *str, size_t len)
1722 {
1723 SCM res;
1724
1725 res = scm_from_locale_stringn (str, len);
1726 free (str);
1727
1728 return res;
1729 }
1730
1731 SCM
1732 scm_take_locale_string (char *str)
1733 {
1734 return scm_take_locale_stringn (str, -1);
1735 }
1736
1737 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1738 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1739 Set *LENP to the size of the resulting string.
1740
1741 FIXME: This is a hack we should get rid of. See
1742 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1743 for details. */
1744 static void
1745 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1746 {
1747 char *before, *after;
1748 size_t i, j;
1749
1750 before = buf;
1751 after = buf;
1752 i = 0;
1753 j = 0;
1754 while (i < *lenp)
1755 {
1756 if ((i <= *lenp - 6)
1757 && before[i] == '\\'
1758 && before[i + 1] == 'u'
1759 && before[i + 2] == '0' && before[i + 3] == '0')
1760 {
1761 /* Convert \u00NN to \xNN */
1762 after[j] = '\\';
1763 after[j + 1] = 'x';
1764 after[j + 2] = tolower ((int) before[i + 4]);
1765 after[j + 3] = tolower ((int) before[i + 5]);
1766 i += 6;
1767 j += 4;
1768 }
1769 else if ((i <= *lenp - 10)
1770 && before[i] == '\\'
1771 && before[i + 1] == 'U'
1772 && before[i + 2] == '0' && before[i + 3] == '0')
1773 {
1774 /* Convert \U00NNNNNN to \UNNNNNN */
1775 after[j] = '\\';
1776 after[j + 1] = 'U';
1777 after[j + 2] = tolower ((int) before[i + 4]);
1778 after[j + 3] = tolower ((int) before[i + 5]);
1779 after[j + 4] = tolower ((int) before[i + 6]);
1780 after[j + 5] = tolower ((int) before[i + 7]);
1781 after[j + 6] = tolower ((int) before[i + 8]);
1782 after[j + 7] = tolower ((int) before[i + 9]);
1783 i += 10;
1784 j += 8;
1785 }
1786 else
1787 {
1788 after[j] = before[i];
1789 i++;
1790 j++;
1791 }
1792 }
1793 *lenp = j;
1794 }
1795
1796 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1797 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1798 of the resulting string. BUF must be large enough to handle the
1799 worst case when `\uXXXX' escapes (6 characters) are replaced by
1800 `\xXXXX;' (7 characters). */
1801 static void
1802 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1803 {
1804 char *before, *after;
1805 size_t i, j;
1806 /* The worst case is if the input string contains all 4-digit hex escapes.
1807 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1808 size_t max_out_len = (*lenp * 7) / 6 + 1;
1809 size_t nzeros, ndigits;
1810
1811 before = buf;
1812 after = alloca (max_out_len);
1813 i = 0;
1814 j = 0;
1815 while (i < *lenp)
1816 {
1817 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1818 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1819 {
1820 if (before[i + 1] == 'u')
1821 ndigits = 4;
1822 else if (before[i + 1] == 'U')
1823 ndigits = 8;
1824 else
1825 abort ();
1826
1827 /* Add the R6RS hex escape initial sequence. */
1828 after[j] = '\\';
1829 after[j + 1] = 'x';
1830
1831 /* Move string positions to the start of the hex numbers. */
1832 i += 2;
1833 j += 2;
1834
1835 /* Find the number of initial zeros in this hex number. */
1836 nzeros = 0;
1837 while (before[i + nzeros] == '0' && nzeros < ndigits)
1838 nzeros++;
1839
1840 /* Copy the number, skipping initial zeros, and then move the string
1841 positions. */
1842 if (nzeros == ndigits)
1843 {
1844 after[j] = '0';
1845 i += ndigits;
1846 j += 1;
1847 }
1848 else
1849 {
1850 int pos;
1851 for (pos = 0; pos < ndigits - nzeros; pos++)
1852 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1853 i += ndigits;
1854 j += (ndigits - nzeros);
1855 }
1856
1857 /* Add terminating semicolon. */
1858 after[j] = ';';
1859 j++;
1860 }
1861 else
1862 {
1863 after[j] = before[i];
1864 i++;
1865 j++;
1866 }
1867 }
1868 *lenp = j;
1869 memcpy (before, after, j);
1870 }
1871
1872 char *
1873 scm_to_locale_string (SCM str)
1874 {
1875 return scm_to_locale_stringn (str, NULL);
1876 }
1877
1878 char *
1879 scm_to_locale_stringn (SCM str, size_t *lenp)
1880 {
1881 return scm_to_stringn (str, lenp,
1882 locale_charset (),
1883 scm_i_get_conversion_strategy (SCM_BOOL_F));
1884 }
1885
1886 char *
1887 scm_to_latin1_string (SCM str)
1888 {
1889 return scm_to_latin1_stringn (str, NULL);
1890 }
1891
1892 char *
1893 scm_to_latin1_stringn (SCM str, size_t *lenp)
1894 #define FUNC_NAME "scm_to_latin1_stringn"
1895 {
1896 char *result;
1897
1898 SCM_VALIDATE_STRING (1, str);
1899
1900 if (scm_i_is_narrow_string (str))
1901 {
1902 size_t len = scm_i_string_length (str);
1903
1904 if (lenp)
1905 *lenp = len;
1906
1907 result = scm_strndup (scm_i_string_data (str), len);
1908 }
1909 else
1910 result = scm_to_stringn (str, lenp, NULL,
1911 SCM_FAILED_CONVERSION_ERROR);
1912
1913 return result;
1914 }
1915 #undef FUNC_NAME
1916
1917 char *
1918 scm_to_utf8_string (SCM str)
1919 {
1920 return scm_to_utf8_stringn (str, NULL);
1921 }
1922
1923 static size_t
1924 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1925 {
1926 size_t ret, i;
1927 for (i = 0, ret = 0; i < len; i++)
1928 ret += (str[i] < 128) ? 1 : 2;
1929 return ret;
1930 }
1931
1932 static scm_t_uint8*
1933 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1934 scm_t_uint8 *u8_result, size_t *u8_lenp)
1935 {
1936 size_t i, n;
1937 size_t u8_len = latin1_u8_strlen (str, latin_len);
1938
1939 if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1940 u8_result = scm_malloc (u8_len + 1);
1941 if (u8_lenp)
1942 *u8_lenp = u8_len;
1943
1944 for (i = 0, n = 0; i < latin_len; i++)
1945 n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1946 if (n != u8_len)
1947 abort ();
1948 u8_result[n] = 0;
1949
1950 return u8_result;
1951 }
1952
1953 char *
1954 scm_to_utf8_stringn (SCM str, size_t *lenp)
1955 {
1956 if (scm_i_is_narrow_string (str))
1957 return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
1958 scm_i_string_length (str),
1959 NULL, lenp);
1960 else
1961 return (char *) u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
1962 scm_i_string_length (str),
1963 NULL, lenp);
1964 }
1965
1966 scm_t_wchar *
1967 scm_to_utf32_string (SCM str)
1968 {
1969 return scm_to_utf32_stringn (str, NULL);
1970 }
1971
1972 scm_t_wchar *
1973 scm_to_utf32_stringn (SCM str, size_t *lenp)
1974 #define FUNC_NAME "scm_to_utf32_stringn"
1975 {
1976 scm_t_wchar *result;
1977
1978 SCM_VALIDATE_STRING (1, str);
1979
1980 if (scm_i_is_narrow_string (str))
1981 {
1982 scm_t_uint8 *codepoints;
1983 size_t i, len;
1984
1985 codepoints = (scm_t_uint8*) scm_i_string_chars (str);
1986 len = scm_i_string_length (str);
1987 if (lenp)
1988 *lenp = len;
1989
1990 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1991 for (i = 0; i < len; i++)
1992 result[i] = codepoints[i];
1993 result[len] = 0;
1994 }
1995 else
1996 {
1997 size_t len;
1998
1999 len = scm_i_string_length (str);
2000 if (lenp)
2001 *lenp = len;
2002
2003 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2004 memcpy (result, scm_i_string_wide_chars (str),
2005 len * sizeof (scm_t_wchar));
2006 result[len] = 0;
2007 }
2008
2009 return result;
2010 }
2011 #undef FUNC_NAME
2012
2013 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2014 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2015 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2016 defined by HANDLER. */
2017 char *
2018 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2019 scm_t_string_failed_conversion_handler handler)
2020 {
2021 char *buf;
2022 size_t ilen, len, i;
2023 int ret;
2024 const char *enc;
2025
2026 if (!scm_is_string (str))
2027 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2028 ilen = scm_i_string_length (str);
2029
2030 if (ilen == 0)
2031 {
2032 buf = scm_malloc (1);
2033 buf[0] = '\0';
2034 if (lenp)
2035 *lenp = 0;
2036 return buf;
2037 }
2038
2039 if (lenp == NULL)
2040 for (i = 0; i < ilen; i++)
2041 if (scm_i_string_ref (str, i) == '\0')
2042 scm_misc_error (NULL,
2043 "string contains #\\nul character: ~S",
2044 scm_list_1 (str));
2045
2046 if (scm_i_is_narrow_string (str) && (encoding == NULL))
2047 {
2048 /* If using native Latin-1 encoding, just copy the string
2049 contents. */
2050 if (lenp)
2051 {
2052 buf = scm_malloc (ilen);
2053 memcpy (buf, scm_i_string_chars (str), ilen);
2054 *lenp = ilen;
2055 return buf;
2056 }
2057 else
2058 {
2059 buf = scm_malloc (ilen + 1);
2060 memcpy (buf, scm_i_string_chars (str), ilen);
2061 buf[ilen] = '\0';
2062 return buf;
2063 }
2064 }
2065
2066
2067 buf = NULL;
2068 len = 0;
2069 enc = encoding;
2070 if (enc == NULL)
2071 enc = "ISO-8859-1";
2072 if (scm_i_is_narrow_string (str))
2073 {
2074 ret = mem_iconveh (scm_i_string_chars (str), ilen,
2075 "ISO-8859-1", enc,
2076 (enum iconv_ilseq_handler) handler, NULL,
2077 &buf, &len);
2078
2079 if (ret != 0)
2080 scm_encoding_error (__func__, errno,
2081 "cannot convert narrow string to output locale",
2082 SCM_BOOL_F,
2083 /* FIXME: Faulty character unknown. */
2084 SCM_BOOL_F);
2085 }
2086 else
2087 {
2088 buf = u32_conv_to_encoding (enc,
2089 (enum iconv_ilseq_handler) handler,
2090 (scm_t_uint32 *) scm_i_string_wide_chars (str),
2091 ilen,
2092 NULL,
2093 NULL, &len);
2094 if (buf == NULL)
2095 scm_encoding_error (__func__, errno,
2096 "cannot convert wide string to output locale",
2097 SCM_BOOL_F,
2098 /* FIXME: Faulty character unknown. */
2099 SCM_BOOL_F);
2100 }
2101 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2102 {
2103 if (SCM_R6RS_ESCAPES_P)
2104 {
2105 /* The worst case is if the input string contains all 4-digit
2106 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2107 (seven characters). Make BUF large enough to hold
2108 that. */
2109 buf = scm_realloc (buf, (len * 7) / 6 + 1);
2110 unistring_escapes_to_r6rs_escapes (buf, &len);
2111 }
2112 else
2113 unistring_escapes_to_guile_escapes (buf, &len);
2114
2115 buf = scm_realloc (buf, len);
2116 }
2117 if (lenp)
2118 *lenp = len;
2119 else
2120 {
2121 buf = scm_realloc (buf, len + 1);
2122 buf[len] = '\0';
2123 }
2124
2125 scm_remember_upto_here_1 (str);
2126 return buf;
2127 }
2128
2129 size_t
2130 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2131 {
2132 size_t len;
2133 char *result = NULL;
2134 if (!scm_is_string (str))
2135 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2136 result = scm_to_locale_stringn (str, &len);
2137
2138 memcpy (buf, result, (len > max_len) ? max_len : len);
2139 free (result);
2140
2141 scm_remember_upto_here_1 (str);
2142 return len;
2143 }
2144
2145 \f
2146 /* Unicode string normalization. */
2147
2148 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2149 libguile/i18n.c. It would be useful to have this factored out into a more
2150 convenient location, but its use of alloca makes that tricky to do. */
2151
2152 static SCM
2153 normalize_str (SCM string, uninorm_t form)
2154 {
2155 SCM ret;
2156 scm_t_uint32 *w_str;
2157 scm_t_wchar *cbuf;
2158 size_t rlen, len = scm_i_string_length (string);
2159
2160 if (scm_i_is_narrow_string (string))
2161 {
2162 size_t i;
2163 const char *buf = scm_i_string_chars (string);
2164
2165 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2166
2167 for (i = 0; i < len; i ++)
2168 w_str[i] = (unsigned char) buf[i];
2169 w_str[len] = 0;
2170 }
2171 else
2172 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2173
2174 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2175
2176 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2177 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2178 free (w_str);
2179
2180 scm_i_try_narrow_string (ret);
2181
2182 return ret;
2183 }
2184
2185 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2186 (SCM string),
2187 "Returns the NFC normalized form of @var{string}.")
2188 #define FUNC_NAME s_scm_string_normalize_nfc
2189 {
2190 SCM_VALIDATE_STRING (1, string);
2191 return normalize_str (string, UNINORM_NFC);
2192 }
2193 #undef FUNC_NAME
2194
2195 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2196 (SCM string),
2197 "Returns the NFD normalized form of @var{string}.")
2198 #define FUNC_NAME s_scm_string_normalize_nfd
2199 {
2200 SCM_VALIDATE_STRING (1, string);
2201 return normalize_str (string, UNINORM_NFD);
2202 }
2203 #undef FUNC_NAME
2204
2205 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2206 (SCM string),
2207 "Returns the NFKC normalized form of @var{string}.")
2208 #define FUNC_NAME s_scm_string_normalize_nfkc
2209 {
2210 SCM_VALIDATE_STRING (1, string);
2211 return normalize_str (string, UNINORM_NFKC);
2212 }
2213 #undef FUNC_NAME
2214
2215 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2216 (SCM string),
2217 "Returns the NFKD normalized form of @var{string}.")
2218 #define FUNC_NAME s_scm_string_normalize_nfkd
2219 {
2220 SCM_VALIDATE_STRING (1, string);
2221 return normalize_str (string, UNINORM_NFKD);
2222 }
2223 #undef FUNC_NAME
2224
2225 /* converts C scm_array of strings to SCM scm_list of strings.
2226 If argc < 0, a null terminated scm_array is assumed.
2227 The current locale encoding is assumed */
2228 SCM
2229 scm_makfromstrs (int argc, char **argv)
2230 {
2231 int i = argc;
2232 SCM lst = SCM_EOL;
2233 if (0 > i)
2234 for (i = 0; argv[i]; i++);
2235 while (i--)
2236 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2237 return lst;
2238 }
2239
2240 /* Return a newly allocated array of char pointers to each of the strings
2241 in args, with a terminating NULL pointer. The strings are encoded using
2242 the current locale. */
2243
2244 char **
2245 scm_i_allocate_string_pointers (SCM list)
2246 #define FUNC_NAME "scm_i_allocate_string_pointers"
2247 {
2248 char **result;
2249 int list_len = scm_ilength (list);
2250 int i;
2251
2252 if (list_len < 0)
2253 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2254
2255 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2256 "string pointers");
2257 result[list_len] = NULL;
2258
2259 /* The list might have been modified in another thread, so
2260 we check LIST before each access.
2261 */
2262 for (i = 0; i < list_len && scm_is_pair (list); i++)
2263 {
2264 SCM str = SCM_CAR (list);
2265 size_t len; /* String length in bytes */
2266 char *c_str = scm_to_locale_stringn (str, &len);
2267
2268 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2269 scm_malloc to allocate the returned string, which must be
2270 explicitly deallocated. This forces us to copy the string a
2271 second time into a new buffer. Ideally there would be variants
2272 of scm_to_*_stringn that can return garbage-collected buffers. */
2273
2274 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2275 memcpy (result[i], c_str, len);
2276 result[i][len] = '\0';
2277 free (c_str);
2278
2279 list = SCM_CDR (list);
2280 }
2281
2282 return result;
2283 }
2284 #undef FUNC_NAME
2285
2286 void
2287 scm_i_get_substring_spec (size_t len,
2288 SCM start, size_t *cstart,
2289 SCM end, size_t *cend)
2290 {
2291 if (SCM_UNBNDP (start))
2292 *cstart = 0;
2293 else
2294 *cstart = scm_to_unsigned_integer (start, 0, len);
2295
2296 if (SCM_UNBNDP (end))
2297 *cend = len;
2298 else
2299 *cend = scm_to_unsigned_integer (end, *cstart, len);
2300 }
2301
2302 static SCM
2303 string_handle_ref (scm_t_array_handle *h, size_t index)
2304 {
2305 return scm_c_string_ref (h->array, index);
2306 }
2307
2308 static void
2309 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2310 {
2311 scm_c_string_set_x (h->array, index, val);
2312 }
2313
2314 static void
2315 string_get_handle (SCM v, scm_t_array_handle *h)
2316 {
2317 h->array = v;
2318 h->ndims = 1;
2319 h->dims = &h->dim0;
2320 h->dim0.lbnd = 0;
2321 h->dim0.ubnd = scm_c_string_length (v) - 1;
2322 h->dim0.inc = 1;
2323 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2324 h->elements = h->writable_elements = NULL;
2325 }
2326
2327 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2328 string_handle_ref, string_handle_set,
2329 string_get_handle)
2330 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2331
2332 void
2333 scm_init_strings ()
2334 {
2335 scm_nullstr = scm_i_make_string (0, NULL, 0);
2336
2337 #include "libguile/strings.x"
2338 }
2339
2340
2341 /*
2342 Local Variables:
2343 c-file-style: "gnu"
2344 End:
2345 */