Merge remote-tracking branch 'local-2.0/stable-2.0'
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <alloca.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <uninorm.h>
30 #include <unistr.h>
31 #include <uniconv.h>
32
33 #include "striconveh.h"
34
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
44
45 \f
46
47 /* {Strings}
48 */
49
50
51 /* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
68 *
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
72 */
73
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
76
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
81
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
85
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
91
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
93
94 #define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
103
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist[1001];
106 #endif
107
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
110 static SCM
111 make_stringbuf (size_t len)
112 {
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
120 SCM buf;
121
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127 #endif
128
129 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
138 }
139
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
142 static SCM
143 make_wide_stringbuf (size_t len)
144 {
145 SCM buf;
146 size_t raw_len;
147
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153 #endif
154
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
165 }
166
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169 static SCM
170 wide_stringbuf (SCM buf)
171 {
172 SCM new_buf;
173
174 if (STRINGBUF_WIDE (buf))
175 new_buf = buf;
176 else
177 {
178 size_t i, len;
179 scm_t_wchar *mem;
180
181 len = STRINGBUF_LENGTH (buf);
182
183 new_buf = make_wide_stringbuf (len);
184
185 mem = STRINGBUF_WIDE_CHARS (new_buf);
186 for (i = 0; i < len; i++)
187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
188 mem[len] = 0;
189 }
190
191 return new_buf;
192 }
193
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196 static SCM
197 narrow_stringbuf (SCM buf)
198 {
199 SCM new_buf;
200
201 if (!STRINGBUF_WIDE (buf))
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
208
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
211
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
216
217 new_buf = make_stringbuf (len);
218
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
226 }
227
228 \f
229
230 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
231 SCM_PTHREAD_ATFORK_LOCK_STATIC_MUTEX (stringbuf_write_mutex);
232
233 static scm_i_pthread_mutex_t iconv_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
234
235 void
236 scm_i_lock_iconv (void)
237 {
238 scm_i_pthread_mutex_lock (&iconv_mutex);
239 }
240
241 void
242 scm_i_unlock_iconv (void)
243 {
244 scm_i_pthread_mutex_unlock (&iconv_mutex);
245 }
246
247 \f
248 /* Copy-on-write strings.
249 */
250
251 #define STRING_TAG scm_tc7_string
252
253 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
254 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
255 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
256
257 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
258 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
259
260 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
261
262 /* Read-only strings.
263 */
264
265 #define RO_STRING_TAG scm_tc7_ro_string
266 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
267
268 /* Mutation-sharing substrings
269 */
270
271 #define SH_STRING_TAG (scm_tc7_string + 0x100)
272
273 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
274 /* START and LENGTH as for STRINGs. */
275
276 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
277
278 SCM scm_nullstr;
279
280 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
281 characters. CHARSP, if not NULL, will be set to location of the
282 char array. If READ_ONLY_P, the returned string is read-only;
283 otherwise it is writable. */
284 SCM
285 scm_i_make_string (size_t len, char **charsp, int read_only_p)
286 {
287 static SCM null_stringbuf = SCM_BOOL_F;
288 SCM buf;
289 SCM res;
290
291 if (len == 0)
292 {
293 if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
294 {
295 null_stringbuf = make_stringbuf (0);
296 SET_STRINGBUF_SHARED (null_stringbuf);
297 }
298 buf = null_stringbuf;
299 }
300 else
301 buf = make_stringbuf (len);
302
303 if (charsp)
304 *charsp = (char *) STRINGBUF_CHARS (buf);
305 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
306 SCM_UNPACK (buf),
307 (scm_t_bits) 0, (scm_t_bits) len);
308 return res;
309 }
310
311 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
312 characters. CHARSP, if not NULL, will be set to location of the
313 character array. If READ_ONLY_P, the returned string is read-only;
314 otherwise it is writable. */
315 SCM
316 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
317 {
318 SCM buf = make_wide_stringbuf (len);
319 SCM res;
320 if (charsp)
321 *charsp = STRINGBUF_WIDE_CHARS (buf);
322 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
323 SCM_UNPACK (buf),
324 (scm_t_bits) 0, (scm_t_bits) len);
325 return res;
326 }
327
328 static void
329 validate_substring_args (SCM str, size_t start, size_t end)
330 {
331 if (!IS_STRING (str))
332 scm_wrong_type_arg_msg (NULL, 0, str, "string");
333 if (start > STRING_LENGTH (str))
334 scm_out_of_range (NULL, scm_from_size_t (start));
335 if (end > STRING_LENGTH (str) || end < start)
336 scm_out_of_range (NULL, scm_from_size_t (end));
337 }
338
339 static inline void
340 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
341 {
342 *start = STRING_START (*str);
343 if (IS_SH_STRING (*str))
344 {
345 *str = SH_STRING_STRING (*str);
346 *start += STRING_START (*str);
347 }
348 *buf = STRING_STRINGBUF (*str);
349 }
350
351 SCM
352 scm_i_substring (SCM str, size_t start, size_t end)
353 {
354 if (start == end)
355 return scm_i_make_string (0, NULL, 0);
356 else
357 {
358 SCM buf;
359 size_t str_start;
360 get_str_buf_start (&str, &buf, &str_start);
361 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
362 SET_STRINGBUF_SHARED (buf);
363 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
364 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
365 (scm_t_bits)str_start + start,
366 (scm_t_bits) end - start);
367 }
368 }
369
370 SCM
371 scm_i_substring_read_only (SCM str, size_t start, size_t end)
372 {
373 if (start == end)
374 return scm_i_make_string (0, NULL, 1);
375 else
376 {
377 SCM buf;
378 size_t str_start;
379 get_str_buf_start (&str, &buf, &str_start);
380 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
381 SET_STRINGBUF_SHARED (buf);
382 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
383 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
384 (scm_t_bits)str_start + start,
385 (scm_t_bits) end - start);
386 }
387 }
388
389 SCM
390 scm_i_substring_copy (SCM str, size_t start, size_t end)
391 {
392 if (start == end)
393 return scm_i_make_string (0, NULL, 0);
394 else
395 {
396 size_t len = end - start;
397 SCM buf, my_buf, substr;
398 size_t str_start;
399 int wide = 0;
400 get_str_buf_start (&str, &buf, &str_start);
401 if (scm_i_is_narrow_string (str))
402 {
403 my_buf = make_stringbuf (len);
404 memcpy (STRINGBUF_CHARS (my_buf),
405 STRINGBUF_CHARS (buf) + str_start + start, len);
406 }
407 else
408 {
409 my_buf = make_wide_stringbuf (len);
410 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
411 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
412 + start), len);
413 wide = 1;
414 }
415 scm_remember_upto_here_1 (buf);
416 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
417 (scm_t_bits) 0, (scm_t_bits) len);
418 if (wide)
419 scm_i_try_narrow_string (substr);
420 return substr;
421 }
422 }
423
424 SCM
425 scm_i_substring_shared (SCM str, size_t start, size_t end)
426 {
427 if (start == 0 && end == STRING_LENGTH (str))
428 return str;
429 else if (start == end)
430 return scm_i_make_string (0, NULL, 0);
431 else
432 {
433 size_t len = end - start;
434 if (IS_SH_STRING (str))
435 {
436 start += STRING_START (str);
437 str = SH_STRING_STRING (str);
438 }
439 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
440 (scm_t_bits)start, (scm_t_bits) len);
441 }
442 }
443
444 SCM
445 scm_c_substring (SCM str, size_t start, size_t end)
446 {
447 validate_substring_args (str, start, end);
448 return scm_i_substring (str, start, end);
449 }
450
451 SCM
452 scm_c_substring_read_only (SCM str, size_t start, size_t end)
453 {
454 validate_substring_args (str, start, end);
455 return scm_i_substring_read_only (str, start, end);
456 }
457
458 SCM
459 scm_c_substring_copy (SCM str, size_t start, size_t end)
460 {
461 validate_substring_args (str, start, end);
462 return scm_i_substring_copy (str, start, end);
463 }
464
465 SCM
466 scm_c_substring_shared (SCM str, size_t start, size_t end)
467 {
468 validate_substring_args (str, start, end);
469 return scm_i_substring_shared (str, start, end);
470 }
471
472 \f
473 /* Internal accessors
474 */
475
476 /* Returns the number of characters in STR. This may be different
477 than the memory size of the string storage. */
478 size_t
479 scm_i_string_length (SCM str)
480 {
481 return STRING_LENGTH (str);
482 }
483
484 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
485 encoding. False if it is 'wide', having a 32-bit UCS-4
486 encoding. */
487 int
488 scm_i_is_narrow_string (SCM str)
489 {
490 if (IS_SH_STRING (str))
491 str = SH_STRING_STRING (str);
492
493 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
494 }
495
496 /* Try to coerce a string to be narrow. It if is narrow already, do
497 nothing. If it is wide, shrink it to narrow if none of its
498 characters are above 0xFF. Return true if the string is narrow or
499 was made to be narrow. */
500 int
501 scm_i_try_narrow_string (SCM str)
502 {
503 if (IS_SH_STRING (str))
504 str = SH_STRING_STRING (str);
505
506 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
507
508 return scm_i_is_narrow_string (str);
509 }
510
511 /* Return a pointer to the raw data of the string, which can be either Latin-1
512 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
513 const void *
514 scm_i_string_data (SCM str)
515 {
516 SCM buf;
517 size_t start;
518 const char *data;
519
520 get_str_buf_start (&str, &buf, &start);
521
522 data = STRINGBUF_CONTENTS (buf);
523 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
524
525 return data;
526 }
527
528 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
529 STR. */
530 const char *
531 scm_i_string_chars (SCM str)
532 {
533 SCM buf;
534 size_t start;
535 get_str_buf_start (&str, &buf, &start);
536 if (scm_i_is_narrow_string (str))
537 return (const char *) STRINGBUF_CHARS (buf) + start;
538 else
539 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
540 scm_list_1 (str));
541 return NULL;
542 }
543
544 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
545 STR. */
546 const scm_t_wchar *
547 scm_i_string_wide_chars (SCM str)
548 {
549 SCM buf;
550 size_t start;
551
552 get_str_buf_start (&str, &buf, &start);
553 if (!scm_i_is_narrow_string (str))
554 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
555 else
556 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
557 scm_list_1 (str));
558 }
559
560 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
561 a new string buffer, so that it can be modified without modifying
562 other strings. Also, lock the string mutex. Later, one must call
563 scm_i_string_stop_writing to unlock the mutex. */
564 SCM
565 scm_i_string_start_writing (SCM orig_str)
566 {
567 SCM buf, str = orig_str;
568 size_t start;
569
570 get_str_buf_start (&str, &buf, &start);
571 if (IS_RO_STRING (str))
572 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
573
574 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
575 if (STRINGBUF_SHARED (buf))
576 {
577 /* Clone the stringbuf. */
578 size_t len = STRING_LENGTH (str);
579 SCM new_buf;
580
581 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
582
583 if (scm_i_is_narrow_string (str))
584 {
585 new_buf = make_stringbuf (len);
586 memcpy (STRINGBUF_CHARS (new_buf),
587 STRINGBUF_CHARS (buf) + STRING_START (str), len);
588
589 }
590 else
591 {
592 new_buf = make_wide_stringbuf (len);
593 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
594 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
595 + STRING_START (str)), len);
596 }
597
598 SET_STRING_STRINGBUF (str, new_buf);
599 start -= STRING_START (str);
600
601 /* FIXME: The following operations are not atomic, so other threads
602 looking at STR may see an inconsistent state. Nevertheless it can't
603 hurt much since (i) accessing STR while it is being mutated can't
604 yield a crash, and (ii) concurrent accesses to STR should be
605 protected by a mutex at the application level. The latter may not
606 apply when STR != ORIG_STR, though. */
607 SET_STRING_START (str, 0);
608 SET_STRING_STRINGBUF (str, new_buf);
609
610 buf = new_buf;
611
612 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
613 }
614 return orig_str;
615 }
616
617 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
618 char *
619 scm_i_string_writable_chars (SCM str)
620 {
621 SCM buf;
622 size_t start;
623
624 get_str_buf_start (&str, &buf, &start);
625 if (scm_i_is_narrow_string (str))
626 return (char *) STRINGBUF_CHARS (buf) + start;
627 else
628 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
629 scm_list_1 (str));
630 return NULL;
631 }
632
633 /* Return a pointer to the UCS-4 codepoints of a string. */
634 static scm_t_wchar *
635 scm_i_string_writable_wide_chars (SCM str)
636 {
637 SCM buf;
638 size_t start;
639
640 get_str_buf_start (&str, &buf, &start);
641 if (!scm_i_is_narrow_string (str))
642 return STRINGBUF_WIDE_CHARS (buf) + start;
643 else
644 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
645 scm_list_1 (str));
646 }
647
648 /* Unlock the string mutex that was locked when
649 scm_i_string_start_writing was called. */
650 void
651 scm_i_string_stop_writing (void)
652 {
653 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
654 }
655
656 /* Return the Xth character of STR as a UCS-4 codepoint. */
657 scm_t_wchar
658 scm_i_string_ref (SCM str, size_t x)
659 {
660 if (scm_i_is_narrow_string (str))
661 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
662 else
663 return scm_i_string_wide_chars (str)[x];
664 }
665
666 /* Returns index+1 of the first char in STR that matches C, or
667 0 if the char is not found. */
668 int
669 scm_i_string_contains_char (SCM str, char ch)
670 {
671 size_t i;
672 size_t len = scm_i_string_length (str);
673
674 i = 0;
675 if (scm_i_is_narrow_string (str))
676 {
677 while (i < len)
678 {
679 if (scm_i_string_chars (str)[i] == ch)
680 return i+1;
681 i++;
682 }
683 }
684 else
685 {
686 while (i < len)
687 {
688 if (scm_i_string_wide_chars (str)[i]
689 == (unsigned char) ch)
690 return i+1;
691 i++;
692 }
693 }
694 return 0;
695 }
696
697 int
698 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
699 {
700 if (scm_i_is_narrow_string (sstr))
701 {
702 const char *a = scm_i_string_chars (sstr) + start_x;
703 const char *b = cstr;
704 return strncmp (a, b, strlen(b));
705 }
706 else
707 {
708 size_t i;
709 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
710 const char *b = cstr;
711 for (i = 0; i < strlen (b); i++)
712 {
713 if (a[i] != (unsigned char) b[i])
714 return 1;
715 }
716 }
717 return 0;
718 }
719
720 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
721 void
722 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
723 {
724 if (IS_SH_STRING (str))
725 {
726 p += STRING_START (str);
727 str = SH_STRING_STRING (str);
728 }
729
730 if (chr > 0xFF && scm_i_is_narrow_string (str))
731 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
732
733 if (scm_i_is_narrow_string (str))
734 {
735 char *dst = scm_i_string_writable_chars (str);
736 dst[p] = chr;
737 }
738 else
739 {
740 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
741 dst[p] = chr;
742 }
743 }
744
745 \f
746 /* Symbols.
747
748 Basic symbol creation and accessing is done here, the rest is in
749 symbols.[hc]. This has been done to keep stringbufs and the
750 internals of strings and string-like objects confined to this file.
751 */
752
753 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
754
755 SCM
756 scm_i_make_symbol (SCM name, scm_t_bits flags,
757 unsigned long hash, SCM props)
758 {
759 SCM buf;
760 size_t start = STRING_START (name);
761 size_t length = STRING_LENGTH (name);
762
763 if (IS_SH_STRING (name))
764 {
765 name = SH_STRING_STRING (name);
766 start += STRING_START (name);
767 }
768 buf = SYMBOL_STRINGBUF (name);
769
770 if (start == 0 && length == STRINGBUF_LENGTH (buf))
771 {
772 /* reuse buf. */
773 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
774 SET_STRINGBUF_SHARED (buf);
775 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
776 }
777 else
778 {
779 /* make new buf. */
780 if (scm_i_is_narrow_string (name))
781 {
782 SCM new_buf = make_stringbuf (length);
783 memcpy (STRINGBUF_CHARS (new_buf),
784 STRINGBUF_CHARS (buf) + start, length);
785 buf = new_buf;
786 }
787 else
788 {
789 SCM new_buf = make_wide_stringbuf (length);
790 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
791 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
792 length);
793 buf = new_buf;
794 }
795 }
796 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
797 (scm_t_bits) hash, SCM_UNPACK (props));
798 }
799
800 SCM
801 scm_i_c_make_symbol (const char *name, size_t len,
802 scm_t_bits flags, unsigned long hash, SCM props)
803 {
804 SCM buf = make_stringbuf (len);
805 memcpy (STRINGBUF_CHARS (buf), name, len);
806
807 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
808 (scm_t_bits) hash, SCM_UNPACK (props));
809 }
810
811 /* Returns the number of characters in SYM. This may be different
812 from the memory size of SYM. */
813 size_t
814 scm_i_symbol_length (SCM sym)
815 {
816 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
817 }
818
819 size_t
820 scm_c_symbol_length (SCM sym)
821 #define FUNC_NAME "scm_c_symbol_length"
822 {
823 SCM_VALIDATE_SYMBOL (1, sym);
824
825 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
826 }
827 #undef FUNC_NAME
828
829 /* True if the name of SYM is stored as a Latin-1 encoded string.
830 False if it is stored as a 32-bit UCS-4-encoded string. */
831 int
832 scm_i_is_narrow_symbol (SCM sym)
833 {
834 SCM buf;
835
836 buf = SYMBOL_STRINGBUF (sym);
837 return !STRINGBUF_WIDE (buf);
838 }
839
840 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
841 contains the name of SYM. */
842 const char *
843 scm_i_symbol_chars (SCM sym)
844 {
845 SCM buf;
846
847 buf = SYMBOL_STRINGBUF (sym);
848 if (!STRINGBUF_WIDE (buf))
849 return (const char *) STRINGBUF_CHARS (buf);
850 else
851 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
852 scm_list_1 (sym));
853 }
854
855 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
856 symbol's name. */
857 const scm_t_wchar *
858 scm_i_symbol_wide_chars (SCM sym)
859 {
860 SCM buf;
861
862 buf = SYMBOL_STRINGBUF (sym);
863 if (STRINGBUF_WIDE (buf))
864 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
865 else
866 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
867 scm_list_1 (sym));
868 }
869
870 SCM
871 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
872 {
873 SCM buf = SYMBOL_STRINGBUF (sym);
874 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
875 SET_STRINGBUF_SHARED (buf);
876 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
877 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
878 (scm_t_bits)start, (scm_t_bits) end - start);
879 }
880
881 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
882 scm_t_wchar
883 scm_i_symbol_ref (SCM sym, size_t x)
884 {
885 if (scm_i_is_narrow_symbol (sym))
886 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
887 else
888 return scm_i_symbol_wide_chars (sym)[x];
889 }
890
891 /* Debugging
892 */
893
894 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
895 "Returns an association list containing debugging information\n"
896 "for @var{str}. The association list has the following entries."
897 "@table @code\n"
898 "@item string\n"
899 "The string itself.\n"
900 "@item start\n"
901 "The start index of the string into its stringbuf\n"
902 "@item length\n"
903 "The length of the string\n"
904 "@item shared\n"
905 "If this string is a substring, it returns its parent string.\n"
906 "Otherwise, it returns @code{#f}\n"
907 "@item read-only\n"
908 "@code{#t} if the string is read-only\n"
909 "@item stringbuf-chars\n"
910 "A new string containing this string's stringbuf's characters\n"
911 "@item stringbuf-length\n"
912 "The number of characters in this stringbuf\n"
913 "@item stringbuf-shared\n"
914 "@code{#t} if this stringbuf is shared\n"
915 "@item stringbuf-wide\n"
916 "@code{#t} if this stringbuf's characters are stored in a\n"
917 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
918 "buffer\n"
919 "@end table")
920 #define FUNC_NAME s_scm_sys_string_dump
921 {
922 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
923 SCM buf;
924 SCM_VALIDATE_STRING (1, str);
925
926 /* String info */
927 e1 = scm_cons (scm_from_latin1_symbol ("string"),
928 str);
929 e2 = scm_cons (scm_from_latin1_symbol ("start"),
930 scm_from_size_t (STRING_START (str)));
931 e3 = scm_cons (scm_from_latin1_symbol ("length"),
932 scm_from_size_t (STRING_LENGTH (str)));
933
934 if (IS_SH_STRING (str))
935 {
936 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
937 SH_STRING_STRING (str));
938 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
939 }
940 else
941 {
942 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
943 SCM_BOOL_F);
944 buf = STRING_STRINGBUF (str);
945 }
946
947 if (IS_RO_STRING (str))
948 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
949 SCM_BOOL_T);
950 else
951 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
952 SCM_BOOL_F);
953
954 /* Stringbuf info */
955 if (!STRINGBUF_WIDE (buf))
956 {
957 size_t len = STRINGBUF_LENGTH (buf);
958 char *cbuf;
959 SCM sbc = scm_i_make_string (len, &cbuf, 0);
960 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
961 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
962 sbc);
963 }
964 else
965 {
966 size_t len = STRINGBUF_LENGTH (buf);
967 scm_t_wchar *cbuf;
968 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
969 u32_cpy ((scm_t_uint32 *) cbuf,
970 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
971 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
972 sbc);
973 }
974 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
975 scm_from_size_t (STRINGBUF_LENGTH (buf)));
976 if (STRINGBUF_SHARED (buf))
977 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
978 SCM_BOOL_T);
979 else
980 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
981 SCM_BOOL_F);
982 if (STRINGBUF_WIDE (buf))
983 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
984 SCM_BOOL_T);
985 else
986 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
987 SCM_BOOL_F);
988
989 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
990 }
991 #undef FUNC_NAME
992
993 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
994 "Returns an association list containing debugging information\n"
995 "for @var{sym}. The association list has the following entries."
996 "@table @code\n"
997 "@item symbol\n"
998 "The symbol itself\n"
999 "@item hash\n"
1000 "Its hash value\n"
1001 "@item interned\n"
1002 "@code{#t} if it is an interned symbol\n"
1003 "@item stringbuf-chars\n"
1004 "A new string containing this symbols's stringbuf's characters\n"
1005 "@item stringbuf-length\n"
1006 "The number of characters in this stringbuf\n"
1007 "@item stringbuf-shared\n"
1008 "@code{#t} if this stringbuf is shared\n"
1009 "@item stringbuf-wide\n"
1010 "@code{#t} if this stringbuf's characters are stored in a\n"
1011 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1012 "buffer\n"
1013 "@end table")
1014 #define FUNC_NAME s_scm_sys_symbol_dump
1015 {
1016 SCM e1, e2, e3, e4, e5, e6, e7;
1017 SCM buf;
1018 SCM_VALIDATE_SYMBOL (1, sym);
1019 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1020 sym);
1021 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1022 scm_from_ulong (scm_i_symbol_hash (sym)));
1023 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1024 scm_symbol_interned_p (sym));
1025 buf = SYMBOL_STRINGBUF (sym);
1026
1027 /* Stringbuf info */
1028 if (!STRINGBUF_WIDE (buf))
1029 {
1030 size_t len = STRINGBUF_LENGTH (buf);
1031 char *cbuf;
1032 SCM sbc = scm_i_make_string (len, &cbuf, 0);
1033 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1034 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1035 sbc);
1036 }
1037 else
1038 {
1039 size_t len = STRINGBUF_LENGTH (buf);
1040 scm_t_wchar *cbuf;
1041 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1042 u32_cpy ((scm_t_uint32 *) cbuf,
1043 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1044 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1045 sbc);
1046 }
1047 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1048 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1049 if (STRINGBUF_SHARED (buf))
1050 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1051 SCM_BOOL_T);
1052 else
1053 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1054 SCM_BOOL_F);
1055 if (STRINGBUF_WIDE (buf))
1056 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1057 SCM_BOOL_T);
1058 else
1059 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1060 SCM_BOOL_F);
1061 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1062
1063 }
1064 #undef FUNC_NAME
1065
1066 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1067
1068 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1069 #define FUNC_NAME s_scm_sys_stringbuf_hist
1070 {
1071 int i;
1072 for (i = 0; i < 1000; i++)
1073 if (lenhist[i])
1074 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1075 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1076 return SCM_UNSPECIFIED;
1077 }
1078 #undef FUNC_NAME
1079
1080 #endif
1081
1082 \f
1083
1084 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1085 (SCM obj),
1086 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1087 #define FUNC_NAME s_scm_string_p
1088 {
1089 return scm_from_bool (IS_STRING (obj));
1090 }
1091 #undef FUNC_NAME
1092
1093
1094 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1095
1096 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1097 (SCM chrs),
1098 "@deffnx {Scheme Procedure} list->string chrs\n"
1099 "Return a newly allocated string composed of the arguments,\n"
1100 "@var{chrs}.")
1101 #define FUNC_NAME s_scm_string
1102 {
1103 SCM result = SCM_BOOL_F;
1104 SCM rest;
1105 size_t len;
1106 size_t p = 0;
1107 long i;
1108 int wide = 0;
1109
1110 /* Verify that this is a list of chars. */
1111 i = scm_ilength (chrs);
1112 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1113
1114 len = (size_t) i;
1115 rest = chrs;
1116
1117 while (len > 0 && scm_is_pair (rest))
1118 {
1119 SCM elt = SCM_CAR (rest);
1120 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1121 if (SCM_CHAR (elt) > 0xFF)
1122 wide = 1;
1123 rest = SCM_CDR (rest);
1124 len--;
1125 scm_remember_upto_here_1 (elt);
1126 }
1127
1128 /* Construct a string containing this list of chars. */
1129 len = (size_t) i;
1130 rest = chrs;
1131
1132 if (wide == 0)
1133 {
1134 char *buf;
1135
1136 result = scm_i_make_string (len, NULL, 0);
1137 result = scm_i_string_start_writing (result);
1138 buf = scm_i_string_writable_chars (result);
1139 while (len > 0 && scm_is_pair (rest))
1140 {
1141 SCM elt = SCM_CAR (rest);
1142 buf[p] = (unsigned char) SCM_CHAR (elt);
1143 p++;
1144 rest = SCM_CDR (rest);
1145 len--;
1146 scm_remember_upto_here_1 (elt);
1147 }
1148 }
1149 else
1150 {
1151 scm_t_wchar *buf;
1152
1153 result = scm_i_make_wide_string (len, NULL, 0);
1154 result = scm_i_string_start_writing (result);
1155 buf = scm_i_string_writable_wide_chars (result);
1156 while (len > 0 && scm_is_pair (rest))
1157 {
1158 SCM elt = SCM_CAR (rest);
1159 buf[p] = SCM_CHAR (elt);
1160 p++;
1161 rest = SCM_CDR (rest);
1162 len--;
1163 scm_remember_upto_here_1 (elt);
1164 }
1165 }
1166 scm_i_string_stop_writing ();
1167
1168 if (len > 0)
1169 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1170 if (!scm_is_null (rest))
1171 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1172
1173 return result;
1174 }
1175 #undef FUNC_NAME
1176
1177 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1178 (SCM k, SCM chr),
1179 "Return a newly allocated string of\n"
1180 "length @var{k}. If @var{chr} is given, then all elements of\n"
1181 "the string are initialized to @var{chr}, otherwise the contents\n"
1182 "of the string are all set to @code{#\nul}.")
1183 #define FUNC_NAME s_scm_make_string
1184 {
1185 return scm_c_make_string (scm_to_size_t (k), chr);
1186 }
1187 #undef FUNC_NAME
1188
1189 SCM
1190 scm_c_make_string (size_t len, SCM chr)
1191 #define FUNC_NAME NULL
1192 {
1193 size_t p;
1194 char *contents = NULL;
1195 SCM res = scm_i_make_string (len, &contents, 0);
1196
1197 /* If no char is given, initialize string contents to NULL. */
1198 if (SCM_UNBNDP (chr))
1199 memset (contents, 0, len);
1200 else
1201 {
1202 SCM_VALIDATE_CHAR (0, chr);
1203 res = scm_i_string_start_writing (res);
1204 for (p = 0; p < len; p++)
1205 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1206 scm_i_string_stop_writing ();
1207 }
1208
1209 return res;
1210 }
1211 #undef FUNC_NAME
1212
1213 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1214 (SCM string),
1215 "Return the number of characters in @var{string}.")
1216 #define FUNC_NAME s_scm_string_length
1217 {
1218 SCM_VALIDATE_STRING (1, string);
1219 return scm_from_size_t (STRING_LENGTH (string));
1220 }
1221 #undef FUNC_NAME
1222
1223 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1224 (SCM string),
1225 "Return the bytes used to represent a character in @var{string}."
1226 "This will return 1 or 4.")
1227 #define FUNC_NAME s_scm_string_bytes_per_char
1228 {
1229 SCM_VALIDATE_STRING (1, string);
1230 if (!scm_i_is_narrow_string (string))
1231 return scm_from_int (4);
1232
1233 return scm_from_int (1);
1234 }
1235 #undef FUNC_NAME
1236
1237 size_t
1238 scm_c_string_length (SCM string)
1239 {
1240 if (!IS_STRING (string))
1241 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1242 return STRING_LENGTH (string);
1243 }
1244
1245 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1246 (SCM str, SCM k),
1247 "Return character @var{k} of @var{str} using zero-origin\n"
1248 "indexing. @var{k} must be a valid index of @var{str}.")
1249 #define FUNC_NAME s_scm_string_ref
1250 {
1251 size_t len;
1252 unsigned long idx;
1253
1254 SCM_VALIDATE_STRING (1, str);
1255
1256 len = scm_i_string_length (str);
1257 if (SCM_LIKELY (len > 0))
1258 idx = scm_to_unsigned_integer (k, 0, len - 1);
1259 else
1260 scm_out_of_range (NULL, k);
1261
1262 if (scm_i_is_narrow_string (str))
1263 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1264 else
1265 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1266 }
1267 #undef FUNC_NAME
1268
1269 SCM
1270 scm_c_string_ref (SCM str, size_t p)
1271 {
1272 if (p >= scm_i_string_length (str))
1273 scm_out_of_range (NULL, scm_from_size_t (p));
1274 if (scm_i_is_narrow_string (str))
1275 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1276 else
1277 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1278
1279 }
1280
1281 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1282 (SCM str, SCM k, SCM chr),
1283 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1284 "an unspecified value. @var{k} must be a valid index of\n"
1285 "@var{str}.")
1286 #define FUNC_NAME s_scm_string_set_x
1287 {
1288 size_t len;
1289 unsigned long idx;
1290
1291 SCM_VALIDATE_STRING (1, str);
1292
1293 len = scm_i_string_length (str);
1294 if (SCM_LIKELY (len > 0))
1295 idx = scm_to_unsigned_integer (k, 0, len - 1);
1296 else
1297 scm_out_of_range (NULL, k);
1298
1299 SCM_VALIDATE_CHAR (3, chr);
1300 str = scm_i_string_start_writing (str);
1301 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1302 scm_i_string_stop_writing ();
1303
1304 return SCM_UNSPECIFIED;
1305 }
1306 #undef FUNC_NAME
1307
1308 void
1309 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1310 {
1311 if (p >= scm_i_string_length (str))
1312 scm_out_of_range (NULL, scm_from_size_t (p));
1313 str = scm_i_string_start_writing (str);
1314 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1315 scm_i_string_stop_writing ();
1316 }
1317
1318 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1319 (SCM str, SCM start, SCM end),
1320 "Return a newly allocated string formed from the characters\n"
1321 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1322 "ending with index @var{end} (exclusive).\n"
1323 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1324 "exact integers satisfying:\n\n"
1325 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1326 #define FUNC_NAME s_scm_substring
1327 {
1328 size_t len, from, to;
1329
1330 SCM_VALIDATE_STRING (1, str);
1331 len = scm_i_string_length (str);
1332 from = scm_to_unsigned_integer (start, 0, len);
1333 if (SCM_UNBNDP (end))
1334 to = len;
1335 else
1336 to = scm_to_unsigned_integer (end, from, len);
1337 return scm_i_substring (str, from, to);
1338 }
1339 #undef FUNC_NAME
1340
1341 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1342 (SCM str, SCM start, SCM end),
1343 "Return a newly allocated string formed from the characters\n"
1344 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1345 "ending with index @var{end} (exclusive).\n"
1346 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1347 "exact integers satisfying:\n"
1348 "\n"
1349 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1350 "\n"
1351 "The returned string is read-only.\n")
1352 #define FUNC_NAME s_scm_substring_read_only
1353 {
1354 size_t len, from, to;
1355
1356 SCM_VALIDATE_STRING (1, str);
1357 len = scm_i_string_length (str);
1358 from = scm_to_unsigned_integer (start, 0, len);
1359 if (SCM_UNBNDP (end))
1360 to = len;
1361 else
1362 to = scm_to_unsigned_integer (end, from, len);
1363 return scm_i_substring_read_only (str, from, to);
1364 }
1365 #undef FUNC_NAME
1366
1367 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1368 (SCM str, SCM start, SCM end),
1369 "Return a newly allocated string formed from the characters\n"
1370 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1371 "ending with index @var{end} (exclusive).\n"
1372 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1373 "exact integers satisfying:\n\n"
1374 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1375 #define FUNC_NAME s_scm_substring_copy
1376 {
1377 /* For the Scheme version, START is mandatory, but for the C
1378 version, it is optional. See scm_string_copy in srfi-13.c for a
1379 rationale.
1380 */
1381
1382 size_t from, to;
1383
1384 SCM_VALIDATE_STRING (1, str);
1385 scm_i_get_substring_spec (scm_i_string_length (str),
1386 start, &from, end, &to);
1387 return scm_i_substring_copy (str, from, to);
1388 }
1389 #undef FUNC_NAME
1390
1391 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1392 (SCM str, SCM start, SCM end),
1393 "Return string that indirectly refers to the characters\n"
1394 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1395 "ending with index @var{end} (exclusive).\n"
1396 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1397 "exact integers satisfying:\n\n"
1398 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1399 #define FUNC_NAME s_scm_substring_shared
1400 {
1401 size_t len, from, to;
1402
1403 SCM_VALIDATE_STRING (1, str);
1404 len = scm_i_string_length (str);
1405 from = scm_to_unsigned_integer (start, 0, len);
1406 if (SCM_UNBNDP (end))
1407 to = len;
1408 else
1409 to = scm_to_unsigned_integer (end, from, len);
1410 return scm_i_substring_shared (str, from, to);
1411 }
1412 #undef FUNC_NAME
1413
1414 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1415 (SCM args),
1416 "Return a newly allocated string whose characters form the\n"
1417 "concatenation of the given strings, @var{args}.")
1418 #define FUNC_NAME s_scm_string_append
1419 {
1420 SCM res;
1421 size_t len = 0;
1422 int wide = 0;
1423 SCM l, s;
1424 size_t i;
1425 union
1426 {
1427 char *narrow;
1428 scm_t_wchar *wide;
1429 } data;
1430
1431 SCM_VALIDATE_REST_ARGUMENT (args);
1432 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1433 {
1434 s = SCM_CAR (l);
1435 SCM_VALIDATE_STRING (SCM_ARGn, s);
1436 len += scm_i_string_length (s);
1437 if (!scm_i_is_narrow_string (s))
1438 wide = 1;
1439 }
1440 data.narrow = NULL;
1441 if (!wide)
1442 res = scm_i_make_string (len, &data.narrow, 0);
1443 else
1444 res = scm_i_make_wide_string (len, &data.wide, 0);
1445
1446 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1447 {
1448 size_t len;
1449 s = SCM_CAR (l);
1450 SCM_VALIDATE_STRING (SCM_ARGn, s);
1451 len = scm_i_string_length (s);
1452 if (!wide)
1453 {
1454 memcpy (data.narrow, scm_i_string_chars (s), len);
1455 data.narrow += len;
1456 }
1457 else
1458 {
1459 if (scm_i_is_narrow_string (s))
1460 {
1461 for (i = 0; i < scm_i_string_length (s); i++)
1462 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1463 }
1464 else
1465 u32_cpy ((scm_t_uint32 *) data.wide,
1466 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1467 data.wide += len;
1468 }
1469 scm_remember_upto_here_1 (s);
1470 }
1471 return res;
1472 }
1473 #undef FUNC_NAME
1474
1475
1476 \f
1477 /* Charset conversion error handling. */
1478
1479 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1480 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1481
1482 /* Raise an exception informing that character CHR could not be written
1483 to PORT in its current encoding. */
1484 void
1485 scm_encoding_error (const char *subr, int err, const char *message,
1486 SCM port, SCM chr)
1487 {
1488 scm_throw (scm_encoding_error_key,
1489 scm_list_n (scm_from_latin1_string (subr),
1490 scm_from_latin1_string (message),
1491 scm_from_int (err),
1492 port, chr,
1493 SCM_UNDEFINED));
1494 }
1495
1496 /* Raise an exception informing of an encoding error on PORT. This
1497 means that a character could not be written in PORT's encoding. */
1498 void
1499 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1500 {
1501 scm_throw (scm_decoding_error_key,
1502 scm_list_n (scm_from_latin1_string (subr),
1503 scm_from_latin1_string (message),
1504 scm_from_int (err),
1505 port,
1506 SCM_UNDEFINED));
1507 }
1508
1509 \f
1510 /* String conversion to/from C. */
1511
1512 static void
1513 decoding_error (const char *func_name, int errno_save,
1514 const char *str, size_t len)
1515 {
1516 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1517 handler. */
1518 SCM bv;
1519 signed char *buf;
1520
1521 buf = scm_gc_malloc_pointerless (len, "bytevector");
1522 memcpy (buf, str, len);
1523 bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1524
1525 scm_decoding_error (func_name, errno_save,
1526 "input locale conversion error", bv);
1527 }
1528
1529 SCM
1530 scm_from_stringn (const char *str, size_t len, const char *encoding,
1531 scm_t_string_failed_conversion_handler handler)
1532 {
1533 size_t u32len, i;
1534 scm_t_wchar *u32;
1535 int wide = 0;
1536 SCM res;
1537
1538 /* The order of these checks is important. */
1539 if (!str && len != 0)
1540 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1541 if (len == (size_t) -1)
1542 len = strlen (str);
1543
1544 if (encoding == NULL || len == 0)
1545 return scm_from_latin1_stringn (str, len);
1546 else if (strcmp (encoding, "UTF-8") == 0)
1547 return scm_from_utf8_stringn (str, len);
1548
1549 u32len = 0;
1550 scm_i_lock_iconv ();
1551 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1552 (enum iconv_ilseq_handler)
1553 handler,
1554 str, len,
1555 NULL,
1556 NULL, &u32len);
1557 scm_i_unlock_iconv ();
1558
1559 if (SCM_UNLIKELY (u32 == NULL))
1560 decoding_error (__func__, errno, str, len);
1561
1562 i = 0;
1563 while (i < u32len)
1564 if (u32[i++] > 0xFF)
1565 {
1566 wide = 1;
1567 break;
1568 }
1569
1570 if (!wide)
1571 {
1572 char *dst;
1573 res = scm_i_make_string (u32len, &dst, 0);
1574 for (i = 0; i < u32len; i ++)
1575 dst[i] = (unsigned char) u32[i];
1576 dst[u32len] = '\0';
1577 }
1578 else
1579 {
1580 scm_t_wchar *wdst;
1581 res = scm_i_make_wide_string (u32len, &wdst, 0);
1582 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1583 wdst[u32len] = 0;
1584 }
1585
1586 free (u32);
1587 return res;
1588 }
1589
1590 SCM
1591 scm_from_locale_string (const char *str)
1592 {
1593 return scm_from_locale_stringn (str, -1);
1594 }
1595
1596 SCM
1597 scm_from_locale_stringn (const char *str, size_t len)
1598 {
1599 return scm_from_stringn (str, len, locale_charset (),
1600 scm_i_get_conversion_strategy (SCM_BOOL_F));
1601 }
1602
1603 SCM
1604 scm_from_latin1_string (const char *str)
1605 {
1606 return scm_from_latin1_stringn (str, -1);
1607 }
1608
1609 SCM
1610 scm_from_latin1_stringn (const char *str, size_t len)
1611 {
1612 char *buf;
1613 SCM result;
1614
1615 if (len == (size_t) -1)
1616 len = strlen (str);
1617
1618 /* Make a narrow string and copy STR as is. */
1619 result = scm_i_make_string (len, &buf, 0);
1620 memcpy (buf, str, len);
1621
1622 return result;
1623 }
1624
1625 SCM
1626 scm_from_utf8_string (const char *str)
1627 {
1628 return scm_from_utf8_stringn (str, -1);
1629 }
1630
1631 SCM
1632 scm_from_utf8_stringn (const char *str, size_t len)
1633 {
1634 size_t i, char_len;
1635 const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1636 int ascii = 1, narrow = 1;
1637 SCM res;
1638
1639 if (len == (size_t) -1)
1640 len = strlen (str);
1641
1642 i = 0;
1643 char_len = 0;
1644
1645 while (i < len)
1646 {
1647 if (ustr[i] <= 127)
1648 {
1649 char_len++;
1650 i++;
1651 }
1652 else
1653 {
1654 ucs4_t c;
1655 int nbytes;
1656
1657 ascii = 0;
1658
1659 nbytes = u8_mbtouc (&c, ustr + i, len - i);
1660
1661 if (nbytes < 0)
1662 /* Bad UTF-8. */
1663 decoding_error (__func__, errno, str, len);
1664
1665 if (c > 255)
1666 narrow = 0;
1667
1668 char_len++;
1669 i += nbytes;
1670 }
1671 }
1672
1673 if (ascii)
1674 {
1675 char *dst;
1676 res = scm_i_make_string (char_len, &dst, 0);
1677 memcpy (dst, str, len);
1678 }
1679 else if (narrow)
1680 {
1681 char *dst;
1682 size_t j;
1683 ucs4_t c;
1684
1685 res = scm_i_make_string (char_len, &dst, 0);
1686
1687 for (i = 0, j = 0; i < len; j++)
1688 {
1689 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1690 dst[j] = (signed char) c;
1691 }
1692 }
1693 else
1694 {
1695 scm_t_wchar *dst;
1696 size_t j;
1697 ucs4_t c;
1698
1699 res = scm_i_make_wide_string (char_len, &dst, 0);
1700
1701 for (i = 0, j = 0; i < len; j++)
1702 {
1703 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1704 dst[j] = c;
1705 }
1706 }
1707
1708 return res;
1709 }
1710
1711 SCM
1712 scm_from_utf32_string (const scm_t_wchar *str)
1713 {
1714 return scm_from_utf32_stringn (str, -1);
1715 }
1716
1717 SCM
1718 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1719 {
1720 SCM result;
1721 scm_t_wchar *buf;
1722
1723 if (len == (size_t) -1)
1724 len = u32_strlen ((uint32_t *) str);
1725
1726 result = scm_i_make_wide_string (len, &buf, 0);
1727 memcpy (buf, str, len * sizeof (scm_t_wchar));
1728 scm_i_try_narrow_string (result);
1729
1730 return result;
1731 }
1732
1733 /* Create a new scheme string from the C string STR. The memory of
1734 STR may be used directly as storage for the new string. */
1735 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1736 would be to register a finalizer to eventually free(3) STR, which isn't
1737 worth it. Should we just deprecate the `scm_take_' functions? */
1738 SCM
1739 scm_take_locale_stringn (char *str, size_t len)
1740 {
1741 SCM res;
1742
1743 res = scm_from_locale_stringn (str, len);
1744 free (str);
1745
1746 return res;
1747 }
1748
1749 SCM
1750 scm_take_locale_string (char *str)
1751 {
1752 return scm_take_locale_stringn (str, -1);
1753 }
1754
1755 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1756 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1757 Set *LENP to the size of the resulting string.
1758
1759 FIXME: This is a hack we should get rid of. See
1760 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1761 for details. */
1762 static void
1763 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1764 {
1765 char *before, *after;
1766 size_t i, j;
1767
1768 before = buf;
1769 after = buf;
1770 i = 0;
1771 j = 0;
1772 while (i < *lenp)
1773 {
1774 if ((i <= *lenp - 6)
1775 && before[i] == '\\'
1776 && before[i + 1] == 'u'
1777 && before[i + 2] == '0' && before[i + 3] == '0')
1778 {
1779 /* Convert \u00NN to \xNN */
1780 after[j] = '\\';
1781 after[j + 1] = 'x';
1782 after[j + 2] = tolower ((int) before[i + 4]);
1783 after[j + 3] = tolower ((int) before[i + 5]);
1784 i += 6;
1785 j += 4;
1786 }
1787 else if ((i <= *lenp - 10)
1788 && before[i] == '\\'
1789 && before[i + 1] == 'U'
1790 && before[i + 2] == '0' && before[i + 3] == '0')
1791 {
1792 /* Convert \U00NNNNNN to \UNNNNNN */
1793 after[j] = '\\';
1794 after[j + 1] = 'U';
1795 after[j + 2] = tolower ((int) before[i + 4]);
1796 after[j + 3] = tolower ((int) before[i + 5]);
1797 after[j + 4] = tolower ((int) before[i + 6]);
1798 after[j + 5] = tolower ((int) before[i + 7]);
1799 after[j + 6] = tolower ((int) before[i + 8]);
1800 after[j + 7] = tolower ((int) before[i + 9]);
1801 i += 10;
1802 j += 8;
1803 }
1804 else
1805 {
1806 after[j] = before[i];
1807 i++;
1808 j++;
1809 }
1810 }
1811 *lenp = j;
1812 }
1813
1814 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1815 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1816 of the resulting string. BUF must be large enough to handle the
1817 worst case when `\uXXXX' escapes (6 characters) are replaced by
1818 `\xXXXX;' (7 characters). */
1819 static void
1820 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1821 {
1822 char *before, *after;
1823 size_t i, j;
1824 /* The worst case is if the input string contains all 4-digit hex escapes.
1825 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1826 size_t max_out_len = (*lenp * 7) / 6 + 1;
1827 size_t nzeros, ndigits;
1828
1829 before = buf;
1830 after = alloca (max_out_len);
1831 i = 0;
1832 j = 0;
1833 while (i < *lenp)
1834 {
1835 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1836 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1837 {
1838 if (before[i + 1] == 'u')
1839 ndigits = 4;
1840 else if (before[i + 1] == 'U')
1841 ndigits = 8;
1842 else
1843 abort ();
1844
1845 /* Add the R6RS hex escape initial sequence. */
1846 after[j] = '\\';
1847 after[j + 1] = 'x';
1848
1849 /* Move string positions to the start of the hex numbers. */
1850 i += 2;
1851 j += 2;
1852
1853 /* Find the number of initial zeros in this hex number. */
1854 nzeros = 0;
1855 while (before[i + nzeros] == '0' && nzeros < ndigits)
1856 nzeros++;
1857
1858 /* Copy the number, skipping initial zeros, and then move the string
1859 positions. */
1860 if (nzeros == ndigits)
1861 {
1862 after[j] = '0';
1863 i += ndigits;
1864 j += 1;
1865 }
1866 else
1867 {
1868 int pos;
1869 for (pos = 0; pos < ndigits - nzeros; pos++)
1870 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1871 i += ndigits;
1872 j += (ndigits - nzeros);
1873 }
1874
1875 /* Add terminating semicolon. */
1876 after[j] = ';';
1877 j++;
1878 }
1879 else
1880 {
1881 after[j] = before[i];
1882 i++;
1883 j++;
1884 }
1885 }
1886 *lenp = j;
1887 memcpy (before, after, j);
1888 }
1889
1890 char *
1891 scm_to_locale_string (SCM str)
1892 {
1893 return scm_to_locale_stringn (str, NULL);
1894 }
1895
1896 char *
1897 scm_to_locale_stringn (SCM str, size_t *lenp)
1898 {
1899 return scm_to_stringn (str, lenp,
1900 locale_charset (),
1901 scm_i_get_conversion_strategy (SCM_BOOL_F));
1902 }
1903
1904 char *
1905 scm_to_latin1_string (SCM str)
1906 {
1907 return scm_to_latin1_stringn (str, NULL);
1908 }
1909
1910 char *
1911 scm_to_latin1_stringn (SCM str, size_t *lenp)
1912 #define FUNC_NAME "scm_to_latin1_stringn"
1913 {
1914 char *result;
1915
1916 SCM_VALIDATE_STRING (1, str);
1917
1918 if (scm_i_is_narrow_string (str))
1919 {
1920 size_t len = scm_i_string_length (str);
1921
1922 if (lenp)
1923 *lenp = len;
1924
1925 result = scm_strndup (scm_i_string_data (str), len);
1926 }
1927 else
1928 result = scm_to_stringn (str, lenp, NULL,
1929 SCM_FAILED_CONVERSION_ERROR);
1930
1931 return result;
1932 }
1933 #undef FUNC_NAME
1934
1935 char *
1936 scm_to_utf8_string (SCM str)
1937 {
1938 return scm_to_utf8_stringn (str, NULL);
1939 }
1940
1941 static size_t
1942 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1943 {
1944 size_t ret, i;
1945 for (i = 0, ret = 0; i < len; i++)
1946 ret += (str[i] < 128) ? 1 : 2;
1947 return ret;
1948 }
1949
1950 static scm_t_uint8*
1951 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1952 scm_t_uint8 *u8_result, size_t *u8_lenp)
1953 {
1954 size_t i, n;
1955 size_t u8_len = latin1_u8_strlen (str, latin_len);
1956
1957 if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1958 u8_result = scm_malloc (u8_len + 1);
1959 if (u8_lenp)
1960 *u8_lenp = u8_len;
1961
1962 for (i = 0, n = 0; i < latin_len; i++)
1963 n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1964 if (n != u8_len)
1965 abort ();
1966 u8_result[n] = 0;
1967
1968 return u8_result;
1969 }
1970
1971 char *
1972 scm_to_utf8_stringn (SCM str, size_t *lenp)
1973 {
1974 if (scm_i_is_narrow_string (str))
1975 return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
1976 scm_i_string_length (str),
1977 NULL, lenp);
1978 else
1979 return (char *) u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
1980 scm_i_string_length (str),
1981 NULL, lenp);
1982 }
1983
1984 scm_t_wchar *
1985 scm_to_utf32_string (SCM str)
1986 {
1987 return scm_to_utf32_stringn (str, NULL);
1988 }
1989
1990 scm_t_wchar *
1991 scm_to_utf32_stringn (SCM str, size_t *lenp)
1992 #define FUNC_NAME "scm_to_utf32_stringn"
1993 {
1994 scm_t_wchar *result;
1995
1996 SCM_VALIDATE_STRING (1, str);
1997
1998 if (scm_i_is_narrow_string (str))
1999 {
2000 scm_t_uint8 *codepoints;
2001 size_t i, len;
2002
2003 codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2004 len = scm_i_string_length (str);
2005 if (lenp)
2006 *lenp = len;
2007
2008 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2009 for (i = 0; i < len; i++)
2010 result[i] = codepoints[i];
2011 result[len] = 0;
2012 }
2013 else
2014 {
2015 size_t len;
2016
2017 len = scm_i_string_length (str);
2018 if (lenp)
2019 *lenp = len;
2020
2021 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2022 memcpy (result, scm_i_string_wide_chars (str),
2023 len * sizeof (scm_t_wchar));
2024 result[len] = 0;
2025 }
2026
2027 return result;
2028 }
2029 #undef FUNC_NAME
2030
2031 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2032 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2033 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2034 defined by HANDLER. */
2035 char *
2036 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2037 scm_t_string_failed_conversion_handler handler)
2038 {
2039 char *buf;
2040 size_t ilen, len, i;
2041 int ret;
2042 const char *enc;
2043
2044 if (!scm_is_string (str))
2045 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2046 ilen = scm_i_string_length (str);
2047
2048 if (ilen == 0)
2049 {
2050 buf = scm_malloc (1);
2051 buf[0] = '\0';
2052 if (lenp)
2053 *lenp = 0;
2054 return buf;
2055 }
2056
2057 if (lenp == NULL)
2058 for (i = 0; i < ilen; i++)
2059 if (scm_i_string_ref (str, i) == '\0')
2060 scm_misc_error (NULL,
2061 "string contains #\\nul character: ~S",
2062 scm_list_1 (str));
2063
2064 if (scm_i_is_narrow_string (str) && (encoding == NULL))
2065 {
2066 /* If using native Latin-1 encoding, just copy the string
2067 contents. */
2068 if (lenp)
2069 {
2070 buf = scm_malloc (ilen);
2071 memcpy (buf, scm_i_string_chars (str), ilen);
2072 *lenp = ilen;
2073 return buf;
2074 }
2075 else
2076 {
2077 buf = scm_malloc (ilen + 1);
2078 memcpy (buf, scm_i_string_chars (str), ilen);
2079 buf[ilen] = '\0';
2080 return buf;
2081 }
2082 }
2083
2084
2085 buf = NULL;
2086 len = 0;
2087 enc = encoding;
2088 if (enc == NULL)
2089 enc = "ISO-8859-1";
2090 if (scm_i_is_narrow_string (str))
2091 {
2092 scm_i_lock_iconv ();
2093 ret = mem_iconveh (scm_i_string_chars (str), ilen,
2094 "ISO-8859-1", enc,
2095 (enum iconv_ilseq_handler) handler, NULL,
2096 &buf, &len);
2097 scm_i_unlock_iconv ();
2098
2099 if (ret != 0)
2100 scm_encoding_error (__func__, errno,
2101 "cannot convert narrow string to output locale",
2102 SCM_BOOL_F,
2103 /* FIXME: Faulty character unknown. */
2104 SCM_BOOL_F);
2105 }
2106 else
2107 {
2108 scm_i_lock_iconv ();
2109 buf = u32_conv_to_encoding (enc,
2110 (enum iconv_ilseq_handler) handler,
2111 (scm_t_uint32 *) scm_i_string_wide_chars (str),
2112 ilen,
2113 NULL,
2114 NULL, &len);
2115 scm_i_unlock_iconv ();
2116 if (buf == NULL)
2117 scm_encoding_error (__func__, errno,
2118 "cannot convert wide string to output locale",
2119 SCM_BOOL_F,
2120 /* FIXME: Faulty character unknown. */
2121 SCM_BOOL_F);
2122 }
2123 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2124 {
2125 if (SCM_R6RS_ESCAPES_P)
2126 {
2127 /* The worst case is if the input string contains all 4-digit
2128 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2129 (seven characters). Make BUF large enough to hold
2130 that. */
2131 buf = scm_realloc (buf, (len * 7) / 6 + 1);
2132 unistring_escapes_to_r6rs_escapes (buf, &len);
2133 }
2134 else
2135 unistring_escapes_to_guile_escapes (buf, &len);
2136
2137 buf = scm_realloc (buf, len);
2138 }
2139 if (lenp)
2140 *lenp = len;
2141 else
2142 {
2143 buf = scm_realloc (buf, len + 1);
2144 buf[len] = '\0';
2145 }
2146
2147 scm_remember_upto_here_1 (str);
2148 return buf;
2149 }
2150
2151 size_t
2152 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2153 {
2154 size_t len;
2155 char *result = NULL;
2156 if (!scm_is_string (str))
2157 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2158 result = scm_to_locale_stringn (str, &len);
2159
2160 memcpy (buf, result, (len > max_len) ? max_len : len);
2161 free (result);
2162
2163 scm_remember_upto_here_1 (str);
2164 return len;
2165 }
2166
2167 \f
2168 /* Unicode string normalization. */
2169
2170 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2171 libguile/i18n.c. It would be useful to have this factored out into a more
2172 convenient location, but its use of alloca makes that tricky to do. */
2173
2174 static SCM
2175 normalize_str (SCM string, uninorm_t form)
2176 {
2177 SCM ret;
2178 scm_t_uint32 *w_str;
2179 scm_t_wchar *cbuf;
2180 size_t rlen, len = scm_i_string_length (string);
2181
2182 if (scm_i_is_narrow_string (string))
2183 {
2184 size_t i;
2185 const char *buf = scm_i_string_chars (string);
2186
2187 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2188
2189 for (i = 0; i < len; i ++)
2190 w_str[i] = (unsigned char) buf[i];
2191 w_str[len] = 0;
2192 }
2193 else
2194 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2195
2196 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2197
2198 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2199 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2200 free (w_str);
2201
2202 scm_i_try_narrow_string (ret);
2203
2204 return ret;
2205 }
2206
2207 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2208 (SCM string),
2209 "Returns the NFC normalized form of @var{string}.")
2210 #define FUNC_NAME s_scm_string_normalize_nfc
2211 {
2212 SCM_VALIDATE_STRING (1, string);
2213 return normalize_str (string, UNINORM_NFC);
2214 }
2215 #undef FUNC_NAME
2216
2217 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2218 (SCM string),
2219 "Returns the NFD normalized form of @var{string}.")
2220 #define FUNC_NAME s_scm_string_normalize_nfd
2221 {
2222 SCM_VALIDATE_STRING (1, string);
2223 return normalize_str (string, UNINORM_NFD);
2224 }
2225 #undef FUNC_NAME
2226
2227 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2228 (SCM string),
2229 "Returns the NFKC normalized form of @var{string}.")
2230 #define FUNC_NAME s_scm_string_normalize_nfkc
2231 {
2232 SCM_VALIDATE_STRING (1, string);
2233 return normalize_str (string, UNINORM_NFKC);
2234 }
2235 #undef FUNC_NAME
2236
2237 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2238 (SCM string),
2239 "Returns the NFKD normalized form of @var{string}.")
2240 #define FUNC_NAME s_scm_string_normalize_nfkd
2241 {
2242 SCM_VALIDATE_STRING (1, string);
2243 return normalize_str (string, UNINORM_NFKD);
2244 }
2245 #undef FUNC_NAME
2246
2247 /* converts C scm_array of strings to SCM scm_list of strings.
2248 If argc < 0, a null terminated scm_array is assumed.
2249 The current locale encoding is assumed */
2250 SCM
2251 scm_makfromstrs (int argc, char **argv)
2252 {
2253 int i = argc;
2254 SCM lst = SCM_EOL;
2255 if (0 > i)
2256 for (i = 0; argv[i]; i++);
2257 while (i--)
2258 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2259 return lst;
2260 }
2261
2262 /* Return a newly allocated array of char pointers to each of the strings
2263 in args, with a terminating NULL pointer. The strings are encoded using
2264 the current locale. */
2265
2266 char **
2267 scm_i_allocate_string_pointers (SCM list)
2268 #define FUNC_NAME "scm_i_allocate_string_pointers"
2269 {
2270 char **result;
2271 int list_len = scm_ilength (list);
2272 int i;
2273
2274 if (list_len < 0)
2275 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2276
2277 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2278 "string pointers");
2279 result[list_len] = NULL;
2280
2281 /* The list might have been modified in another thread, so
2282 we check LIST before each access.
2283 */
2284 for (i = 0; i < list_len && scm_is_pair (list); i++)
2285 {
2286 SCM str = SCM_CAR (list);
2287 size_t len; /* String length in bytes */
2288 char *c_str = scm_to_locale_stringn (str, &len);
2289
2290 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2291 scm_malloc to allocate the returned string, which must be
2292 explicitly deallocated. This forces us to copy the string a
2293 second time into a new buffer. Ideally there would be variants
2294 of scm_to_*_stringn that can return garbage-collected buffers. */
2295
2296 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2297 memcpy (result[i], c_str, len);
2298 result[i][len] = '\0';
2299 free (c_str);
2300
2301 list = SCM_CDR (list);
2302 }
2303
2304 return result;
2305 }
2306 #undef FUNC_NAME
2307
2308 void
2309 scm_i_get_substring_spec (size_t len,
2310 SCM start, size_t *cstart,
2311 SCM end, size_t *cend)
2312 {
2313 if (SCM_UNBNDP (start))
2314 *cstart = 0;
2315 else
2316 *cstart = scm_to_unsigned_integer (start, 0, len);
2317
2318 if (SCM_UNBNDP (end))
2319 *cend = len;
2320 else
2321 *cend = scm_to_unsigned_integer (end, *cstart, len);
2322 }
2323
2324 static SCM
2325 string_handle_ref (scm_t_array_handle *h, size_t index)
2326 {
2327 return scm_c_string_ref (h->array, index);
2328 }
2329
2330 static void
2331 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2332 {
2333 scm_c_string_set_x (h->array, index, val);
2334 }
2335
2336 static void
2337 string_get_handle (SCM v, scm_t_array_handle *h)
2338 {
2339 h->array = v;
2340 h->ndims = 1;
2341 h->dims = &h->dim0;
2342 h->dim0.lbnd = 0;
2343 h->dim0.ubnd = scm_c_string_length (v) - 1;
2344 h->dim0.inc = 1;
2345 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2346 h->elements = h->writable_elements = NULL;
2347 }
2348
2349 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2350 string_handle_ref, string_handle_set,
2351 string_get_handle)
2352 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2353
2354 void
2355 scm_init_strings ()
2356 {
2357 scm_nullstr = scm_i_make_string (0, NULL, 0);
2358
2359 scm_i_pthread_atfork (scm_i_lock_iconv, scm_i_unlock_iconv,
2360 scm_i_unlock_iconv);
2361
2362 #include "libguile/strings.x"
2363 }
2364
2365
2366 /*
2367 Local Variables:
2368 c-file-style: "gnu"
2369 End:
2370 */