Avoid signed overflow and use size_t in bytevectors.c.
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <alloca.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <uninorm.h>
30 #include <unistr.h>
31 #include <uniconv.h>
32
33 #include "striconveh.h"
34
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
44
45 \f
46
47 /* {Strings}
48 */
49
50
51 /* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
68 *
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
72 */
73
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
76
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
81
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
85
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
91
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
93
94 #define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
103
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist[1001];
106 #endif
107
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
110 static SCM
111 make_stringbuf (size_t len)
112 {
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
120 SCM buf;
121
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127 #endif
128
129 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
138 }
139
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
142 static SCM
143 make_wide_stringbuf (size_t len)
144 {
145 SCM buf;
146 size_t raw_len;
147
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153 #endif
154
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
165 }
166
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169 static SCM
170 wide_stringbuf (SCM buf)
171 {
172 SCM new_buf;
173
174 if (STRINGBUF_WIDE (buf))
175 new_buf = buf;
176 else
177 {
178 size_t i, len;
179 scm_t_wchar *mem;
180
181 len = STRINGBUF_LENGTH (buf);
182
183 new_buf = make_wide_stringbuf (len);
184
185 mem = STRINGBUF_WIDE_CHARS (new_buf);
186 for (i = 0; i < len; i++)
187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
188 mem[len] = 0;
189 }
190
191 return new_buf;
192 }
193
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196 static SCM
197 narrow_stringbuf (SCM buf)
198 {
199 SCM new_buf;
200
201 if (!STRINGBUF_WIDE (buf))
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
208
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
211
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
216
217 new_buf = make_stringbuf (len);
218
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
226 }
227
228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
229
230 \f
231 /* Copy-on-write strings.
232 */
233
234 #define STRING_TAG scm_tc7_string
235
236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
239
240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243 #define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
244
245 /* Read-only strings.
246 */
247
248 #define RO_STRING_TAG scm_tc7_ro_string
249 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
251 /* Mutation-sharing substrings
252 */
253
254 #define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257 /* START and LENGTH as for STRINGs. */
258
259 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
261 SCM scm_nullstr;
262
263 static SCM null_stringbuf;
264
265 static void
266 init_null_stringbuf (void)
267 {
268 null_stringbuf = make_stringbuf (0);
269 SET_STRINGBUF_SHARED (null_stringbuf);
270 }
271
272 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
273 characters. CHARSP, if not NULL, will be set to location of the
274 char array. If READ_ONLY_P, the returned string is read-only;
275 otherwise it is writable. */
276 SCM
277 scm_i_make_string (size_t len, char **charsp, int read_only_p)
278 {
279 SCM buf;
280 SCM res;
281
282 if (len == 0)
283 {
284 static scm_i_pthread_once_t once = SCM_I_PTHREAD_ONCE_INIT;
285 scm_i_pthread_once (&once, init_null_stringbuf);
286 buf = null_stringbuf;
287 }
288 else
289 buf = make_stringbuf (len);
290
291 if (charsp)
292 *charsp = (char *) STRINGBUF_CHARS (buf);
293 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
294 SCM_UNPACK (buf),
295 (scm_t_bits) 0, (scm_t_bits) len);
296 return res;
297 }
298
299 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
300 characters. CHARSP, if not NULL, will be set to location of the
301 character array. If READ_ONLY_P, the returned string is read-only;
302 otherwise it is writable. */
303 SCM
304 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
305 {
306 SCM buf = make_wide_stringbuf (len);
307 SCM res;
308 if (charsp)
309 *charsp = STRINGBUF_WIDE_CHARS (buf);
310 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
311 SCM_UNPACK (buf),
312 (scm_t_bits) 0, (scm_t_bits) len);
313 return res;
314 }
315
316 static void
317 validate_substring_args (SCM str, size_t start, size_t end)
318 {
319 if (!IS_STRING (str))
320 scm_wrong_type_arg_msg (NULL, 0, str, "string");
321 if (start > STRING_LENGTH (str))
322 scm_out_of_range (NULL, scm_from_size_t (start));
323 if (end > STRING_LENGTH (str) || end < start)
324 scm_out_of_range (NULL, scm_from_size_t (end));
325 }
326
327 static inline void
328 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
329 {
330 *start = STRING_START (*str);
331 if (IS_SH_STRING (*str))
332 {
333 *str = SH_STRING_STRING (*str);
334 *start += STRING_START (*str);
335 }
336 *buf = STRING_STRINGBUF (*str);
337 }
338
339 SCM
340 scm_i_substring (SCM str, size_t start, size_t end)
341 {
342 if (start == end)
343 return scm_i_make_string (0, NULL, 0);
344 else
345 {
346 SCM buf;
347 size_t str_start;
348 get_str_buf_start (&str, &buf, &str_start);
349 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
350 SET_STRINGBUF_SHARED (buf);
351 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
352 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
353 (scm_t_bits)str_start + start,
354 (scm_t_bits) end - start);
355 }
356 }
357
358 SCM
359 scm_i_substring_read_only (SCM str, size_t start, size_t end)
360 {
361 if (start == end)
362 return scm_i_make_string (0, NULL, 1);
363 else
364 {
365 SCM buf;
366 size_t str_start;
367 get_str_buf_start (&str, &buf, &str_start);
368 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
369 SET_STRINGBUF_SHARED (buf);
370 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
371 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
372 (scm_t_bits)str_start + start,
373 (scm_t_bits) end - start);
374 }
375 }
376
377 SCM
378 scm_i_substring_copy (SCM str, size_t start, size_t end)
379 {
380 if (start == end)
381 return scm_i_make_string (0, NULL, 0);
382 else
383 {
384 size_t len = end - start;
385 SCM buf, my_buf, substr;
386 size_t str_start;
387 int wide = 0;
388 get_str_buf_start (&str, &buf, &str_start);
389 if (scm_i_is_narrow_string (str))
390 {
391 my_buf = make_stringbuf (len);
392 memcpy (STRINGBUF_CHARS (my_buf),
393 STRINGBUF_CHARS (buf) + str_start + start, len);
394 }
395 else
396 {
397 my_buf = make_wide_stringbuf (len);
398 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
399 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
400 + start), len);
401 wide = 1;
402 }
403 scm_remember_upto_here_1 (buf);
404 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
405 (scm_t_bits) 0, (scm_t_bits) len);
406 if (wide)
407 scm_i_try_narrow_string (substr);
408 return substr;
409 }
410 }
411
412 SCM
413 scm_i_substring_shared (SCM str, size_t start, size_t end)
414 {
415 if (start == 0 && end == STRING_LENGTH (str))
416 return str;
417 else if (start == end)
418 return scm_i_make_string (0, NULL, 0);
419 else
420 {
421 size_t len = end - start;
422 if (IS_SH_STRING (str))
423 {
424 start += STRING_START (str);
425 str = SH_STRING_STRING (str);
426 }
427 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
428 (scm_t_bits)start, (scm_t_bits) len);
429 }
430 }
431
432 SCM
433 scm_c_substring (SCM str, size_t start, size_t end)
434 {
435 validate_substring_args (str, start, end);
436 return scm_i_substring (str, start, end);
437 }
438
439 SCM
440 scm_c_substring_read_only (SCM str, size_t start, size_t end)
441 {
442 validate_substring_args (str, start, end);
443 return scm_i_substring_read_only (str, start, end);
444 }
445
446 SCM
447 scm_c_substring_copy (SCM str, size_t start, size_t end)
448 {
449 validate_substring_args (str, start, end);
450 return scm_i_substring_copy (str, start, end);
451 }
452
453 SCM
454 scm_c_substring_shared (SCM str, size_t start, size_t end)
455 {
456 validate_substring_args (str, start, end);
457 return scm_i_substring_shared (str, start, end);
458 }
459
460 \f
461 /* Internal accessors
462 */
463
464 /* Returns the number of characters in STR. This may be different
465 than the memory size of the string storage. */
466 size_t
467 scm_i_string_length (SCM str)
468 {
469 return STRING_LENGTH (str);
470 }
471
472 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
473 encoding. False if it is 'wide', having a 32-bit UCS-4
474 encoding. */
475 int
476 scm_i_is_narrow_string (SCM str)
477 {
478 if (IS_SH_STRING (str))
479 str = SH_STRING_STRING (str);
480
481 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
482 }
483
484 /* Try to coerce a string to be narrow. It if is narrow already, do
485 nothing. If it is wide, shrink it to narrow if none of its
486 characters are above 0xFF. Return true if the string is narrow or
487 was made to be narrow. */
488 int
489 scm_i_try_narrow_string (SCM str)
490 {
491 if (IS_SH_STRING (str))
492 str = SH_STRING_STRING (str);
493
494 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
495
496 return scm_i_is_narrow_string (str);
497 }
498
499 /* Return a pointer to the raw data of the string, which can be either Latin-1
500 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
501 const void *
502 scm_i_string_data (SCM str)
503 {
504 SCM buf;
505 size_t start;
506 const char *data;
507
508 get_str_buf_start (&str, &buf, &start);
509
510 data = STRINGBUF_CONTENTS (buf);
511 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
512
513 return data;
514 }
515
516 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
517 STR. */
518 const char *
519 scm_i_string_chars (SCM str)
520 {
521 SCM buf;
522 size_t start;
523 get_str_buf_start (&str, &buf, &start);
524 if (scm_i_is_narrow_string (str))
525 return (const char *) STRINGBUF_CHARS (buf) + start;
526 else
527 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
528 scm_list_1 (str));
529 return NULL;
530 }
531
532 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
533 STR. */
534 const scm_t_wchar *
535 scm_i_string_wide_chars (SCM str)
536 {
537 SCM buf;
538 size_t start;
539
540 get_str_buf_start (&str, &buf, &start);
541 if (!scm_i_is_narrow_string (str))
542 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
543 else
544 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
545 scm_list_1 (str));
546 }
547
548 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
549 a new string buffer, so that it can be modified without modifying
550 other strings. Also, lock the string mutex. Later, one must call
551 scm_i_string_stop_writing to unlock the mutex. */
552 SCM
553 scm_i_string_start_writing (SCM orig_str)
554 {
555 SCM buf, str = orig_str;
556 size_t start;
557
558 get_str_buf_start (&str, &buf, &start);
559 if (IS_RO_STRING (str))
560 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
561
562 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
563 if (STRINGBUF_SHARED (buf))
564 {
565 /* Clone the stringbuf. */
566 size_t len = STRING_LENGTH (str);
567 SCM new_buf;
568
569 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
570
571 if (scm_i_is_narrow_string (str))
572 {
573 new_buf = make_stringbuf (len);
574 memcpy (STRINGBUF_CHARS (new_buf),
575 STRINGBUF_CHARS (buf) + STRING_START (str), len);
576
577 }
578 else
579 {
580 new_buf = make_wide_stringbuf (len);
581 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
582 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
583 + STRING_START (str)), len);
584 }
585
586 SET_STRING_STRINGBUF (str, new_buf);
587 start -= STRING_START (str);
588
589 /* FIXME: The following operations are not atomic, so other threads
590 looking at STR may see an inconsistent state. Nevertheless it can't
591 hurt much since (i) accessing STR while it is being mutated can't
592 yield a crash, and (ii) concurrent accesses to STR should be
593 protected by a mutex at the application level. The latter may not
594 apply when STR != ORIG_STR, though. */
595 SET_STRING_START (str, 0);
596 SET_STRING_STRINGBUF (str, new_buf);
597
598 buf = new_buf;
599
600 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
601 }
602 return orig_str;
603 }
604
605 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
606 char *
607 scm_i_string_writable_chars (SCM str)
608 {
609 SCM buf;
610 size_t start;
611
612 get_str_buf_start (&str, &buf, &start);
613 if (scm_i_is_narrow_string (str))
614 return (char *) STRINGBUF_CHARS (buf) + start;
615 else
616 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
617 scm_list_1 (str));
618 return NULL;
619 }
620
621 /* Return a pointer to the UCS-4 codepoints of a string. */
622 static scm_t_wchar *
623 scm_i_string_writable_wide_chars (SCM str)
624 {
625 SCM buf;
626 size_t start;
627
628 get_str_buf_start (&str, &buf, &start);
629 if (!scm_i_is_narrow_string (str))
630 return STRINGBUF_WIDE_CHARS (buf) + start;
631 else
632 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
633 scm_list_1 (str));
634 }
635
636 /* Unlock the string mutex that was locked when
637 scm_i_string_start_writing was called. */
638 void
639 scm_i_string_stop_writing (void)
640 {
641 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
642 }
643
644 /* Return the Xth character of STR as a UCS-4 codepoint. */
645 scm_t_wchar
646 scm_i_string_ref (SCM str, size_t x)
647 {
648 if (scm_i_is_narrow_string (str))
649 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
650 else
651 return scm_i_string_wide_chars (str)[x];
652 }
653
654 /* Returns index+1 of the first char in STR that matches C, or
655 0 if the char is not found. */
656 int
657 scm_i_string_contains_char (SCM str, char ch)
658 {
659 size_t i;
660 size_t len = scm_i_string_length (str);
661
662 i = 0;
663 if (scm_i_is_narrow_string (str))
664 {
665 while (i < len)
666 {
667 if (scm_i_string_chars (str)[i] == ch)
668 return i+1;
669 i++;
670 }
671 }
672 else
673 {
674 while (i < len)
675 {
676 if (scm_i_string_wide_chars (str)[i]
677 == (unsigned char) ch)
678 return i+1;
679 i++;
680 }
681 }
682 return 0;
683 }
684
685 int
686 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
687 {
688 if (scm_i_is_narrow_string (sstr))
689 {
690 const char *a = scm_i_string_chars (sstr) + start_x;
691 const char *b = cstr;
692 return strncmp (a, b, strlen(b));
693 }
694 else
695 {
696 size_t i;
697 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
698 const char *b = cstr;
699 for (i = 0; i < strlen (b); i++)
700 {
701 if (a[i] != (unsigned char) b[i])
702 return 1;
703 }
704 }
705 return 0;
706 }
707
708 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
709 void
710 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
711 {
712 if (IS_SH_STRING (str))
713 {
714 p += STRING_START (str);
715 str = SH_STRING_STRING (str);
716 }
717
718 if (chr > 0xFF && scm_i_is_narrow_string (str))
719 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
720
721 if (scm_i_is_narrow_string (str))
722 {
723 char *dst = scm_i_string_writable_chars (str);
724 dst[p] = chr;
725 }
726 else
727 {
728 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
729 dst[p] = chr;
730 }
731 }
732
733 \f
734 /* Symbols.
735
736 Basic symbol creation and accessing is done here, the rest is in
737 symbols.[hc]. This has been done to keep stringbufs and the
738 internals of strings and string-like objects confined to this file.
739 */
740
741 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
742
743 SCM
744 scm_i_make_symbol (SCM name, scm_t_bits flags,
745 unsigned long hash, SCM props)
746 {
747 SCM buf;
748 size_t start = STRING_START (name);
749 size_t length = STRING_LENGTH (name);
750
751 if (IS_SH_STRING (name))
752 {
753 name = SH_STRING_STRING (name);
754 start += STRING_START (name);
755 }
756 buf = STRING_STRINGBUF (name);
757
758 if (start == 0 && length == STRINGBUF_LENGTH (buf))
759 {
760 /* reuse buf. */
761 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
762 SET_STRINGBUF_SHARED (buf);
763 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
764 }
765 else
766 {
767 /* make new buf. */
768 if (scm_i_is_narrow_string (name))
769 {
770 SCM new_buf = make_stringbuf (length);
771 memcpy (STRINGBUF_CHARS (new_buf),
772 STRINGBUF_CHARS (buf) + start, length);
773 buf = new_buf;
774 }
775 else
776 {
777 SCM new_buf = make_wide_stringbuf (length);
778 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
779 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
780 length);
781 buf = new_buf;
782 }
783 }
784 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
785 (scm_t_bits) hash, SCM_UNPACK (props));
786 }
787
788 SCM
789 scm_i_c_make_symbol (const char *name, size_t len,
790 scm_t_bits flags, unsigned long hash, SCM props)
791 {
792 SCM buf = make_stringbuf (len);
793 memcpy (STRINGBUF_CHARS (buf), name, len);
794
795 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
796 (scm_t_bits) hash, SCM_UNPACK (props));
797 }
798
799 /* Returns the number of characters in SYM. This may be different
800 from the memory size of SYM. */
801 size_t
802 scm_i_symbol_length (SCM sym)
803 {
804 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
805 }
806
807 size_t
808 scm_c_symbol_length (SCM sym)
809 #define FUNC_NAME "scm_c_symbol_length"
810 {
811 SCM_VALIDATE_SYMBOL (1, sym);
812
813 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
814 }
815 #undef FUNC_NAME
816
817 /* True if the name of SYM is stored as a Latin-1 encoded string.
818 False if it is stored as a 32-bit UCS-4-encoded string. */
819 int
820 scm_i_is_narrow_symbol (SCM sym)
821 {
822 SCM buf;
823
824 buf = SYMBOL_STRINGBUF (sym);
825 return !STRINGBUF_WIDE (buf);
826 }
827
828 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
829 contains the name of SYM. */
830 const char *
831 scm_i_symbol_chars (SCM sym)
832 {
833 SCM buf;
834
835 buf = SYMBOL_STRINGBUF (sym);
836 if (!STRINGBUF_WIDE (buf))
837 return (const char *) STRINGBUF_CHARS (buf);
838 else
839 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
840 scm_list_1 (sym));
841 }
842
843 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
844 symbol's name. */
845 const scm_t_wchar *
846 scm_i_symbol_wide_chars (SCM sym)
847 {
848 SCM buf;
849
850 buf = SYMBOL_STRINGBUF (sym);
851 if (STRINGBUF_WIDE (buf))
852 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
853 else
854 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
855 scm_list_1 (sym));
856 }
857
858 SCM
859 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
860 {
861 SCM buf = SYMBOL_STRINGBUF (sym);
862 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
863 SET_STRINGBUF_SHARED (buf);
864 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
865 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
866 (scm_t_bits)start, (scm_t_bits) end - start);
867 }
868
869 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
870 scm_t_wchar
871 scm_i_symbol_ref (SCM sym, size_t x)
872 {
873 if (scm_i_is_narrow_symbol (sym))
874 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
875 else
876 return scm_i_symbol_wide_chars (sym)[x];
877 }
878
879 /* Debugging
880 */
881
882 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
883 "Returns an association list containing debugging information\n"
884 "for @var{str}. The association list has the following entries."
885 "@table @code\n"
886 "@item string\n"
887 "The string itself.\n"
888 "@item start\n"
889 "The start index of the string into its stringbuf\n"
890 "@item length\n"
891 "The length of the string\n"
892 "@item shared\n"
893 "If this string is a substring, it returns its parent string.\n"
894 "Otherwise, it returns @code{#f}\n"
895 "@item read-only\n"
896 "@code{#t} if the string is read-only\n"
897 "@item stringbuf-chars\n"
898 "A new string containing this string's stringbuf's characters\n"
899 "@item stringbuf-length\n"
900 "The number of characters in this stringbuf\n"
901 "@item stringbuf-shared\n"
902 "@code{#t} if this stringbuf is shared\n"
903 "@item stringbuf-wide\n"
904 "@code{#t} if this stringbuf's characters are stored in a\n"
905 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
906 "buffer\n"
907 "@end table")
908 #define FUNC_NAME s_scm_sys_string_dump
909 {
910 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
911 SCM buf;
912 SCM_VALIDATE_STRING (1, str);
913
914 /* String info */
915 e1 = scm_cons (scm_from_latin1_symbol ("string"),
916 str);
917 e2 = scm_cons (scm_from_latin1_symbol ("start"),
918 scm_from_size_t (STRING_START (str)));
919 e3 = scm_cons (scm_from_latin1_symbol ("length"),
920 scm_from_size_t (STRING_LENGTH (str)));
921
922 if (IS_SH_STRING (str))
923 {
924 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
925 SH_STRING_STRING (str));
926 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
927 }
928 else
929 {
930 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
931 SCM_BOOL_F);
932 buf = STRING_STRINGBUF (str);
933 }
934
935 if (IS_RO_STRING (str))
936 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
937 SCM_BOOL_T);
938 else
939 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
940 SCM_BOOL_F);
941
942 /* Stringbuf info */
943 if (!STRINGBUF_WIDE (buf))
944 {
945 size_t len = STRINGBUF_LENGTH (buf);
946 char *cbuf;
947 SCM sbc = scm_i_make_string (len, &cbuf, 0);
948 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
949 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
950 sbc);
951 }
952 else
953 {
954 size_t len = STRINGBUF_LENGTH (buf);
955 scm_t_wchar *cbuf;
956 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
957 u32_cpy ((scm_t_uint32 *) cbuf,
958 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
959 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
960 sbc);
961 }
962 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
963 scm_from_size_t (STRINGBUF_LENGTH (buf)));
964 if (STRINGBUF_SHARED (buf))
965 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
966 SCM_BOOL_T);
967 else
968 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
969 SCM_BOOL_F);
970 if (STRINGBUF_WIDE (buf))
971 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
972 SCM_BOOL_T);
973 else
974 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
975 SCM_BOOL_F);
976
977 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
978 }
979 #undef FUNC_NAME
980
981 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
982 "Returns an association list containing debugging information\n"
983 "for @var{sym}. The association list has the following entries."
984 "@table @code\n"
985 "@item symbol\n"
986 "The symbol itself\n"
987 "@item hash\n"
988 "Its hash value\n"
989 "@item interned\n"
990 "@code{#t} if it is an interned symbol\n"
991 "@item stringbuf-chars\n"
992 "A new string containing this symbols's stringbuf's characters\n"
993 "@item stringbuf-length\n"
994 "The number of characters in this stringbuf\n"
995 "@item stringbuf-shared\n"
996 "@code{#t} if this stringbuf is shared\n"
997 "@item stringbuf-wide\n"
998 "@code{#t} if this stringbuf's characters are stored in a\n"
999 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1000 "buffer\n"
1001 "@end table")
1002 #define FUNC_NAME s_scm_sys_symbol_dump
1003 {
1004 SCM e1, e2, e3, e4, e5, e6, e7;
1005 SCM buf;
1006 SCM_VALIDATE_SYMBOL (1, sym);
1007 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1008 sym);
1009 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1010 scm_from_ulong (scm_i_symbol_hash (sym)));
1011 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1012 scm_symbol_interned_p (sym));
1013 buf = SYMBOL_STRINGBUF (sym);
1014
1015 /* Stringbuf info */
1016 if (!STRINGBUF_WIDE (buf))
1017 {
1018 size_t len = STRINGBUF_LENGTH (buf);
1019 char *cbuf;
1020 SCM sbc = scm_i_make_string (len, &cbuf, 0);
1021 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1022 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1023 sbc);
1024 }
1025 else
1026 {
1027 size_t len = STRINGBUF_LENGTH (buf);
1028 scm_t_wchar *cbuf;
1029 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1030 u32_cpy ((scm_t_uint32 *) cbuf,
1031 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1032 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1033 sbc);
1034 }
1035 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1036 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1037 if (STRINGBUF_SHARED (buf))
1038 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1039 SCM_BOOL_T);
1040 else
1041 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1042 SCM_BOOL_F);
1043 if (STRINGBUF_WIDE (buf))
1044 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1045 SCM_BOOL_T);
1046 else
1047 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1048 SCM_BOOL_F);
1049 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1050
1051 }
1052 #undef FUNC_NAME
1053
1054 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1055
1056 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1057 #define FUNC_NAME s_scm_sys_stringbuf_hist
1058 {
1059 int i;
1060 for (i = 0; i < 1000; i++)
1061 if (lenhist[i])
1062 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1063 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1064 return SCM_UNSPECIFIED;
1065 }
1066 #undef FUNC_NAME
1067
1068 #endif
1069
1070 \f
1071
1072 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1073 (SCM obj),
1074 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1075 #define FUNC_NAME s_scm_string_p
1076 {
1077 return scm_from_bool (IS_STRING (obj));
1078 }
1079 #undef FUNC_NAME
1080
1081
1082 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1083
1084 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1085 (SCM chrs),
1086 "@deffnx {Scheme Procedure} list->string chrs\n"
1087 "Return a newly allocated string composed of the arguments,\n"
1088 "@var{chrs}.")
1089 #define FUNC_NAME s_scm_string
1090 {
1091 SCM result = SCM_BOOL_F;
1092 SCM rest;
1093 size_t len;
1094 size_t p = 0;
1095 long i;
1096 int wide = 0;
1097
1098 /* Verify that this is a list of chars. */
1099 i = scm_ilength (chrs);
1100 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1101
1102 len = (size_t) i;
1103 rest = chrs;
1104
1105 while (len > 0 && scm_is_pair (rest))
1106 {
1107 SCM elt = SCM_CAR (rest);
1108 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1109 if (SCM_CHAR (elt) > 0xFF)
1110 wide = 1;
1111 rest = SCM_CDR (rest);
1112 len--;
1113 scm_remember_upto_here_1 (elt);
1114 }
1115
1116 /* Construct a string containing this list of chars. */
1117 len = (size_t) i;
1118 rest = chrs;
1119
1120 if (wide == 0)
1121 {
1122 char *buf;
1123
1124 result = scm_i_make_string (len, NULL, 0);
1125 result = scm_i_string_start_writing (result);
1126 buf = scm_i_string_writable_chars (result);
1127 while (len > 0 && scm_is_pair (rest))
1128 {
1129 SCM elt = SCM_CAR (rest);
1130 buf[p] = (unsigned char) SCM_CHAR (elt);
1131 p++;
1132 rest = SCM_CDR (rest);
1133 len--;
1134 scm_remember_upto_here_1 (elt);
1135 }
1136 }
1137 else
1138 {
1139 scm_t_wchar *buf;
1140
1141 result = scm_i_make_wide_string (len, NULL, 0);
1142 result = scm_i_string_start_writing (result);
1143 buf = scm_i_string_writable_wide_chars (result);
1144 while (len > 0 && scm_is_pair (rest))
1145 {
1146 SCM elt = SCM_CAR (rest);
1147 buf[p] = SCM_CHAR (elt);
1148 p++;
1149 rest = SCM_CDR (rest);
1150 len--;
1151 scm_remember_upto_here_1 (elt);
1152 }
1153 }
1154 scm_i_string_stop_writing ();
1155
1156 if (len > 0)
1157 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1158 if (!scm_is_null (rest))
1159 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1160
1161 return result;
1162 }
1163 #undef FUNC_NAME
1164
1165 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1166 (SCM k, SCM chr),
1167 "Return a newly allocated string of\n"
1168 "length @var{k}. If @var{chr} is given, then all elements of\n"
1169 "the string are initialized to @var{chr}, otherwise the contents\n"
1170 "of the string are all set to @code{#\nul}.")
1171 #define FUNC_NAME s_scm_make_string
1172 {
1173 return scm_c_make_string (scm_to_size_t (k), chr);
1174 }
1175 #undef FUNC_NAME
1176
1177 SCM
1178 scm_c_make_string (size_t len, SCM chr)
1179 #define FUNC_NAME NULL
1180 {
1181 size_t p;
1182 char *contents = NULL;
1183 SCM res = scm_i_make_string (len, &contents, 0);
1184
1185 /* If no char is given, initialize string contents to NULL. */
1186 if (SCM_UNBNDP (chr))
1187 memset (contents, 0, len);
1188 else
1189 {
1190 SCM_VALIDATE_CHAR (0, chr);
1191 res = scm_i_string_start_writing (res);
1192 for (p = 0; p < len; p++)
1193 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1194 scm_i_string_stop_writing ();
1195 }
1196
1197 return res;
1198 }
1199 #undef FUNC_NAME
1200
1201 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1202 (SCM string),
1203 "Return the number of characters in @var{string}.")
1204 #define FUNC_NAME s_scm_string_length
1205 {
1206 SCM_VALIDATE_STRING (1, string);
1207 return scm_from_size_t (STRING_LENGTH (string));
1208 }
1209 #undef FUNC_NAME
1210
1211 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1212 (SCM string),
1213 "Return the bytes used to represent a character in @var{string}."
1214 "This will return 1 or 4.")
1215 #define FUNC_NAME s_scm_string_bytes_per_char
1216 {
1217 SCM_VALIDATE_STRING (1, string);
1218 if (!scm_i_is_narrow_string (string))
1219 return scm_from_int (4);
1220
1221 return scm_from_int (1);
1222 }
1223 #undef FUNC_NAME
1224
1225 size_t
1226 scm_c_string_length (SCM string)
1227 {
1228 if (!IS_STRING (string))
1229 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1230 return STRING_LENGTH (string);
1231 }
1232
1233 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1234 (SCM str, SCM k),
1235 "Return character @var{k} of @var{str} using zero-origin\n"
1236 "indexing. @var{k} must be a valid index of @var{str}.")
1237 #define FUNC_NAME s_scm_string_ref
1238 {
1239 size_t len;
1240 unsigned long idx;
1241
1242 SCM_VALIDATE_STRING (1, str);
1243
1244 len = scm_i_string_length (str);
1245 if (SCM_LIKELY (len > 0))
1246 idx = scm_to_unsigned_integer (k, 0, len - 1);
1247 else
1248 scm_out_of_range (NULL, k);
1249
1250 if (scm_i_is_narrow_string (str))
1251 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1252 else
1253 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1254 }
1255 #undef FUNC_NAME
1256
1257 SCM
1258 scm_c_string_ref (SCM str, size_t p)
1259 {
1260 if (p >= scm_i_string_length (str))
1261 scm_out_of_range (NULL, scm_from_size_t (p));
1262 if (scm_i_is_narrow_string (str))
1263 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1264 else
1265 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1266
1267 }
1268
1269 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1270 (SCM str, SCM k, SCM chr),
1271 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1272 "an unspecified value. @var{k} must be a valid index of\n"
1273 "@var{str}.")
1274 #define FUNC_NAME s_scm_string_set_x
1275 {
1276 size_t len;
1277 unsigned long idx;
1278
1279 SCM_VALIDATE_STRING (1, str);
1280
1281 len = scm_i_string_length (str);
1282 if (SCM_LIKELY (len > 0))
1283 idx = scm_to_unsigned_integer (k, 0, len - 1);
1284 else
1285 scm_out_of_range (NULL, k);
1286
1287 SCM_VALIDATE_CHAR (3, chr);
1288 str = scm_i_string_start_writing (str);
1289 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1290 scm_i_string_stop_writing ();
1291
1292 return SCM_UNSPECIFIED;
1293 }
1294 #undef FUNC_NAME
1295
1296 void
1297 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1298 {
1299 if (p >= scm_i_string_length (str))
1300 scm_out_of_range (NULL, scm_from_size_t (p));
1301 str = scm_i_string_start_writing (str);
1302 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1303 scm_i_string_stop_writing ();
1304 }
1305
1306 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1307 (SCM str, SCM start, SCM end),
1308 "Return a newly allocated string formed from the characters\n"
1309 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1310 "ending with index @var{end} (exclusive).\n"
1311 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1312 "exact integers satisfying:\n\n"
1313 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1314 #define FUNC_NAME s_scm_substring
1315 {
1316 size_t len, from, to;
1317
1318 SCM_VALIDATE_STRING (1, str);
1319 len = scm_i_string_length (str);
1320 from = scm_to_unsigned_integer (start, 0, len);
1321 if (SCM_UNBNDP (end))
1322 to = len;
1323 else
1324 to = scm_to_unsigned_integer (end, from, len);
1325 return scm_i_substring (str, from, to);
1326 }
1327 #undef FUNC_NAME
1328
1329 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1330 (SCM str, SCM start, SCM end),
1331 "Return a newly allocated string formed from the characters\n"
1332 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1333 "ending with index @var{end} (exclusive).\n"
1334 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1335 "exact integers satisfying:\n"
1336 "\n"
1337 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1338 "\n"
1339 "The returned string is read-only.\n")
1340 #define FUNC_NAME s_scm_substring_read_only
1341 {
1342 size_t len, from, to;
1343
1344 SCM_VALIDATE_STRING (1, str);
1345 len = scm_i_string_length (str);
1346 from = scm_to_unsigned_integer (start, 0, len);
1347 if (SCM_UNBNDP (end))
1348 to = len;
1349 else
1350 to = scm_to_unsigned_integer (end, from, len);
1351 return scm_i_substring_read_only (str, from, to);
1352 }
1353 #undef FUNC_NAME
1354
1355 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1356 (SCM str, SCM start, SCM end),
1357 "Return a newly allocated string formed from the characters\n"
1358 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1359 "ending with index @var{end} (exclusive).\n"
1360 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1361 "exact integers satisfying:\n\n"
1362 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1363 #define FUNC_NAME s_scm_substring_copy
1364 {
1365 /* For the Scheme version, START is mandatory, but for the C
1366 version, it is optional. See scm_string_copy in srfi-13.c for a
1367 rationale.
1368 */
1369
1370 size_t from, to;
1371
1372 SCM_VALIDATE_STRING (1, str);
1373 scm_i_get_substring_spec (scm_i_string_length (str),
1374 start, &from, end, &to);
1375 return scm_i_substring_copy (str, from, to);
1376 }
1377 #undef FUNC_NAME
1378
1379 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1380 (SCM str, SCM start, SCM end),
1381 "Return string that indirectly refers to the characters\n"
1382 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1383 "ending with index @var{end} (exclusive).\n"
1384 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1385 "exact integers satisfying:\n\n"
1386 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1387 #define FUNC_NAME s_scm_substring_shared
1388 {
1389 size_t len, from, to;
1390
1391 SCM_VALIDATE_STRING (1, str);
1392 len = scm_i_string_length (str);
1393 from = scm_to_unsigned_integer (start, 0, len);
1394 if (SCM_UNBNDP (end))
1395 to = len;
1396 else
1397 to = scm_to_unsigned_integer (end, from, len);
1398 return scm_i_substring_shared (str, from, to);
1399 }
1400 #undef FUNC_NAME
1401
1402 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1403 (SCM args),
1404 "Return a newly allocated string whose characters form the\n"
1405 "concatenation of the given strings, @var{args}.")
1406 #define FUNC_NAME s_scm_string_append
1407 {
1408 SCM res;
1409 size_t total = 0;
1410 size_t len;
1411 int wide = 0;
1412 SCM l, s;
1413 size_t i;
1414 union
1415 {
1416 char *narrow;
1417 scm_t_wchar *wide;
1418 } data;
1419
1420 SCM_VALIDATE_REST_ARGUMENT (args);
1421 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1422 {
1423 s = SCM_CAR (l);
1424 SCM_VALIDATE_STRING (SCM_ARGn, s);
1425 len = scm_i_string_length (s);
1426 if (((size_t) -1) - total < len)
1427 scm_num_overflow (s_scm_string_append);
1428 total += len;
1429 if (!scm_i_is_narrow_string (s))
1430 wide = 1;
1431 }
1432 data.narrow = NULL;
1433 if (!wide)
1434 res = scm_i_make_string (total, &data.narrow, 0);
1435 else
1436 res = scm_i_make_wide_string (total, &data.wide, 0);
1437
1438 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1439 {
1440 size_t len;
1441 s = SCM_CAR (l);
1442 SCM_VALIDATE_STRING (SCM_ARGn, s);
1443 len = scm_i_string_length (s);
1444 if (len > total)
1445 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1446 if (!wide)
1447 {
1448 memcpy (data.narrow, scm_i_string_chars (s), len);
1449 data.narrow += len;
1450 }
1451 else
1452 {
1453 if (scm_i_is_narrow_string (s))
1454 {
1455 const char *src = scm_i_string_chars (s);
1456 for (i = 0; i < len; i++)
1457 data.wide[i] = (unsigned char) src[i];
1458 }
1459 else
1460 u32_cpy ((scm_t_uint32 *) data.wide,
1461 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1462 data.wide += len;
1463 }
1464 total -= len;
1465 scm_remember_upto_here_1 (s);
1466 }
1467 if (total != 0)
1468 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1469 return res;
1470 }
1471 #undef FUNC_NAME
1472
1473
1474 \f
1475 /* Charset conversion error handling. */
1476
1477 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1478 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1479
1480 /* Raise an exception informing that character CHR could not be written
1481 to PORT in its current encoding. */
1482 void
1483 scm_encoding_error (const char *subr, int err, const char *message,
1484 SCM port, SCM chr)
1485 {
1486 scm_throw (scm_encoding_error_key,
1487 scm_list_n (scm_from_latin1_string (subr),
1488 scm_from_latin1_string (message),
1489 scm_from_int (err),
1490 port, chr,
1491 SCM_UNDEFINED));
1492 }
1493
1494 /* Raise an exception informing of an encoding error on PORT. This
1495 means that a character could not be written in PORT's encoding. */
1496 void
1497 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1498 {
1499 scm_throw (scm_decoding_error_key,
1500 scm_list_n (scm_from_latin1_string (subr),
1501 scm_from_latin1_string (message),
1502 scm_from_int (err),
1503 port,
1504 SCM_UNDEFINED));
1505 }
1506
1507 \f
1508 /* String conversion to/from C. */
1509
1510 SCM
1511 scm_from_stringn (const char *str, size_t len, const char *encoding,
1512 scm_t_string_failed_conversion_handler handler)
1513 {
1514 size_t u32len, i;
1515 scm_t_wchar *u32;
1516 int wide = 0;
1517 SCM res;
1518
1519 /* The order of these checks is important. */
1520 if (!str && len != 0)
1521 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1522 if (len == (size_t) -1)
1523 len = strlen (str);
1524
1525 if (encoding == NULL || len == 0)
1526 {
1527 /* If encoding is null (or the string is empty), use Latin-1. */
1528 char *buf;
1529 res = scm_i_make_string (len, &buf, 0);
1530 memcpy (buf, str, len);
1531 return res;
1532 }
1533
1534 u32len = 0;
1535 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1536 (enum iconv_ilseq_handler)
1537 handler,
1538 str, len,
1539 NULL,
1540 NULL, &u32len);
1541
1542 if (SCM_UNLIKELY (u32 == NULL))
1543 {
1544 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1545 handler. */
1546 SCM bv;
1547 signed char *buf;
1548
1549 buf = scm_gc_malloc_pointerless (len, "bytevector");
1550 memcpy (buf, str, len);
1551 bv = scm_c_take_gc_bytevector (buf, len);
1552
1553 scm_decoding_error (__func__, errno,
1554 "input locale conversion error", bv);
1555 }
1556
1557 i = 0;
1558 while (i < u32len)
1559 if (u32[i++] > 0xFF)
1560 {
1561 wide = 1;
1562 break;
1563 }
1564
1565 if (!wide)
1566 {
1567 char *dst;
1568 res = scm_i_make_string (u32len, &dst, 0);
1569 for (i = 0; i < u32len; i ++)
1570 dst[i] = (unsigned char) u32[i];
1571 dst[u32len] = '\0';
1572 }
1573 else
1574 {
1575 scm_t_wchar *wdst;
1576 res = scm_i_make_wide_string (u32len, &wdst, 0);
1577 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1578 wdst[u32len] = 0;
1579 }
1580
1581 free (u32);
1582 return res;
1583 }
1584
1585 SCM
1586 scm_from_locale_string (const char *str)
1587 {
1588 return scm_from_locale_stringn (str, -1);
1589 }
1590
1591 SCM
1592 scm_from_locale_stringn (const char *str, size_t len)
1593 {
1594 return scm_from_stringn (str, len, locale_charset (),
1595 scm_i_default_port_conversion_handler ());
1596 }
1597
1598 SCM
1599 scm_from_latin1_string (const char *str)
1600 {
1601 return scm_from_latin1_stringn (str, -1);
1602 }
1603
1604 SCM
1605 scm_from_latin1_stringn (const char *str, size_t len)
1606 {
1607 char *buf;
1608 SCM result;
1609
1610 if (len == (size_t) -1)
1611 len = strlen (str);
1612
1613 /* Make a narrow string and copy STR as is. */
1614 result = scm_i_make_string (len, &buf, 0);
1615 memcpy (buf, str, len);
1616
1617 return result;
1618 }
1619
1620 SCM
1621 scm_from_utf8_string (const char *str)
1622 {
1623 return scm_from_utf8_stringn (str, -1);
1624 }
1625
1626 SCM
1627 scm_from_utf8_stringn (const char *str, size_t len)
1628 {
1629 return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1630 }
1631
1632 SCM
1633 scm_from_utf32_string (const scm_t_wchar *str)
1634 {
1635 return scm_from_utf32_stringn (str, -1);
1636 }
1637
1638 SCM
1639 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1640 {
1641 SCM result;
1642 scm_t_wchar *buf;
1643
1644 if (len == (size_t) -1)
1645 len = u32_strlen ((uint32_t *) str);
1646
1647 result = scm_i_make_wide_string (len, &buf, 0);
1648 memcpy (buf, str, len * sizeof (scm_t_wchar));
1649 scm_i_try_narrow_string (result);
1650
1651 return result;
1652 }
1653
1654 /* Create a new scheme string from the C string STR. The memory of
1655 STR may be used directly as storage for the new string. */
1656 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1657 would be to register a finalizer to eventually free(3) STR, which isn't
1658 worth it. Should we just deprecate the `scm_take_' functions? */
1659 SCM
1660 scm_take_locale_stringn (char *str, size_t len)
1661 {
1662 SCM res;
1663
1664 res = scm_from_locale_stringn (str, len);
1665 free (str);
1666
1667 return res;
1668 }
1669
1670 SCM
1671 scm_take_locale_string (char *str)
1672 {
1673 return scm_take_locale_stringn (str, -1);
1674 }
1675
1676 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1677 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1678 Set *LENP to the size of the resulting string.
1679
1680 FIXME: This is a hack we should get rid of. See
1681 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1682 for details. */
1683 static void
1684 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1685 {
1686 char *before, *after;
1687 size_t i, j;
1688
1689 before = buf;
1690 after = buf;
1691 i = 0;
1692 j = 0;
1693 while (i < *lenp)
1694 {
1695 if ((i <= *lenp - 6)
1696 && before[i] == '\\'
1697 && before[i + 1] == 'u'
1698 && before[i + 2] == '0' && before[i + 3] == '0')
1699 {
1700 /* Convert \u00NN to \xNN */
1701 after[j] = '\\';
1702 after[j + 1] = 'x';
1703 after[j + 2] = tolower ((int) before[i + 4]);
1704 after[j + 3] = tolower ((int) before[i + 5]);
1705 i += 6;
1706 j += 4;
1707 }
1708 else if ((i <= *lenp - 10)
1709 && before[i] == '\\'
1710 && before[i + 1] == 'U'
1711 && before[i + 2] == '0' && before[i + 3] == '0')
1712 {
1713 /* Convert \U00NNNNNN to \UNNNNNN */
1714 after[j] = '\\';
1715 after[j + 1] = 'U';
1716 after[j + 2] = tolower ((int) before[i + 4]);
1717 after[j + 3] = tolower ((int) before[i + 5]);
1718 after[j + 4] = tolower ((int) before[i + 6]);
1719 after[j + 5] = tolower ((int) before[i + 7]);
1720 after[j + 6] = tolower ((int) before[i + 8]);
1721 after[j + 7] = tolower ((int) before[i + 9]);
1722 i += 10;
1723 j += 8;
1724 }
1725 else
1726 {
1727 after[j] = before[i];
1728 i++;
1729 j++;
1730 }
1731 }
1732 *lenp = j;
1733 }
1734
1735 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1736 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1737 of the resulting string. BUF must be large enough to handle the
1738 worst case when `\uXXXX' escapes (6 characters) are replaced by
1739 `\xXXXX;' (7 characters). */
1740 static void
1741 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1742 {
1743 char *before, *after;
1744 size_t i, j;
1745 /* The worst case is if the input string contains all 4-digit hex escapes.
1746 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1747 size_t max_out_len = (*lenp * 7) / 6 + 1;
1748 size_t nzeros, ndigits;
1749
1750 before = buf;
1751 after = alloca (max_out_len);
1752 i = 0;
1753 j = 0;
1754 while (i < *lenp)
1755 {
1756 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1757 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1758 {
1759 if (before[i + 1] == 'u')
1760 ndigits = 4;
1761 else if (before[i + 1] == 'U')
1762 ndigits = 8;
1763 else
1764 abort ();
1765
1766 /* Add the R6RS hex escape initial sequence. */
1767 after[j] = '\\';
1768 after[j + 1] = 'x';
1769
1770 /* Move string positions to the start of the hex numbers. */
1771 i += 2;
1772 j += 2;
1773
1774 /* Find the number of initial zeros in this hex number. */
1775 nzeros = 0;
1776 while (before[i + nzeros] == '0' && nzeros < ndigits)
1777 nzeros++;
1778
1779 /* Copy the number, skipping initial zeros, and then move the string
1780 positions. */
1781 if (nzeros == ndigits)
1782 {
1783 after[j] = '0';
1784 i += ndigits;
1785 j += 1;
1786 }
1787 else
1788 {
1789 int pos;
1790 for (pos = 0; pos < ndigits - nzeros; pos++)
1791 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1792 i += ndigits;
1793 j += (ndigits - nzeros);
1794 }
1795
1796 /* Add terminating semicolon. */
1797 after[j] = ';';
1798 j++;
1799 }
1800 else
1801 {
1802 after[j] = before[i];
1803 i++;
1804 j++;
1805 }
1806 }
1807 *lenp = j;
1808 memcpy (before, after, j);
1809 }
1810
1811 char *
1812 scm_to_locale_string (SCM str)
1813 {
1814 return scm_to_locale_stringn (str, NULL);
1815 }
1816
1817 char *
1818 scm_to_locale_stringn (SCM str, size_t *lenp)
1819 {
1820 return scm_to_stringn (str, lenp,
1821 locale_charset (),
1822 scm_i_default_port_conversion_handler ());
1823 }
1824
1825 char *
1826 scm_to_latin1_string (SCM str)
1827 {
1828 return scm_to_latin1_stringn (str, NULL);
1829 }
1830
1831 char *
1832 scm_to_latin1_stringn (SCM str, size_t *lenp)
1833 #define FUNC_NAME "scm_to_latin1_stringn"
1834 {
1835 char *result;
1836
1837 SCM_VALIDATE_STRING (1, str);
1838
1839 if (scm_i_is_narrow_string (str))
1840 {
1841 size_t len = scm_i_string_length (str);
1842
1843 if (lenp)
1844 *lenp = len;
1845
1846 result = scm_strndup (scm_i_string_data (str), len);
1847 }
1848 else
1849 result = scm_to_stringn (str, lenp, NULL,
1850 SCM_FAILED_CONVERSION_ERROR);
1851
1852 return result;
1853 }
1854 #undef FUNC_NAME
1855
1856 char *
1857 scm_to_utf8_string (SCM str)
1858 {
1859 return scm_to_utf8_stringn (str, NULL);
1860 }
1861
1862 static size_t
1863 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1864 {
1865 size_t ret, i;
1866 for (i = 0, ret = 0; i < len; i++)
1867 ret += (str[i] < 128) ? 1 : 2;
1868 return ret;
1869 }
1870
1871 static scm_t_uint8*
1872 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1873 scm_t_uint8 *u8_result, size_t *u8_lenp)
1874 {
1875 size_t i, n;
1876 size_t u8_len = latin1_u8_strlen (str, latin_len);
1877
1878 if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1879 u8_result = scm_malloc (u8_len + 1);
1880 if (u8_lenp)
1881 *u8_lenp = u8_len;
1882
1883 for (i = 0, n = 0; i < latin_len; i++)
1884 n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1885 if (n != u8_len)
1886 abort ();
1887 u8_result[n] = 0;
1888
1889 return u8_result;
1890 }
1891
1892 /* UTF-8 code table
1893
1894 (Note that this includes code points that are not allowed by Unicode,
1895 but since this function has no way to report an error, and its
1896 purpose is to determine the size of destination buffers for
1897 libunicode conversion functions, we err on the safe side and handle
1898 everything that libunicode might conceivably handle, now or in the
1899 future.)
1900
1901 Char. number range | UTF-8 octet sequence
1902 (hexadecimal) | (binary)
1903 --------------------+------------------------------------------------------
1904 0000 0000-0000 007F | 0xxxxxxx
1905 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
1906 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1907 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1908 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1909 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1910 */
1911
1912 static size_t
1913 u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
1914 {
1915 size_t ret, i;
1916
1917 for (i = 0, ret = 0; i < len; i++)
1918 {
1919 scm_t_uint32 c = str[i];
1920
1921 if (c <= 0x7f)
1922 ret += 1;
1923 else if (c <= 0x7ff)
1924 ret += 2;
1925 else if (c <= 0xffff)
1926 ret += 3;
1927 else if (c <= 0x1fffff)
1928 ret += 4;
1929 else if (c <= 0x3ffffff)
1930 ret += 5;
1931 else
1932 ret += 6;
1933 }
1934
1935 return ret;
1936 }
1937
1938 char *
1939 scm_to_utf8_stringn (SCM str, size_t *lenp)
1940 #define FUNC_NAME "scm_to_utf8_stringn"
1941 {
1942 SCM_VALIDATE_STRING (1, str);
1943
1944 if (scm_i_is_narrow_string (str))
1945 return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
1946 scm_i_string_length (str),
1947 NULL, lenp);
1948 else
1949 {
1950 scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
1951 scm_t_uint8 *buf, *ret;
1952 size_t num_chars = scm_i_string_length (str);
1953 size_t num_bytes_predicted, num_bytes_actual;
1954
1955 num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
1956
1957 if (lenp)
1958 {
1959 *lenp = num_bytes_predicted;
1960 buf = scm_malloc (num_bytes_predicted);
1961 }
1962 else
1963 {
1964 buf = scm_malloc (num_bytes_predicted + 1);
1965 buf[num_bytes_predicted] = 0;
1966 }
1967
1968 num_bytes_actual = num_bytes_predicted;
1969 ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
1970
1971 if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
1972 return (char *) ret;
1973
1974 /* An error: a bad codepoint. */
1975 {
1976 int saved_errno = errno;
1977
1978 free (buf);
1979 if (!saved_errno)
1980 abort ();
1981
1982 scm_decoding_error ("scm_to_utf8_stringn", errno,
1983 "invalid codepoint in string", str);
1984
1985 /* Not reached. */
1986 return NULL;
1987 }
1988 }
1989 }
1990 #undef FUNC_NAME
1991
1992 scm_t_wchar *
1993 scm_to_utf32_string (SCM str)
1994 {
1995 return scm_to_utf32_stringn (str, NULL);
1996 }
1997
1998 scm_t_wchar *
1999 scm_to_utf32_stringn (SCM str, size_t *lenp)
2000 #define FUNC_NAME "scm_to_utf32_stringn"
2001 {
2002 scm_t_wchar *result;
2003
2004 SCM_VALIDATE_STRING (1, str);
2005
2006 if (scm_i_is_narrow_string (str))
2007 {
2008 scm_t_uint8 *codepoints;
2009 size_t i, len;
2010
2011 codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2012 len = scm_i_string_length (str);
2013 if (lenp)
2014 *lenp = len;
2015
2016 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2017 for (i = 0; i < len; i++)
2018 result[i] = codepoints[i];
2019 result[len] = 0;
2020 }
2021 else
2022 {
2023 size_t len;
2024
2025 len = scm_i_string_length (str);
2026 if (lenp)
2027 *lenp = len;
2028
2029 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2030 memcpy (result, scm_i_string_wide_chars (str),
2031 len * sizeof (scm_t_wchar));
2032 result[len] = 0;
2033 }
2034
2035 return result;
2036 }
2037 #undef FUNC_NAME
2038
2039 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2040 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2041 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2042 defined by HANDLER. */
2043 char *
2044 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2045 scm_t_string_failed_conversion_handler handler)
2046 {
2047 char *buf;
2048 size_t ilen, len, i;
2049 int ret;
2050 const char *enc;
2051
2052 if (!scm_is_string (str))
2053 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2054 ilen = scm_i_string_length (str);
2055
2056 if (ilen == 0)
2057 {
2058 buf = scm_malloc (1);
2059 buf[0] = '\0';
2060 if (lenp)
2061 *lenp = 0;
2062 return buf;
2063 }
2064
2065 if (lenp == NULL)
2066 for (i = 0; i < ilen; i++)
2067 if (scm_i_string_ref (str, i) == '\0')
2068 scm_misc_error (NULL,
2069 "string contains #\\nul character: ~S",
2070 scm_list_1 (str));
2071
2072 if (scm_i_is_narrow_string (str) && (encoding == NULL))
2073 {
2074 /* If using native Latin-1 encoding, just copy the string
2075 contents. */
2076 if (lenp)
2077 {
2078 buf = scm_malloc (ilen);
2079 memcpy (buf, scm_i_string_chars (str), ilen);
2080 *lenp = ilen;
2081 return buf;
2082 }
2083 else
2084 {
2085 buf = scm_malloc (ilen + 1);
2086 memcpy (buf, scm_i_string_chars (str), ilen);
2087 buf[ilen] = '\0';
2088 return buf;
2089 }
2090 }
2091
2092
2093 buf = NULL;
2094 len = 0;
2095 enc = encoding;
2096 if (enc == NULL)
2097 enc = "ISO-8859-1";
2098 if (scm_i_is_narrow_string (str))
2099 {
2100 ret = mem_iconveh (scm_i_string_chars (str), ilen,
2101 "ISO-8859-1", enc,
2102 (enum iconv_ilseq_handler) handler, NULL,
2103 &buf, &len);
2104
2105 if (ret != 0)
2106 scm_encoding_error (__func__, errno,
2107 "cannot convert narrow string to output locale",
2108 SCM_BOOL_F,
2109 /* FIXME: Faulty character unknown. */
2110 SCM_BOOL_F);
2111 }
2112 else
2113 {
2114 buf = u32_conv_to_encoding (enc,
2115 (enum iconv_ilseq_handler) handler,
2116 (scm_t_uint32 *) scm_i_string_wide_chars (str),
2117 ilen,
2118 NULL,
2119 NULL, &len);
2120 if (buf == NULL)
2121 scm_encoding_error (__func__, errno,
2122 "cannot convert wide string to output locale",
2123 SCM_BOOL_F,
2124 /* FIXME: Faulty character unknown. */
2125 SCM_BOOL_F);
2126 }
2127 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2128 {
2129 if (SCM_R6RS_ESCAPES_P)
2130 {
2131 /* The worst case is if the input string contains all 4-digit
2132 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2133 (seven characters). Make BUF large enough to hold
2134 that. */
2135 buf = scm_realloc (buf, (len * 7) / 6 + 1);
2136 unistring_escapes_to_r6rs_escapes (buf, &len);
2137 }
2138 else
2139 unistring_escapes_to_guile_escapes (buf, &len);
2140
2141 buf = scm_realloc (buf, len);
2142 }
2143 if (lenp)
2144 *lenp = len;
2145 else
2146 {
2147 buf = scm_realloc (buf, len + 1);
2148 buf[len] = '\0';
2149 }
2150
2151 scm_remember_upto_here_1 (str);
2152 return buf;
2153 }
2154
2155 size_t
2156 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2157 {
2158 size_t len;
2159 char *result = NULL;
2160 if (!scm_is_string (str))
2161 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2162 result = scm_to_locale_stringn (str, &len);
2163
2164 memcpy (buf, result, (len > max_len) ? max_len : len);
2165 free (result);
2166
2167 scm_remember_upto_here_1 (str);
2168 return len;
2169 }
2170
2171 \f
2172 /* Unicode string normalization. */
2173
2174 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2175 libguile/i18n.c. It would be useful to have this factored out into a more
2176 convenient location, but its use of alloca makes that tricky to do. */
2177
2178 static SCM
2179 normalize_str (SCM string, uninorm_t form)
2180 {
2181 SCM ret;
2182 scm_t_uint32 *w_str;
2183 scm_t_wchar *cbuf;
2184 size_t rlen, len = scm_i_string_length (string);
2185
2186 if (scm_i_is_narrow_string (string))
2187 {
2188 size_t i;
2189 const char *buf = scm_i_string_chars (string);
2190
2191 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2192
2193 for (i = 0; i < len; i ++)
2194 w_str[i] = (unsigned char) buf[i];
2195 w_str[len] = 0;
2196 }
2197 else
2198 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2199
2200 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2201
2202 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2203 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2204 free (w_str);
2205
2206 scm_i_try_narrow_string (ret);
2207
2208 return ret;
2209 }
2210
2211 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2212 (SCM string),
2213 "Returns the NFC normalized form of @var{string}.")
2214 #define FUNC_NAME s_scm_string_normalize_nfc
2215 {
2216 SCM_VALIDATE_STRING (1, string);
2217 return normalize_str (string, UNINORM_NFC);
2218 }
2219 #undef FUNC_NAME
2220
2221 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2222 (SCM string),
2223 "Returns the NFD normalized form of @var{string}.")
2224 #define FUNC_NAME s_scm_string_normalize_nfd
2225 {
2226 SCM_VALIDATE_STRING (1, string);
2227 return normalize_str (string, UNINORM_NFD);
2228 }
2229 #undef FUNC_NAME
2230
2231 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2232 (SCM string),
2233 "Returns the NFKC normalized form of @var{string}.")
2234 #define FUNC_NAME s_scm_string_normalize_nfkc
2235 {
2236 SCM_VALIDATE_STRING (1, string);
2237 return normalize_str (string, UNINORM_NFKC);
2238 }
2239 #undef FUNC_NAME
2240
2241 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2242 (SCM string),
2243 "Returns the NFKD normalized form of @var{string}.")
2244 #define FUNC_NAME s_scm_string_normalize_nfkd
2245 {
2246 SCM_VALIDATE_STRING (1, string);
2247 return normalize_str (string, UNINORM_NFKD);
2248 }
2249 #undef FUNC_NAME
2250
2251 /* converts C scm_array of strings to SCM scm_list of strings.
2252 If argc < 0, a null terminated scm_array is assumed.
2253 The current locale encoding is assumed */
2254 SCM
2255 scm_makfromstrs (int argc, char **argv)
2256 {
2257 int i = argc;
2258 SCM lst = SCM_EOL;
2259 if (0 > i)
2260 for (i = 0; argv[i]; i++);
2261 while (i--)
2262 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2263 return lst;
2264 }
2265
2266 /* Return a newly allocated array of char pointers to each of the strings
2267 in args, with a terminating NULL pointer. The strings are encoded using
2268 the current locale. */
2269
2270 char **
2271 scm_i_allocate_string_pointers (SCM list)
2272 #define FUNC_NAME "scm_i_allocate_string_pointers"
2273 {
2274 char **result;
2275 int list_len = scm_ilength (list);
2276 int i;
2277
2278 if (list_len < 0)
2279 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2280
2281 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2282 "string pointers");
2283 result[list_len] = NULL;
2284
2285 /* The list might have been modified in another thread, so
2286 we check LIST before each access.
2287 */
2288 for (i = 0; i < list_len && scm_is_pair (list); i++)
2289 {
2290 SCM str = SCM_CAR (list);
2291 size_t len; /* String length in bytes */
2292 char *c_str = scm_to_locale_stringn (str, &len);
2293
2294 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2295 scm_malloc to allocate the returned string, which must be
2296 explicitly deallocated. This forces us to copy the string a
2297 second time into a new buffer. Ideally there would be variants
2298 of scm_to_*_stringn that can return garbage-collected buffers. */
2299
2300 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2301 memcpy (result[i], c_str, len);
2302 result[i][len] = '\0';
2303 free (c_str);
2304
2305 list = SCM_CDR (list);
2306 }
2307
2308 return result;
2309 }
2310 #undef FUNC_NAME
2311
2312 void
2313 scm_i_get_substring_spec (size_t len,
2314 SCM start, size_t *cstart,
2315 SCM end, size_t *cend)
2316 {
2317 if (SCM_UNBNDP (start))
2318 *cstart = 0;
2319 else
2320 *cstart = scm_to_unsigned_integer (start, 0, len);
2321
2322 if (SCM_UNBNDP (end))
2323 *cend = len;
2324 else
2325 *cend = scm_to_unsigned_integer (end, *cstart, len);
2326 }
2327
2328 #if SCM_ENABLE_DEPRECATED
2329
2330 /* When these definitions are removed, it becomes reasonable to use
2331 read-only strings for string literals. For that, change the reader
2332 to create string literals with scm_c_substring_read_only instead of
2333 with scm_c_substring_copy.
2334 */
2335
2336 int
2337 scm_i_deprecated_stringp (SCM str)
2338 {
2339 scm_c_issue_deprecation_warning
2340 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2341
2342 return scm_is_string (str);
2343 }
2344
2345 char *
2346 scm_i_deprecated_string_chars (SCM str)
2347 {
2348 char *chars;
2349
2350 scm_c_issue_deprecation_warning
2351 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2352
2353 /* We don't accept shared substrings here since they are not
2354 null-terminated.
2355 */
2356 if (IS_SH_STRING (str))
2357 scm_misc_error (NULL,
2358 "SCM_STRING_CHARS does not work with shared substrings",
2359 SCM_EOL);
2360
2361 /* We explicitly test for read-only strings to produce a better
2362 error message.
2363 */
2364
2365 if (IS_RO_STRING (str))
2366 scm_misc_error (NULL,
2367 "SCM_STRING_CHARS does not work with read-only strings",
2368 SCM_EOL);
2369
2370 /* The following is still wrong, of course...
2371 */
2372 str = scm_i_string_start_writing (str);
2373 chars = scm_i_string_writable_chars (str);
2374 scm_i_string_stop_writing ();
2375 return chars;
2376 }
2377
2378 size_t
2379 scm_i_deprecated_string_length (SCM str)
2380 {
2381 scm_c_issue_deprecation_warning
2382 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2383 return scm_c_string_length (str);
2384 }
2385
2386 #endif
2387
2388 static SCM
2389 string_handle_ref (scm_t_array_handle *h, size_t index)
2390 {
2391 return scm_c_string_ref (h->array, index);
2392 }
2393
2394 static void
2395 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2396 {
2397 scm_c_string_set_x (h->array, index, val);
2398 }
2399
2400 static void
2401 string_get_handle (SCM v, scm_t_array_handle *h)
2402 {
2403 h->array = v;
2404 h->ndims = 1;
2405 h->dims = &h->dim0;
2406 h->dim0.lbnd = 0;
2407 h->dim0.ubnd = scm_c_string_length (v) - 1;
2408 h->dim0.inc = 1;
2409 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2410 h->elements = h->writable_elements = NULL;
2411 }
2412
2413 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2414 string_handle_ref, string_handle_set,
2415 string_get_handle)
2416 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2417
2418 void
2419 scm_init_strings ()
2420 {
2421 scm_nullstr = scm_i_make_string (0, NULL, 0);
2422
2423 #include "libguile/strings.x"
2424 }
2425
2426
2427 /*
2428 Local Variables:
2429 c-file-style: "gnu"
2430 End:
2431 */