21295addf85af8acb78505f47055c850cf0aa86d
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <string.h>
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <unistr.h>
29 #include <uniconv.h>
30
31 #include "striconveh.h"
32
33 #include "libguile/_scm.h"
34 #include "libguile/chars.h"
35 #include "libguile/root.h"
36 #include "libguile/strings.h"
37 #include "libguile/generalized-vectors.h"
38 #include "libguile/deprecation.h"
39 #include "libguile/validate.h"
40
41 \f
42
43 /* {Strings}
44 */
45
46
47 /* Stringbufs
48 *
49 * XXX - keeping an accurate refcount during GC seems to be quite
50 * tricky, so we just keep score of whether a stringbuf might be
51 * shared, not whether it definitely is.
52 *
53 * The scheme I (mvo) tried to keep an accurate reference count would
54 * recount all strings that point to a stringbuf during the mark-phase
55 * of the GC. This was done since one cannot access the stringbuf of
56 * a string when that string is freed (in order to decrease the
57 * reference count). The memory of the stringbuf might have been
58 * reused already for something completely different.
59 *
60 * This recounted worked for a small number of threads beating on
61 * cow-strings, but it failed randomly with more than 10 threads, say.
62 * I couldn't figure out what went wrong, so I used the conservative
63 * approach implemented below.
64 *
65 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
66 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
67 * strings.
68 */
69
70 /* The size in words of the stringbuf header (type tag + size). */
71 #define STRINGBUF_HEADER_SIZE 2U
72
73 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
74
75 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
76 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
77
78 #define STRINGBUF_TAG scm_tc7_stringbuf
79 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
80 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
81
82 #define STRINGBUF_CHARS(buf) ((unsigned char *) \
83 SCM_CELL_OBJECT_LOC (buf, \
84 STRINGBUF_HEADER_SIZE))
85 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
86
87 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CHARS (buf))
88
89 #define SET_STRINGBUF_SHARED(buf) \
90 do \
91 { \
92 /* Don't modify BUF if it's already marked as shared since it might be \
93 a read-only, statically allocated stringbuf. */ \
94 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
95 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
96 } \
97 while (0)
98
99 #if SCM_STRING_LENGTH_HISTOGRAM
100 static size_t lenhist[1001];
101 #endif
102
103 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
104 characters. */
105 static SCM
106 make_stringbuf (size_t len)
107 {
108 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
109 scm_i_symbol_chars, all stringbufs are null-terminated. Once
110 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
111 has been changed for scm_i_symbol_chars, this null-termination
112 can be dropped.
113 */
114
115 SCM buf;
116
117 #if SCM_STRING_LENGTH_HISTOGRAM
118 if (len < 1000)
119 lenhist[len]++;
120 else
121 lenhist[1000]++;
122 #endif
123
124 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
125 "string"));
126
127 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
128 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
129
130 STRINGBUF_CHARS (buf)[len] = 0;
131
132 return buf;
133 }
134
135 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
136 characters. */
137 static SCM
138 make_wide_stringbuf (size_t len)
139 {
140 SCM buf;
141 size_t raw_len;
142
143 #if SCM_STRING_LENGTH_HISTOGRAM
144 if (len < 1000)
145 lenhist[len]++;
146 else
147 lenhist[1000]++;
148 #endif
149
150 raw_len = (len + 1) * sizeof (scm_t_wchar);
151 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
152 "string"));
153
154 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
155 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
156
157 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
158
159 return buf;
160 }
161
162 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
163 characters from BUF. */
164 static SCM
165 wide_stringbuf (SCM buf)
166 {
167 SCM new_buf;
168
169 if (STRINGBUF_WIDE (buf))
170 new_buf = buf;
171 else
172 {
173 size_t i, len;
174 scm_t_wchar *mem;
175
176 len = STRINGBUF_LENGTH (buf);
177
178 new_buf = make_wide_stringbuf (len);
179
180 mem = STRINGBUF_WIDE_CHARS (new_buf);
181 for (i = 0; i < len; i++)
182 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
183 mem[len] = 0;
184 }
185
186 return new_buf;
187 }
188
189 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
190 characters from BUF, if possible. */
191 static SCM
192 narrow_stringbuf (SCM buf)
193 {
194 SCM new_buf;
195
196 if (!STRINGBUF_WIDE (buf))
197 new_buf = buf;
198 else
199 {
200 size_t i, len;
201 scm_t_wchar *wmem;
202 unsigned char *mem;
203
204 len = STRINGBUF_LENGTH (buf);
205 wmem = STRINGBUF_WIDE_CHARS (buf);
206
207 for (i = 0; i < len; i++)
208 if (wmem[i] > 0xFF)
209 /* BUF cannot be narrowed. */
210 return buf;
211
212 new_buf = make_stringbuf (len);
213
214 mem = STRINGBUF_CHARS (new_buf);
215 for (i = 0; i < len; i++)
216 mem[i] = (unsigned char) wmem[i];
217 mem[len] = 0;
218 }
219
220 return new_buf;
221 }
222
223 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
224
225 \f
226 /* Copy-on-write strings.
227 */
228
229 #define STRING_TAG scm_tc7_string
230
231 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
232 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
233 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
234
235 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
236 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
237
238 #define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
239
240 /* Read-only strings.
241 */
242
243 #define RO_STRING_TAG scm_tc7_ro_string
244 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
245
246 /* Mutation-sharing substrings
247 */
248
249 #define SH_STRING_TAG (scm_tc7_string + 0x100)
250
251 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
252 /* START and LENGTH as for STRINGs. */
253
254 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
255
256 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
257 characters. CHARSP, if not NULL, will be set to location of the
258 char array. */
259 SCM
260 scm_i_make_string (size_t len, char **charsp)
261 {
262 SCM buf = make_stringbuf (len);
263 SCM res;
264 if (charsp)
265 *charsp = (char *) STRINGBUF_CHARS (buf);
266 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
267 (scm_t_bits)0, (scm_t_bits) len);
268 return res;
269 }
270
271 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
272 characters. CHARSP, if not NULL, will be set to location of the
273 character array. */
274 SCM
275 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
276 {
277 SCM buf = make_wide_stringbuf (len);
278 SCM res;
279 if (charsp)
280 *charsp = STRINGBUF_WIDE_CHARS (buf);
281 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
282 (scm_t_bits) 0, (scm_t_bits) len);
283 return res;
284 }
285
286 static void
287 validate_substring_args (SCM str, size_t start, size_t end)
288 {
289 if (!IS_STRING (str))
290 scm_wrong_type_arg_msg (NULL, 0, str, "string");
291 if (start > STRING_LENGTH (str))
292 scm_out_of_range (NULL, scm_from_size_t (start));
293 if (end > STRING_LENGTH (str) || end < start)
294 scm_out_of_range (NULL, scm_from_size_t (end));
295 }
296
297 static inline void
298 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
299 {
300 *start = STRING_START (*str);
301 if (IS_SH_STRING (*str))
302 {
303 *str = SH_STRING_STRING (*str);
304 *start += STRING_START (*str);
305 }
306 *buf = STRING_STRINGBUF (*str);
307 }
308
309 SCM
310 scm_i_substring (SCM str, size_t start, size_t end)
311 {
312 SCM buf;
313 size_t str_start;
314 get_str_buf_start (&str, &buf, &str_start);
315 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
316 SET_STRINGBUF_SHARED (buf);
317 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
318 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
319 (scm_t_bits)str_start + start,
320 (scm_t_bits) end - start);
321 }
322
323 SCM
324 scm_i_substring_read_only (SCM str, size_t start, size_t end)
325 {
326 SCM buf;
327 size_t str_start;
328 get_str_buf_start (&str, &buf, &str_start);
329 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
330 SET_STRINGBUF_SHARED (buf);
331 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
332 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
333 (scm_t_bits)str_start + start,
334 (scm_t_bits) end - start);
335 }
336
337 SCM
338 scm_i_substring_copy (SCM str, size_t start, size_t end)
339 {
340 size_t len = end - start;
341 SCM buf, my_buf;
342 size_t str_start;
343 get_str_buf_start (&str, &buf, &str_start);
344 if (scm_i_is_narrow_string (str))
345 {
346 my_buf = make_stringbuf (len);
347 memcpy (STRINGBUF_CHARS (my_buf),
348 STRINGBUF_CHARS (buf) + str_start + start, len);
349 }
350 else
351 {
352 my_buf = make_wide_stringbuf (len);
353 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
354 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
355 + start), len);
356 /* Even though this string is wide, the substring may be narrow.
357 Consider adding code to narrow the string. */
358 }
359 scm_remember_upto_here_1 (buf);
360 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
361 (scm_t_bits) 0, (scm_t_bits) len);
362 }
363
364 SCM
365 scm_i_substring_shared (SCM str, size_t start, size_t end)
366 {
367 if (start == 0 && end == STRING_LENGTH (str))
368 return str;
369 else
370 {
371 size_t len = end - start;
372 if (IS_SH_STRING (str))
373 {
374 start += STRING_START (str);
375 str = SH_STRING_STRING (str);
376 }
377 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
378 (scm_t_bits)start, (scm_t_bits) len);
379 }
380 }
381
382 SCM
383 scm_c_substring (SCM str, size_t start, size_t end)
384 {
385 validate_substring_args (str, start, end);
386 return scm_i_substring (str, start, end);
387 }
388
389 SCM
390 scm_c_substring_read_only (SCM str, size_t start, size_t end)
391 {
392 validate_substring_args (str, start, end);
393 return scm_i_substring_read_only (str, start, end);
394 }
395
396 SCM
397 scm_c_substring_copy (SCM str, size_t start, size_t end)
398 {
399 validate_substring_args (str, start, end);
400 return scm_i_substring_copy (str, start, end);
401 }
402
403 SCM
404 scm_c_substring_shared (SCM str, size_t start, size_t end)
405 {
406 validate_substring_args (str, start, end);
407 return scm_i_substring_shared (str, start, end);
408 }
409
410 \f
411 /* Internal accessors
412 */
413
414 /* Returns the number of characters in STR. This may be different
415 than the memory size of the string storage. */
416 size_t
417 scm_i_string_length (SCM str)
418 {
419 return STRING_LENGTH (str);
420 }
421
422 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
423 encoding. False if it is 'wide', having a 32-bit UCS-4
424 encoding. */
425 int
426 scm_i_is_narrow_string (SCM str)
427 {
428 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
429 }
430
431 /* Try to coerce a string to be narrow. It if is narrow already, do
432 nothing. If it is wide, shrink it to narrow if none of its
433 characters are above 0xFF. Return true if the string is narrow or
434 was made to be narrow. */
435 int
436 scm_i_try_narrow_string (SCM str)
437 {
438 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
439
440 return scm_i_is_narrow_string (str);
441 }
442
443 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
444 STR. */
445 const char *
446 scm_i_string_chars (SCM str)
447 {
448 SCM buf;
449 size_t start;
450 get_str_buf_start (&str, &buf, &start);
451 if (scm_i_is_narrow_string (str))
452 return (const char *) STRINGBUF_CHARS (buf) + start;
453 else
454 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
455 scm_list_1 (str));
456 return NULL;
457 }
458
459 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
460 STR. */
461 const scm_t_wchar *
462 scm_i_string_wide_chars (SCM str)
463 {
464 SCM buf;
465 size_t start;
466
467 get_str_buf_start (&str, &buf, &start);
468 if (!scm_i_is_narrow_string (str))
469 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
470 else
471 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
472 scm_list_1 (str));
473 }
474
475 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
476 a new string buffer, so that it can be modified without modifying
477 other strings. Also, lock the string mutex. Later, one must call
478 scm_i_string_stop_writing to unlock the mutex. */
479 SCM
480 scm_i_string_start_writing (SCM orig_str)
481 {
482 SCM buf, str = orig_str;
483 size_t start;
484
485 get_str_buf_start (&str, &buf, &start);
486 if (IS_RO_STRING (str))
487 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
488
489 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
490 if (STRINGBUF_SHARED (buf))
491 {
492 /* Clone the stringbuf. */
493 size_t len = STRING_LENGTH (str);
494 SCM new_buf;
495
496 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
497
498 if (scm_i_is_narrow_string (str))
499 {
500 new_buf = make_stringbuf (len);
501 memcpy (STRINGBUF_CHARS (new_buf),
502 STRINGBUF_CHARS (buf) + STRING_START (str), len);
503
504 }
505 else
506 {
507 new_buf = make_wide_stringbuf (len);
508 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
509 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
510 + STRING_START (str)), len);
511 }
512
513 SET_STRING_STRINGBUF (str, new_buf);
514 start -= STRING_START (str);
515
516 /* FIXME: The following operations are not atomic, so other threads
517 looking at STR may see an inconsistent state. Nevertheless it can't
518 hurt much since (i) accessing STR while it is being mutated can't
519 yield a crash, and (ii) concurrent accesses to STR should be
520 protected by a mutex at the application level. The latter may not
521 apply when STR != ORIG_STR, though. */
522 SET_STRING_START (str, 0);
523 SET_STRING_STRINGBUF (str, new_buf);
524
525 buf = new_buf;
526
527 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
528 }
529 return orig_str;
530 }
531
532 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
533 char *
534 scm_i_string_writable_chars (SCM str)
535 {
536 SCM buf;
537 size_t start;
538
539 get_str_buf_start (&str, &buf, &start);
540 if (scm_i_is_narrow_string (str))
541 return (char *) STRINGBUF_CHARS (buf) + start;
542 else
543 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
544 scm_list_1 (str));
545 return NULL;
546 }
547
548 /* Return a pointer to the UCS-4 codepoints of a string. */
549 static scm_t_wchar *
550 scm_i_string_writable_wide_chars (SCM str)
551 {
552 SCM buf;
553 size_t start;
554
555 get_str_buf_start (&str, &buf, &start);
556 if (!scm_i_is_narrow_string (str))
557 return STRINGBUF_WIDE_CHARS (buf) + start;
558 else
559 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
560 scm_list_1 (str));
561 }
562
563 /* Unlock the string mutex that was locked when
564 scm_i_string_start_writing was called. */
565 void
566 scm_i_string_stop_writing (void)
567 {
568 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
569 }
570
571 /* Return the Xth character of STR as a UCS-4 codepoint. */
572 scm_t_wchar
573 scm_i_string_ref (SCM str, size_t x)
574 {
575 if (scm_i_is_narrow_string (str))
576 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
577 else
578 return scm_i_string_wide_chars (str)[x];
579 }
580
581 /* Returns index+1 of the first char in STR that matches C, or
582 0 if the char is not found. */
583 int
584 scm_i_string_contains_char (SCM str, char ch)
585 {
586 size_t i;
587 size_t len = scm_i_string_length (str);
588
589 i = 0;
590 if (scm_i_is_narrow_string (str))
591 {
592 while (i < len)
593 {
594 if (scm_i_string_chars (str)[i] == ch)
595 return i+1;
596 i++;
597 }
598 }
599 else
600 {
601 while (i < len)
602 {
603 if (scm_i_string_wide_chars (str)[i]
604 == (unsigned char) ch)
605 return i+1;
606 i++;
607 }
608 }
609 return 0;
610 }
611
612 int
613 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
614 {
615 if (scm_i_is_narrow_string (sstr))
616 {
617 const char *a = scm_i_string_chars (sstr) + start_x;
618 const char *b = cstr;
619 return strncmp (a, b, strlen(b));
620 }
621 else
622 {
623 size_t i;
624 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
625 const char *b = cstr;
626 for (i = 0; i < strlen (b); i++)
627 {
628 if (a[i] != (unsigned char) b[i])
629 return 1;
630 }
631 }
632 return 0;
633 }
634
635 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
636 void
637 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
638 {
639 if (chr > 0xFF && scm_i_is_narrow_string (str))
640 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
641
642 if (scm_i_is_narrow_string (str))
643 {
644 char *dst = scm_i_string_writable_chars (str);
645 dst[p] = chr;
646 }
647 else
648 {
649 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
650 dst[p] = chr;
651 }
652 }
653
654 \f
655 /* Symbols.
656
657 Basic symbol creation and accessing is done here, the rest is in
658 symbols.[hc]. This has been done to keep stringbufs and the
659 internals of strings and string-like objects confined to this file.
660 */
661
662 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
663
664 SCM
665 scm_i_make_symbol (SCM name, scm_t_bits flags,
666 unsigned long hash, SCM props)
667 {
668 SCM buf;
669 size_t start = STRING_START (name);
670 size_t length = STRING_LENGTH (name);
671
672 if (IS_SH_STRING (name))
673 {
674 name = SH_STRING_STRING (name);
675 start += STRING_START (name);
676 }
677 buf = SYMBOL_STRINGBUF (name);
678
679 if (start == 0 && length == STRINGBUF_LENGTH (buf))
680 {
681 /* reuse buf. */
682 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
683 SET_STRINGBUF_SHARED (buf);
684 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
685 }
686 else
687 {
688 /* make new buf. */
689 if (scm_i_is_narrow_string (name))
690 {
691 SCM new_buf = make_stringbuf (length);
692 memcpy (STRINGBUF_CHARS (new_buf),
693 STRINGBUF_CHARS (buf) + start, length);
694 buf = new_buf;
695 }
696 else
697 {
698 SCM new_buf = make_wide_stringbuf (length);
699 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
700 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
701 length);
702 buf = new_buf;
703 }
704 }
705 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
706 (scm_t_bits) hash, SCM_UNPACK (props));
707 }
708
709 SCM
710 scm_i_c_make_symbol (const char *name, size_t len,
711 scm_t_bits flags, unsigned long hash, SCM props)
712 {
713 SCM buf = make_stringbuf (len);
714 memcpy (STRINGBUF_CHARS (buf), name, len);
715
716 return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
717 (scm_t_bits) hash, SCM_UNPACK (props));
718 }
719
720 /* Returns the number of characters in SYM. This may be different
721 from the memory size of SYM. */
722 size_t
723 scm_i_symbol_length (SCM sym)
724 {
725 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
726 }
727
728 size_t
729 scm_c_symbol_length (SCM sym)
730 #define FUNC_NAME "scm_c_symbol_length"
731 {
732 SCM_VALIDATE_SYMBOL (1, sym);
733
734 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
735 }
736 #undef FUNC_NAME
737
738 /* True if the name of SYM is stored as a Latin-1 encoded string.
739 False if it is stored as a 32-bit UCS-4-encoded string. */
740 int
741 scm_i_is_narrow_symbol (SCM sym)
742 {
743 SCM buf;
744
745 buf = SYMBOL_STRINGBUF (sym);
746 return !STRINGBUF_WIDE (buf);
747 }
748
749 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
750 contains the name of SYM. */
751 const char *
752 scm_i_symbol_chars (SCM sym)
753 {
754 SCM buf;
755
756 buf = SYMBOL_STRINGBUF (sym);
757 if (!STRINGBUF_WIDE (buf))
758 return (const char *) STRINGBUF_CHARS (buf);
759 else
760 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
761 scm_list_1 (sym));
762 }
763
764 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
765 symbol's name. */
766 const scm_t_wchar *
767 scm_i_symbol_wide_chars (SCM sym)
768 {
769 SCM buf;
770
771 buf = SYMBOL_STRINGBUF (sym);
772 if (STRINGBUF_WIDE (buf))
773 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
774 else
775 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
776 scm_list_1 (sym));
777 }
778
779 SCM
780 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
781 {
782 SCM buf = SYMBOL_STRINGBUF (sym);
783 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
784 SET_STRINGBUF_SHARED (buf);
785 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
786 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
787 (scm_t_bits)start, (scm_t_bits) end - start);
788 }
789
790 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
791 scm_t_wchar
792 scm_i_symbol_ref (SCM sym, size_t x)
793 {
794 if (scm_i_is_narrow_symbol (sym))
795 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
796 else
797 return scm_i_symbol_wide_chars (sym)[x];
798 }
799
800 /* Debugging
801 */
802
803 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
804 "Returns an association list containing debugging information\n"
805 "for @var{str}. The association list has the following entries."
806 "@table @code\n"
807 "@item string\n"
808 "The string itself.\n"
809 "@item start\n"
810 "The start index of the string into its stringbuf\n"
811 "@item length\n"
812 "The length of the string\n"
813 "@item shared\n"
814 "If this string is a substring, it returns its parent string.\n"
815 "Otherwise, it returns @code{#f}\n"
816 "@item read-only\n"
817 "@code{#t} if the string is read-only\n"
818 "@item stringbuf-chars\n"
819 "A new string containing this string's stringbuf's characters\n"
820 "@item stringbuf-length\n"
821 "The number of characters in this stringbuf\n"
822 "@item stringbuf-shared\n"
823 "@code{#t} if this stringbuf is shared\n"
824 "@item stringbuf-wide\n"
825 "@code{#t} if this stringbuf's characters are stored in a\n"
826 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
827 "buffer\n"
828 "@end table")
829 #define FUNC_NAME s_scm_sys_string_dump
830 {
831 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
832 SCM buf;
833 SCM_VALIDATE_STRING (1, str);
834
835 /* String info */
836 e1 = scm_cons (scm_from_locale_symbol ("string"),
837 str);
838 e2 = scm_cons (scm_from_locale_symbol ("start"),
839 scm_from_size_t (STRING_START (str)));
840 e3 = scm_cons (scm_from_locale_symbol ("length"),
841 scm_from_size_t (STRING_LENGTH (str)));
842
843 if (IS_SH_STRING (str))
844 {
845 e4 = scm_cons (scm_from_locale_symbol ("shared"),
846 SH_STRING_STRING (str));
847 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
848 }
849 else
850 {
851 e4 = scm_cons (scm_from_locale_symbol ("shared"),
852 SCM_BOOL_F);
853 buf = STRING_STRINGBUF (str);
854 }
855
856 if (IS_RO_STRING (str))
857 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
858 SCM_BOOL_T);
859 else
860 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
861 SCM_BOOL_F);
862
863 /* Stringbuf info */
864 if (!STRINGBUF_WIDE (buf))
865 {
866 size_t len = STRINGBUF_LENGTH (buf);
867 char *cbuf;
868 SCM sbc = scm_i_make_string (len, &cbuf);
869 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
870 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
871 sbc);
872 }
873 else
874 {
875 size_t len = STRINGBUF_LENGTH (buf);
876 scm_t_wchar *cbuf;
877 SCM sbc = scm_i_make_wide_string (len, &cbuf);
878 u32_cpy ((scm_t_uint32 *) cbuf,
879 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
880 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
881 sbc);
882 }
883 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
884 scm_from_size_t (STRINGBUF_LENGTH (buf)));
885 if (STRINGBUF_SHARED (buf))
886 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
887 SCM_BOOL_T);
888 else
889 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
890 SCM_BOOL_F);
891 if (STRINGBUF_WIDE (buf))
892 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
893 SCM_BOOL_T);
894 else
895 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
896 SCM_BOOL_F);
897
898 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
899 }
900 #undef FUNC_NAME
901
902 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
903 "Returns an association list containing debugging information\n"
904 "for @var{sym}. The association list has the following entries."
905 "@table @code\n"
906 "@item symbol\n"
907 "The symbol itself\n"
908 "@item hash\n"
909 "Its hash value\n"
910 "@item interned\n"
911 "@code{#t} if it is an interned symbol\n"
912 "@item stringbuf-chars\n"
913 "A new string containing this symbols's stringbuf's characters\n"
914 "@item stringbuf-length\n"
915 "The number of characters in this stringbuf\n"
916 "@item stringbuf-shared\n"
917 "@code{#t} if this stringbuf is shared\n"
918 "@item stringbuf-wide\n"
919 "@code{#t} if this stringbuf's characters are stored in a\n"
920 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
921 "buffer\n"
922 "@end table")
923 #define FUNC_NAME s_scm_sys_symbol_dump
924 {
925 SCM e1, e2, e3, e4, e5, e6, e7;
926 SCM buf;
927 SCM_VALIDATE_SYMBOL (1, sym);
928 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
929 sym);
930 e2 = scm_cons (scm_from_locale_symbol ("hash"),
931 scm_from_ulong (scm_i_symbol_hash (sym)));
932 e3 = scm_cons (scm_from_locale_symbol ("interned"),
933 scm_symbol_interned_p (sym));
934 buf = SYMBOL_STRINGBUF (sym);
935
936 /* Stringbuf info */
937 if (!STRINGBUF_WIDE (buf))
938 {
939 size_t len = STRINGBUF_LENGTH (buf);
940 char *cbuf;
941 SCM sbc = scm_i_make_string (len, &cbuf);
942 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
943 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
944 sbc);
945 }
946 else
947 {
948 size_t len = STRINGBUF_LENGTH (buf);
949 scm_t_wchar *cbuf;
950 SCM sbc = scm_i_make_wide_string (len, &cbuf);
951 u32_cpy ((scm_t_uint32 *) cbuf,
952 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
953 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
954 sbc);
955 }
956 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
957 scm_from_size_t (STRINGBUF_LENGTH (buf)));
958 if (STRINGBUF_SHARED (buf))
959 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
960 SCM_BOOL_T);
961 else
962 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
963 SCM_BOOL_F);
964 if (STRINGBUF_WIDE (buf))
965 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
966 SCM_BOOL_T);
967 else
968 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
969 SCM_BOOL_F);
970 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
971
972 }
973 #undef FUNC_NAME
974
975 #if SCM_STRING_LENGTH_HISTOGRAM
976
977 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
978 #define FUNC_NAME s_scm_sys_stringbuf_hist
979 {
980 int i;
981 for (i = 0; i < 1000; i++)
982 if (lenhist[i])
983 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
984 fprintf (stderr, ">999: %u\n", lenhist[1000]);
985 return SCM_UNSPECIFIED;
986 }
987 #undef FUNC_NAME
988
989 #endif
990
991 \f
992
993 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
994 (SCM obj),
995 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
996 #define FUNC_NAME s_scm_string_p
997 {
998 return scm_from_bool (IS_STRING (obj));
999 }
1000 #undef FUNC_NAME
1001
1002
1003 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1004
1005 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1006 (SCM chrs),
1007 "@deffnx {Scheme Procedure} list->string chrs\n"
1008 "Return a newly allocated string composed of the arguments,\n"
1009 "@var{chrs}.")
1010 #define FUNC_NAME s_scm_string
1011 {
1012 SCM result = SCM_BOOL_F;
1013 SCM rest;
1014 size_t len;
1015 size_t p = 0;
1016 long i;
1017 int wide = 0;
1018
1019 /* Verify that this is a list of chars. */
1020 i = scm_ilength (chrs);
1021 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1022
1023 len = (size_t) i;
1024 rest = chrs;
1025
1026 while (len > 0 && scm_is_pair (rest))
1027 {
1028 SCM elt = SCM_CAR (rest);
1029 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1030 if (SCM_CHAR (elt) > 0xFF)
1031 wide = 1;
1032 rest = SCM_CDR (rest);
1033 len--;
1034 scm_remember_upto_here_1 (elt);
1035 }
1036
1037 /* Construct a string containing this list of chars. */
1038 len = (size_t) i;
1039 rest = chrs;
1040
1041 if (wide == 0)
1042 {
1043 result = scm_i_make_string (len, NULL);
1044 result = scm_i_string_start_writing (result);
1045 char *buf = scm_i_string_writable_chars (result);
1046 while (len > 0 && scm_is_pair (rest))
1047 {
1048 SCM elt = SCM_CAR (rest);
1049 buf[p] = (unsigned char) SCM_CHAR (elt);
1050 p++;
1051 rest = SCM_CDR (rest);
1052 len--;
1053 scm_remember_upto_here_1 (elt);
1054 }
1055 }
1056 else
1057 {
1058 result = scm_i_make_wide_string (len, NULL);
1059 result = scm_i_string_start_writing (result);
1060 scm_t_wchar *buf = scm_i_string_writable_wide_chars (result);
1061 while (len > 0 && scm_is_pair (rest))
1062 {
1063 SCM elt = SCM_CAR (rest);
1064 buf[p] = SCM_CHAR (elt);
1065 p++;
1066 rest = SCM_CDR (rest);
1067 len--;
1068 scm_remember_upto_here_1 (elt);
1069 }
1070 }
1071 scm_i_string_stop_writing ();
1072
1073 if (len > 0)
1074 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1075 if (!scm_is_null (rest))
1076 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1077
1078 return result;
1079 }
1080 #undef FUNC_NAME
1081
1082 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1083 (SCM k, SCM chr),
1084 "Return a newly allocated string of\n"
1085 "length @var{k}. If @var{chr} is given, then all elements of\n"
1086 "the string are initialized to @var{chr}, otherwise the contents\n"
1087 "of the @var{string} are unspecified.")
1088 #define FUNC_NAME s_scm_make_string
1089 {
1090 return scm_c_make_string (scm_to_size_t (k), chr);
1091 }
1092 #undef FUNC_NAME
1093
1094 SCM
1095 scm_c_make_string (size_t len, SCM chr)
1096 #define FUNC_NAME NULL
1097 {
1098 size_t p;
1099 SCM res = scm_i_make_string (len, NULL);
1100
1101 if (!SCM_UNBNDP (chr))
1102 {
1103 SCM_VALIDATE_CHAR (0, chr);
1104 res = scm_i_string_start_writing (res);
1105 for (p = 0; p < len; p++)
1106 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1107 scm_i_string_stop_writing ();
1108 }
1109
1110 return res;
1111 }
1112 #undef FUNC_NAME
1113
1114 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1115 (SCM string),
1116 "Return the number of characters in @var{string}.")
1117 #define FUNC_NAME s_scm_string_length
1118 {
1119 SCM_VALIDATE_STRING (1, string);
1120 return scm_from_size_t (STRING_LENGTH (string));
1121 }
1122 #undef FUNC_NAME
1123
1124 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1125 (SCM string),
1126 "Return the bytes used to represent a character in @var{string}."
1127 "This will return 1 or 4.")
1128 #define FUNC_NAME s_scm_string_bytes_per_char
1129 {
1130 SCM_VALIDATE_STRING (1, string);
1131 if (!scm_i_is_narrow_string (string))
1132 return scm_from_int (4);
1133
1134 return scm_from_int (1);
1135 }
1136 #undef FUNC_NAME
1137
1138 size_t
1139 scm_c_string_length (SCM string)
1140 {
1141 if (!IS_STRING (string))
1142 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1143 return STRING_LENGTH (string);
1144 }
1145
1146 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1147 (SCM str, SCM k),
1148 "Return character @var{k} of @var{str} using zero-origin\n"
1149 "indexing. @var{k} must be a valid index of @var{str}.")
1150 #define FUNC_NAME s_scm_string_ref
1151 {
1152 size_t len;
1153 unsigned long idx;
1154
1155 SCM_VALIDATE_STRING (1, str);
1156
1157 len = scm_i_string_length (str);
1158 if (SCM_LIKELY (len > 0))
1159 idx = scm_to_unsigned_integer (k, 0, len - 1);
1160 else
1161 scm_out_of_range (NULL, k);
1162
1163 if (scm_i_is_narrow_string (str))
1164 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1165 else
1166 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1167 }
1168 #undef FUNC_NAME
1169
1170 SCM
1171 scm_c_string_ref (SCM str, size_t p)
1172 {
1173 if (p >= scm_i_string_length (str))
1174 scm_out_of_range (NULL, scm_from_size_t (p));
1175 if (scm_i_is_narrow_string (str))
1176 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1177 else
1178 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1179
1180 }
1181
1182 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1183 (SCM str, SCM k, SCM chr),
1184 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1185 "an unspecified value. @var{k} must be a valid index of\n"
1186 "@var{str}.")
1187 #define FUNC_NAME s_scm_string_set_x
1188 {
1189 size_t len;
1190 unsigned long idx;
1191
1192 SCM_VALIDATE_STRING (1, str);
1193
1194 len = scm_i_string_length (str);
1195 if (SCM_LIKELY (len > 0))
1196 idx = scm_to_unsigned_integer (k, 0, len - 1);
1197 else
1198 scm_out_of_range (NULL, k);
1199
1200 SCM_VALIDATE_CHAR (3, chr);
1201 str = scm_i_string_start_writing (str);
1202 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1203 scm_i_string_stop_writing ();
1204
1205 return SCM_UNSPECIFIED;
1206 }
1207 #undef FUNC_NAME
1208
1209 void
1210 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1211 {
1212 if (p >= scm_i_string_length (str))
1213 scm_out_of_range (NULL, scm_from_size_t (p));
1214 str = scm_i_string_start_writing (str);
1215 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1216 scm_i_string_stop_writing ();
1217 }
1218
1219 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1220 (SCM str, SCM start, SCM end),
1221 "Return a newly allocated string formed from the characters\n"
1222 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1223 "ending with index @var{end} (exclusive).\n"
1224 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1225 "exact integers satisfying:\n\n"
1226 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1227 #define FUNC_NAME s_scm_substring
1228 {
1229 size_t len, from, to;
1230
1231 SCM_VALIDATE_STRING (1, str);
1232 len = scm_i_string_length (str);
1233 from = scm_to_unsigned_integer (start, 0, len);
1234 if (SCM_UNBNDP (end))
1235 to = len;
1236 else
1237 to = scm_to_unsigned_integer (end, from, len);
1238 return scm_i_substring (str, from, to);
1239 }
1240 #undef FUNC_NAME
1241
1242 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1243 (SCM str, SCM start, SCM end),
1244 "Return a newly allocated string formed from the characters\n"
1245 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1246 "ending with index @var{end} (exclusive).\n"
1247 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1248 "exact integers satisfying:\n"
1249 "\n"
1250 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1251 "\n"
1252 "The returned string is read-only.\n")
1253 #define FUNC_NAME s_scm_substring_read_only
1254 {
1255 size_t len, from, to;
1256
1257 SCM_VALIDATE_STRING (1, str);
1258 len = scm_i_string_length (str);
1259 from = scm_to_unsigned_integer (start, 0, len);
1260 if (SCM_UNBNDP (end))
1261 to = len;
1262 else
1263 to = scm_to_unsigned_integer (end, from, len);
1264 return scm_i_substring_read_only (str, from, to);
1265 }
1266 #undef FUNC_NAME
1267
1268 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1269 (SCM str, SCM start, SCM end),
1270 "Return a newly allocated string formed from the characters\n"
1271 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1272 "ending with index @var{end} (exclusive).\n"
1273 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1274 "exact integers satisfying:\n\n"
1275 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1276 #define FUNC_NAME s_scm_substring_copy
1277 {
1278 /* For the Scheme version, START is mandatory, but for the C
1279 version, it is optional. See scm_string_copy in srfi-13.c for a
1280 rationale.
1281 */
1282
1283 size_t from, to;
1284
1285 SCM_VALIDATE_STRING (1, str);
1286 scm_i_get_substring_spec (scm_i_string_length (str),
1287 start, &from, end, &to);
1288 return scm_i_substring_copy (str, from, to);
1289 }
1290 #undef FUNC_NAME
1291
1292 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1293 (SCM str, SCM start, SCM end),
1294 "Return string that indirectly refers to the characters\n"
1295 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1296 "ending with index @var{end} (exclusive).\n"
1297 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1298 "exact integers satisfying:\n\n"
1299 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1300 #define FUNC_NAME s_scm_substring_shared
1301 {
1302 size_t len, from, to;
1303
1304 SCM_VALIDATE_STRING (1, str);
1305 len = scm_i_string_length (str);
1306 from = scm_to_unsigned_integer (start, 0, len);
1307 if (SCM_UNBNDP (end))
1308 to = len;
1309 else
1310 to = scm_to_unsigned_integer (end, from, len);
1311 return scm_i_substring_shared (str, from, to);
1312 }
1313 #undef FUNC_NAME
1314
1315 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1316 (SCM args),
1317 "Return a newly allocated string whose characters form the\n"
1318 "concatenation of the given strings, @var{args}.")
1319 #define FUNC_NAME s_scm_string_append
1320 {
1321 SCM res;
1322 size_t len = 0;
1323 int wide = 0;
1324 SCM l, s;
1325 size_t i;
1326 union
1327 {
1328 char *narrow;
1329 scm_t_wchar *wide;
1330 } data;
1331
1332 SCM_VALIDATE_REST_ARGUMENT (args);
1333 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1334 {
1335 s = SCM_CAR (l);
1336 SCM_VALIDATE_STRING (SCM_ARGn, s);
1337 len += scm_i_string_length (s);
1338 if (!scm_i_is_narrow_string (s))
1339 wide = 1;
1340 }
1341 data.narrow = NULL;
1342 if (!wide)
1343 res = scm_i_make_string (len, &data.narrow);
1344 else
1345 res = scm_i_make_wide_string (len, &data.wide);
1346
1347 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1348 {
1349 size_t len;
1350 s = SCM_CAR (l);
1351 SCM_VALIDATE_STRING (SCM_ARGn, s);
1352 len = scm_i_string_length (s);
1353 if (!wide)
1354 {
1355 memcpy (data.narrow, scm_i_string_chars (s), len);
1356 data.narrow += len;
1357 }
1358 else
1359 {
1360 if (scm_i_is_narrow_string (s))
1361 {
1362 for (i = 0; i < scm_i_string_length (s); i++)
1363 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1364 }
1365 else
1366 u32_cpy ((scm_t_uint32 *) data.wide,
1367 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1368 data.wide += len;
1369 }
1370 scm_remember_upto_here_1 (s);
1371 }
1372 return res;
1373 }
1374 #undef FUNC_NAME
1375
1376 int
1377 scm_is_string (SCM obj)
1378 {
1379 return IS_STRING (obj);
1380 }
1381
1382 SCM
1383 scm_from_stringn (const char *str, size_t len, const char *encoding,
1384 scm_t_string_failed_conversion_handler handler)
1385 {
1386 size_t u32len, i;
1387 scm_t_wchar *u32;
1388 int wide = 0;
1389 SCM res;
1390
1391 if (len == 0)
1392 return scm_nullstr;
1393
1394 if (encoding == NULL)
1395 {
1396 /* If encoding is null, use Latin-1. */
1397 char *buf;
1398 res = scm_i_make_string (len, &buf);
1399 memcpy (buf, str, len);
1400 return res;
1401 }
1402
1403 u32len = 0;
1404 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1405 (enum iconv_ilseq_handler)
1406 handler,
1407 str, len,
1408 NULL,
1409 NULL, &u32len);
1410
1411 if (u32 == NULL)
1412 {
1413 if (errno == ENOMEM)
1414 scm_memory_error ("locale string conversion");
1415 else
1416 {
1417 /* There are invalid sequences in the input string. */
1418 SCM errstr;
1419 char *dst;
1420 errstr = scm_i_make_string (len, &dst);
1421 memcpy (dst, str, len);
1422 scm_misc_error (NULL, "input locale conversion error from ~s: ~s",
1423 scm_list_2 (scm_from_locale_string (encoding),
1424 errstr));
1425 scm_remember_upto_here_1 (errstr);
1426 }
1427 }
1428
1429 i = 0;
1430 while (i < u32len)
1431 if (u32[i++] > 0xFF)
1432 {
1433 wide = 1;
1434 break;
1435 }
1436
1437 if (!wide)
1438 {
1439 char *dst;
1440 res = scm_i_make_string (u32len, &dst);
1441 for (i = 0; i < u32len; i ++)
1442 dst[i] = (unsigned char) u32[i];
1443 dst[u32len] = '\0';
1444 }
1445 else
1446 {
1447 scm_t_wchar *wdst;
1448 res = scm_i_make_wide_string (u32len, &wdst);
1449 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1450 wdst[u32len] = 0;
1451 }
1452
1453 free (u32);
1454 return res;
1455 }
1456
1457 SCM
1458 scm_from_locale_stringn (const char *str, size_t len)
1459 {
1460 const char *enc;
1461 scm_t_string_failed_conversion_handler hndl;
1462 SCM inport;
1463 scm_t_port *pt;
1464
1465 if (len == (size_t) -1)
1466 len = strlen (str);
1467 if (len == 0)
1468 return scm_nullstr;
1469
1470 inport = scm_current_input_port ();
1471 if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
1472 {
1473 pt = SCM_PTAB_ENTRY (inport);
1474 enc = pt->encoding;
1475 hndl = pt->ilseq_handler;
1476 }
1477 else
1478 {
1479 enc = NULL;
1480 hndl = SCM_FAILED_CONVERSION_ERROR;
1481 }
1482
1483 return scm_from_stringn (str, len, enc, hndl);
1484 }
1485
1486 SCM
1487 scm_from_locale_string (const char *str)
1488 {
1489 if (str == NULL)
1490 return scm_nullstr;
1491
1492 return scm_from_locale_stringn (str, -1);
1493 }
1494
1495 SCM
1496 scm_i_from_utf8_string (const scm_t_uint8 *str)
1497 {
1498 return scm_from_stringn ((const char *) str,
1499 strlen ((char *) str), "UTF-8",
1500 SCM_FAILED_CONVERSION_ERROR);
1501 }
1502
1503 /* Create a new scheme string from the C string STR. The memory of
1504 STR may be used directly as storage for the new string. */
1505 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1506 would be to register a finalizer to eventually free(3) STR, which isn't
1507 worth it. Should we just deprecate the `scm_take_' functions? */
1508 SCM
1509 scm_take_locale_stringn (char *str, size_t len)
1510 {
1511 SCM res;
1512
1513 res = scm_from_locale_stringn (str, len);
1514 free (str);
1515
1516 return res;
1517 }
1518
1519 SCM
1520 scm_take_locale_string (char *str)
1521 {
1522 return scm_take_locale_stringn (str, -1);
1523 }
1524
1525 /* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1526 and \UXXXXXX. */
1527 static void
1528 unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1529 {
1530 char *before, *after;
1531 size_t i, j;
1532
1533 before = *bufp;
1534 after = *bufp;
1535 i = 0;
1536 j = 0;
1537 while (i < *lenp)
1538 {
1539 if ((i <= *lenp - 6)
1540 && before[i] == '\\'
1541 && before[i + 1] == 'u'
1542 && before[i + 2] == '0' && before[i + 3] == '0')
1543 {
1544 /* Convert \u00NN to \xNN */
1545 after[j] = '\\';
1546 after[j + 1] = 'x';
1547 after[j + 2] = tolower ((int) before[i + 4]);
1548 after[j + 3] = tolower ((int) before[i + 5]);
1549 i += 6;
1550 j += 4;
1551 }
1552 else if ((i <= *lenp - 10)
1553 && before[i] == '\\'
1554 && before[i + 1] == 'U'
1555 && before[i + 2] == '0' && before[i + 3] == '0')
1556 {
1557 /* Convert \U00NNNNNN to \UNNNNNN */
1558 after[j] = '\\';
1559 after[j + 1] = 'U';
1560 after[j + 2] = tolower ((int) before[i + 4]);
1561 after[j + 3] = tolower ((int) before[i + 5]);
1562 after[j + 4] = tolower ((int) before[i + 6]);
1563 after[j + 5] = tolower ((int) before[i + 7]);
1564 after[j + 6] = tolower ((int) before[i + 8]);
1565 after[j + 7] = tolower ((int) before[i + 9]);
1566 i += 10;
1567 j += 8;
1568 }
1569 else
1570 {
1571 after[j] = before[i];
1572 i++;
1573 j++;
1574 }
1575 }
1576 *lenp = j;
1577 after = scm_realloc (after, j);
1578 }
1579
1580 char *
1581 scm_to_locale_stringn (SCM str, size_t *lenp)
1582 {
1583 SCM outport;
1584 scm_t_port *pt;
1585 const char *enc;
1586
1587 outport = scm_current_output_port ();
1588 if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
1589 {
1590 pt = SCM_PTAB_ENTRY (outport);
1591 enc = pt->encoding;
1592 }
1593 else
1594 enc = NULL;
1595
1596 return scm_to_stringn (str, lenp,
1597 enc,
1598 scm_i_get_conversion_strategy (SCM_BOOL_F));
1599 }
1600
1601 /* Low-level scheme to C string conversion function. */
1602 char *
1603 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1604 scm_t_string_failed_conversion_handler handler)
1605 {
1606 char *buf;
1607 size_t ilen, len, i;
1608 int ret;
1609 const char *enc;
1610
1611 if (!scm_is_string (str))
1612 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1613 ilen = scm_i_string_length (str);
1614
1615 if (ilen == 0)
1616 {
1617 buf = scm_malloc (1);
1618 buf[0] = '\0';
1619 if (lenp)
1620 *lenp = 0;
1621 return buf;
1622 }
1623
1624 if (lenp == NULL)
1625 for (i = 0; i < ilen; i++)
1626 if (scm_i_string_ref (str, i) == '\0')
1627 scm_misc_error (NULL,
1628 "string contains #\\nul character: ~S",
1629 scm_list_1 (str));
1630
1631 if (scm_i_is_narrow_string (str) && (encoding == NULL))
1632 {
1633 /* If using native Latin-1 encoding, just copy the string
1634 contents. */
1635 if (lenp)
1636 {
1637 buf = scm_malloc (ilen);
1638 memcpy (buf, scm_i_string_chars (str), ilen);
1639 *lenp = ilen;
1640 return buf;
1641 }
1642 else
1643 {
1644 buf = scm_malloc (ilen + 1);
1645 memcpy (buf, scm_i_string_chars (str), ilen);
1646 buf[ilen] = '\0';
1647 return buf;
1648 }
1649 }
1650
1651
1652 buf = NULL;
1653 len = 0;
1654 enc = encoding;
1655 if (enc == NULL)
1656 enc = "ISO-8859-1";
1657 if (scm_i_is_narrow_string (str))
1658 {
1659 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1660 "ISO-8859-1", enc,
1661 (enum iconv_ilseq_handler) handler, NULL,
1662 &buf, &len);
1663
1664 if (ret == 0 && handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1665 unistring_escapes_to_guile_escapes (&buf, &len);
1666
1667 if (ret != 0)
1668 {
1669 scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1670 scm_list_2 (scm_from_locale_string (enc),
1671 str));
1672 }
1673 }
1674 else
1675 {
1676 buf = u32_conv_to_encoding (enc,
1677 (enum iconv_ilseq_handler) handler,
1678 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1679 ilen,
1680 NULL,
1681 NULL, &len);
1682 if (buf == NULL)
1683 {
1684 scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1685 scm_list_2 (scm_from_locale_string (enc),
1686 str));
1687 }
1688 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1689 unistring_escapes_to_guile_escapes (&buf, &len);
1690 }
1691 if (lenp)
1692 *lenp = len;
1693 else
1694 {
1695 buf = scm_realloc (buf, len + 1);
1696 buf[len] = '\0';
1697 }
1698
1699 scm_remember_upto_here_1 (str);
1700 return buf;
1701 }
1702
1703 char *
1704 scm_to_locale_string (SCM str)
1705 {
1706 return scm_to_locale_stringn (str, NULL);
1707 }
1708
1709 scm_t_uint8 *
1710 scm_i_to_utf8_string (SCM str)
1711 {
1712 char *u8str;
1713 u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1714 return (scm_t_uint8 *) u8str;
1715 }
1716
1717 size_t
1718 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1719 {
1720 size_t len;
1721 char *result = NULL;
1722 if (!scm_is_string (str))
1723 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1724 result = scm_to_locale_stringn (str, &len);
1725
1726 memcpy (buf, result, (len > max_len) ? max_len : len);
1727 free (result);
1728
1729 scm_remember_upto_here_1 (str);
1730 return len;
1731 }
1732
1733 /* converts C scm_array of strings to SCM scm_list of strings. */
1734 /* If argc < 0, a null terminated scm_array is assumed. */
1735 SCM
1736 scm_makfromstrs (int argc, char **argv)
1737 {
1738 int i = argc;
1739 SCM lst = SCM_EOL;
1740 if (0 > i)
1741 for (i = 0; argv[i]; i++);
1742 while (i--)
1743 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1744 return lst;
1745 }
1746
1747 /* Return a newly allocated array of char pointers to each of the strings
1748 in args, with a terminating NULL pointer. */
1749
1750 char **
1751 scm_i_allocate_string_pointers (SCM list)
1752 #define FUNC_NAME "scm_i_allocate_string_pointers"
1753 {
1754 char **result;
1755 int len = scm_ilength (list);
1756 int i;
1757
1758 if (len < 0)
1759 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1760
1761 result = scm_gc_malloc ((len + 1) * sizeof (char *),
1762 "string pointers");
1763 result[len] = NULL;
1764
1765 /* The list might be have been modified in another thread, so
1766 we check LIST before each access.
1767 */
1768 for (i = 0; i < len && scm_is_pair (list); i++)
1769 {
1770 SCM str;
1771 size_t len;
1772
1773 str = SCM_CAR (list);
1774 len = scm_c_string_length (str);
1775
1776 result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
1777 memcpy (result[i], scm_i_string_chars (str), len);
1778 result[i][len] = '\0';
1779
1780 list = SCM_CDR (list);
1781 }
1782
1783 return result;
1784 }
1785 #undef FUNC_NAME
1786
1787 void
1788 scm_i_get_substring_spec (size_t len,
1789 SCM start, size_t *cstart,
1790 SCM end, size_t *cend)
1791 {
1792 if (SCM_UNBNDP (start))
1793 *cstart = 0;
1794 else
1795 *cstart = scm_to_unsigned_integer (start, 0, len);
1796
1797 if (SCM_UNBNDP (end))
1798 *cend = len;
1799 else
1800 *cend = scm_to_unsigned_integer (end, *cstart, len);
1801 }
1802
1803 #if SCM_ENABLE_DEPRECATED
1804
1805 /* When these definitions are removed, it becomes reasonable to use
1806 read-only strings for string literals. For that, change the reader
1807 to create string literals with scm_c_substring_read_only instead of
1808 with scm_c_substring_copy.
1809 */
1810
1811 int
1812 scm_i_deprecated_stringp (SCM str)
1813 {
1814 scm_c_issue_deprecation_warning
1815 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
1816
1817 return scm_is_string (str);
1818 }
1819
1820 char *
1821 scm_i_deprecated_string_chars (SCM str)
1822 {
1823 char *chars;
1824
1825 scm_c_issue_deprecation_warning
1826 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
1827
1828 /* We don't accept shared substrings here since they are not
1829 null-terminated.
1830 */
1831 if (IS_SH_STRING (str))
1832 scm_misc_error (NULL,
1833 "SCM_STRING_CHARS does not work with shared substrings.",
1834 SCM_EOL);
1835
1836 /* We explicitly test for read-only strings to produce a better
1837 error message.
1838 */
1839
1840 if (IS_RO_STRING (str))
1841 scm_misc_error (NULL,
1842 "SCM_STRING_CHARS does not work with read-only strings.",
1843 SCM_EOL);
1844
1845 /* The following is still wrong, of course...
1846 */
1847 str = scm_i_string_start_writing (str);
1848 chars = scm_i_string_writable_chars (str);
1849 scm_i_string_stop_writing ();
1850 return chars;
1851 }
1852
1853 size_t
1854 scm_i_deprecated_string_length (SCM str)
1855 {
1856 scm_c_issue_deprecation_warning
1857 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
1858 return scm_c_string_length (str);
1859 }
1860
1861 #endif
1862
1863 static SCM
1864 string_handle_ref (scm_t_array_handle *h, size_t index)
1865 {
1866 return scm_c_string_ref (h->array, index);
1867 }
1868
1869 static void
1870 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
1871 {
1872 scm_c_string_set_x (h->array, index, val);
1873 }
1874
1875 static void
1876 string_get_handle (SCM v, scm_t_array_handle *h)
1877 {
1878 h->array = v;
1879 h->ndims = 1;
1880 h->dims = &h->dim0;
1881 h->dim0.lbnd = 0;
1882 h->dim0.ubnd = scm_c_string_length (v) - 1;
1883 h->dim0.inc = 1;
1884 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
1885 h->elements = h->writable_elements = NULL;
1886 }
1887
1888 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f & ~2,
1889 string_handle_ref, string_handle_set,
1890 string_get_handle);
1891 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string);
1892
1893 void
1894 scm_init_strings ()
1895 {
1896 scm_nullstr = scm_i_make_string (0, NULL);
1897
1898 #include "libguile/strings.x"
1899 }
1900
1901
1902 /*
1903 Local Variables:
1904 c-file-style: "gnu"
1905 End:
1906 */