Thanks, Mike.
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <string.h>
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <unistr.h>
29 #include <uniconv.h>
30
31 #include "libguile/_scm.h"
32 #include "libguile/chars.h"
33 #include "libguile/root.h"
34 #include "libguile/strings.h"
35 #include "libguile/deprecation.h"
36 #include "libguile/validate.h"
37 #include "libguile/dynwind.h"
38
39 \f
40
41 /* {Strings}
42 */
43
44
45 /* Stringbufs
46 *
47 * XXX - keeping an accurate refcount during GC seems to be quite
48 * tricky, so we just keep score of whether a stringbuf might be
49 * shared, not whether it definitely is.
50 *
51 * The scheme I (mvo) tried to keep an accurate reference count would
52 * recount all strings that point to a stringbuf during the mark-phase
53 * of the GC. This was done since one cannot access the stringbuf of
54 * a string when that string is freed (in order to decrease the
55 * reference count). The memory of the stringbuf might have been
56 * reused already for something completely different.
57 *
58 * This recounted worked for a small number of threads beating on
59 * cow-strings, but it failed randomly with more than 10 threads, say.
60 * I couldn't figure out what went wrong, so I used the conservative
61 * approach implemented below.
62 *
63 * A stringbuf needs to know its length, but only so that it can be
64 * reported when the stringbuf is freed.
65 *
66 * There are 3 storage strategies for stringbufs: inline, outline, and
67 * wide.
68 *
69 * Inline strings are small 8-bit strings stored within the double
70 * cell itself. Outline strings are larger 8-bit strings with GC
71 * allocated storage. Wide strings are 32-bit strings with allocated
72 * storage.
73 *
74 * There was little value in making wide string inlineable, since
75 * there is only room for three inlined 32-bit characters. Thus wide
76 * stringbufs are never inlined.
77 */
78
79 #define STRINGBUF_F_SHARED 0x100
80 #define STRINGBUF_F_INLINE 0x200
81 #define STRINGBUF_F_WIDE 0x400 /* If true, strings have UCS-4
82 encoding. Otherwise, strings
83 are Latin-1. */
84
85 #define STRINGBUF_TAG scm_tc7_stringbuf
86 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
87 #define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
88 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
89
90 #define STRINGBUF_OUTLINE_CHARS(buf) ((char *)SCM_CELL_WORD_1(buf))
91 #define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
92 #define STRINGBUF_INLINE_CHARS(buf) ((char *)SCM_CELL_OBJECT_LOC(buf,1))
93 #define STRINGBUF_INLINE_LENGTH(buf) (((size_t)SCM_CELL_WORD_0(buf))>>16)
94
95 #define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
96 ? STRINGBUF_INLINE_CHARS (buf) \
97 : STRINGBUF_OUTLINE_CHARS (buf))
98
99 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *)SCM_CELL_WORD_1(buf))
100 #define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
101 ? STRINGBUF_INLINE_LENGTH (buf) \
102 : STRINGBUF_OUTLINE_LENGTH (buf))
103
104 #define STRINGBUF_MAX_INLINE_LEN (3*sizeof(scm_t_bits))
105
106 #define SET_STRINGBUF_SHARED(buf) \
107 (SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED))
108
109 #if SCM_STRING_LENGTH_HISTOGRAM
110 static size_t lenhist[1001];
111 #endif
112
113 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
114 characters. */
115 static SCM
116 make_stringbuf (size_t len)
117 {
118 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
119 scm_i_symbol_chars, all stringbufs are null-terminated. Once
120 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
121 has been changed for scm_i_symbol_chars, this null-termination
122 can be dropped.
123 */
124
125 #if SCM_STRING_LENGTH_HISTOGRAM
126 if (len < 1000)
127 lenhist[len]++;
128 else
129 lenhist[1000]++;
130 #endif
131
132 if (len <= STRINGBUF_MAX_INLINE_LEN-1)
133 {
134 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_INLINE | (len << 16),
135 0, 0, 0);
136 }
137 else
138 {
139 char *mem = scm_gc_malloc (len+1, "string");
140 mem[len] = '\0';
141 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) mem,
142 (scm_t_bits) len, (scm_t_bits) 0);
143 }
144 }
145
146 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
147 characters. */
148 static SCM
149 make_wide_stringbuf (size_t len)
150 {
151 scm_t_wchar *mem;
152 #if SCM_STRING_LENGTH_HISTOGRAM
153 if (len < 1000)
154 lenhist[len]++;
155 else
156 lenhist[1000]++;
157 #endif
158
159 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
160 mem[len] = 0;
161 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
162 (scm_t_bits) len, (scm_t_bits) 0);
163 }
164
165 /* Return a new stringbuf whose underlying storage consists of the LEN+1
166 octets pointed to by STR (the last octet is zero). */
167 SCM
168 scm_i_take_stringbufn (char *str, size_t len)
169 {
170 scm_gc_register_collectable_memory (str, len + 1, "stringbuf");
171
172 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) str,
173 (scm_t_bits) len, (scm_t_bits) 0);
174 }
175
176 SCM
177 scm_i_stringbuf_mark (SCM buf)
178 {
179 return SCM_BOOL_F;
180 }
181
182 void
183 scm_i_stringbuf_free (SCM buf)
184 {
185 if (!STRINGBUF_INLINE (buf))
186 {
187 if (!STRINGBUF_WIDE (buf))
188 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
189 STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string");
190 else
191 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
192 sizeof (scm_t_wchar) * (STRINGBUF_OUTLINE_LENGTH (buf)
193 + 1), "string");
194 }
195
196 }
197
198 /* Convert a stringbuf containing 8-bit Latin-1-encoded characters to
199 one containing 32-bit UCS-4-encoded characters. */
200 static void
201 widen_stringbuf (SCM buf)
202 {
203 size_t i, len;
204 scm_t_wchar *mem;
205
206 if (STRINGBUF_WIDE (buf))
207 return;
208
209 if (STRINGBUF_INLINE (buf))
210 {
211 len = STRINGBUF_INLINE_LENGTH (buf);
212
213 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
214 for (i = 0; i < len; i++)
215 mem[i] =
216 (scm_t_wchar) (unsigned char) STRINGBUF_INLINE_CHARS (buf)[i];
217 mem[len] = 0;
218
219 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
220 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
221 SCM_SET_CELL_WORD_1 (buf, mem);
222 SCM_SET_CELL_WORD_2 (buf, len);
223 }
224 else
225 {
226 len = STRINGBUF_OUTLINE_LENGTH (buf);
227
228 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
229 for (i = 0; i < len; i++)
230 mem[i] =
231 (scm_t_wchar) (unsigned char) STRINGBUF_OUTLINE_CHARS (buf)[i];
232 mem[len] = 0;
233
234 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
235
236 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
237 SCM_SET_CELL_WORD_1 (buf, mem);
238 SCM_SET_CELL_WORD_2 (buf, len);
239 }
240 }
241
242 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
243
244 /* Copy-on-write strings.
245 */
246
247 #define STRING_TAG scm_tc7_string
248
249 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
250 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
251 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
252
253 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
254 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
255
256 #define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
257
258 /* Read-only strings.
259 */
260
261 #define RO_STRING_TAG (scm_tc7_string + 0x200)
262 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
263
264 /* Mutation-sharing substrings
265 */
266
267 #define SH_STRING_TAG (scm_tc7_string + 0x100)
268
269 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
270 /* START and LENGTH as for STRINGs. */
271
272 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
273
274 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
275 characters. CHARSP, if not NULL, will be set to location of the
276 char array. */
277 SCM
278 scm_i_make_string (size_t len, char **charsp)
279 {
280 SCM buf = make_stringbuf (len);
281 SCM res;
282 if (charsp)
283 *charsp = STRINGBUF_CHARS (buf);
284 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
285 (scm_t_bits)0, (scm_t_bits) len);
286 return res;
287 }
288
289 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
290 characters. CHARSP, if not NULL, will be set to location of the
291 character array. */
292 SCM
293 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
294 {
295 SCM buf = make_wide_stringbuf (len);
296 SCM res;
297 if (charsp)
298 *charsp = STRINGBUF_WIDE_CHARS (buf);
299 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
300 (scm_t_bits) 0, (scm_t_bits) len);
301 return res;
302 }
303
304 static void
305 validate_substring_args (SCM str, size_t start, size_t end)
306 {
307 if (!IS_STRING (str))
308 scm_wrong_type_arg_msg (NULL, 0, str, "string");
309 if (start > STRING_LENGTH (str))
310 scm_out_of_range (NULL, scm_from_size_t (start));
311 if (end > STRING_LENGTH (str) || end < start)
312 scm_out_of_range (NULL, scm_from_size_t (end));
313 }
314
315 static inline void
316 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
317 {
318 *start = STRING_START (*str);
319 if (IS_SH_STRING (*str))
320 {
321 *str = SH_STRING_STRING (*str);
322 *start += STRING_START (*str);
323 }
324 *buf = STRING_STRINGBUF (*str);
325 }
326
327 SCM
328 scm_i_substring (SCM str, size_t start, size_t end)
329 {
330 SCM buf;
331 size_t str_start;
332 get_str_buf_start (&str, &buf, &str_start);
333 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
334 SET_STRINGBUF_SHARED (buf);
335 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
336 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
337 (scm_t_bits)str_start + start,
338 (scm_t_bits) end - start);
339 }
340
341 SCM
342 scm_i_substring_read_only (SCM str, size_t start, size_t end)
343 {
344 SCM buf;
345 size_t str_start;
346 get_str_buf_start (&str, &buf, &str_start);
347 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
348 SET_STRINGBUF_SHARED (buf);
349 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
350 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
351 (scm_t_bits)str_start + start,
352 (scm_t_bits) end - start);
353 }
354
355 SCM
356 scm_i_substring_copy (SCM str, size_t start, size_t end)
357 {
358 size_t len = end - start;
359 SCM buf, my_buf;
360 size_t str_start;
361 get_str_buf_start (&str, &buf, &str_start);
362 if (scm_i_is_narrow_string (str))
363 {
364 my_buf = make_stringbuf (len);
365 memcpy (STRINGBUF_CHARS (my_buf),
366 STRINGBUF_CHARS (buf) + str_start + start, len);
367 }
368 else
369 {
370 my_buf = make_wide_stringbuf (len);
371 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
372 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
373 + start), len);
374 /* Even though this string is wide, the substring may be narrow.
375 Consider adding code to narrow the string. */
376 }
377 scm_remember_upto_here_1 (buf);
378 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
379 (scm_t_bits) 0, (scm_t_bits) len);
380 }
381
382 SCM
383 scm_i_substring_shared (SCM str, size_t start, size_t end)
384 {
385 if (start == 0 && end == STRING_LENGTH (str))
386 return str;
387 else
388 {
389 size_t len = end - start;
390 if (IS_SH_STRING (str))
391 {
392 start += STRING_START (str);
393 str = SH_STRING_STRING (str);
394 }
395 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
396 (scm_t_bits)start, (scm_t_bits) len);
397 }
398 }
399
400 SCM
401 scm_c_substring (SCM str, size_t start, size_t end)
402 {
403 validate_substring_args (str, start, end);
404 return scm_i_substring (str, start, end);
405 }
406
407 SCM
408 scm_c_substring_read_only (SCM str, size_t start, size_t end)
409 {
410 validate_substring_args (str, start, end);
411 return scm_i_substring_read_only (str, start, end);
412 }
413
414 SCM
415 scm_c_substring_copy (SCM str, size_t start, size_t end)
416 {
417 validate_substring_args (str, start, end);
418 return scm_i_substring_copy (str, start, end);
419 }
420
421 SCM
422 scm_c_substring_shared (SCM str, size_t start, size_t end)
423 {
424 validate_substring_args (str, start, end);
425 return scm_i_substring_shared (str, start, end);
426 }
427
428 SCM
429 scm_i_string_mark (SCM str)
430 {
431 if (IS_SH_STRING (str))
432 return SH_STRING_STRING (str);
433 else
434 return STRING_STRINGBUF (str);
435 }
436
437 void
438 scm_i_string_free (SCM str)
439 {
440 }
441
442 /* Internal accessors
443 */
444
445 /* Returns the number of characters in STR. This may be different
446 than the memory size of the string storage. */
447 size_t
448 scm_i_string_length (SCM str)
449 {
450 return STRING_LENGTH (str);
451 }
452
453 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
454 encoding. False if it is 'wide', having a 32-bit UCS-4
455 encoding. */
456 int
457 scm_i_is_narrow_string (SCM str)
458 {
459 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
460 }
461
462 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
463 STR. */
464 const char *
465 scm_i_string_chars (SCM str)
466 {
467 SCM buf;
468 size_t start;
469 get_str_buf_start (&str, &buf, &start);
470 if (scm_i_is_narrow_string (str))
471 return STRINGBUF_CHARS (buf) + start;
472 else
473 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
474 scm_list_1 (str));
475 return NULL;
476 }
477
478 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
479 STR. */
480 const scm_t_wchar *
481 scm_i_string_wide_chars (SCM str)
482 {
483 SCM buf;
484 size_t start;
485
486 get_str_buf_start (&str, &buf, &start);
487 if (!scm_i_is_narrow_string (str))
488 return STRINGBUF_WIDE_CHARS (buf) + start;
489 else
490 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
491 scm_list_1 (str));
492 }
493
494 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
495 a new string buffer, so that it can be modified without modifying
496 other strings. Also, lock the string mutex. Later, one must call
497 scm_i_string_stop_writing to unlock the mutex. */
498 SCM
499 scm_i_string_start_writing (SCM orig_str)
500 {
501 SCM buf, str = orig_str;
502 size_t start;
503
504 get_str_buf_start (&str, &buf, &start);
505 if (IS_RO_STRING (str))
506 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
507
508 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
509 if (STRINGBUF_SHARED (buf))
510 {
511 /* Clone the stringbuf. */
512 size_t len = STRING_LENGTH (str);
513 SCM new_buf;
514
515 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
516
517 if (scm_i_is_narrow_string (str))
518 {
519 new_buf = make_stringbuf (len);
520 memcpy (STRINGBUF_CHARS (new_buf),
521 STRINGBUF_CHARS (buf) + STRING_START (str), len);
522
523 }
524 else
525 {
526 new_buf = make_wide_stringbuf (len);
527 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
528 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
529 + STRING_START (str)), len);
530 }
531 scm_i_thread_put_to_sleep ();
532 SET_STRING_STRINGBUF (str, new_buf);
533 start -= STRING_START (str);
534 SET_STRING_START (str, 0);
535 scm_i_thread_wake_up ();
536
537 buf = new_buf;
538
539 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
540 }
541 return orig_str;
542 }
543
544 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
545 char *
546 scm_i_string_writable_chars (SCM str)
547 {
548 SCM buf;
549 size_t start;
550
551 get_str_buf_start (&str, &buf, &start);
552 if (scm_i_is_narrow_string (str))
553 return STRINGBUF_CHARS (buf) + start;
554 else
555 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
556 scm_list_1 (str));
557 return NULL;
558 }
559
560 /* Return a pointer to the UCS-4 codepoints of a string. */
561 static scm_t_wchar *
562 scm_i_string_writable_wide_chars (SCM str)
563 {
564 SCM buf;
565 size_t start;
566
567 get_str_buf_start (&str, &buf, &start);
568 if (!scm_i_is_narrow_string (str))
569 return STRINGBUF_WIDE_CHARS (buf) + start;
570 else
571 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
572 scm_list_1 (str));
573 }
574
575 /* Unlock the string mutex that was locked when
576 scm_i_string_start_writing was called. */
577 void
578 scm_i_string_stop_writing (void)
579 {
580 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
581 }
582
583 /* Return the Xth character of STR as a UCS-4 codepoint. */
584 scm_t_wchar
585 scm_i_string_ref (SCM str, size_t x)
586 {
587 if (scm_i_is_narrow_string (str))
588 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
589 else
590 return scm_i_string_wide_chars (str)[x];
591 }
592
593 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
594 void
595 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
596 {
597 if (chr > 0xFF && scm_i_is_narrow_string (str))
598 widen_stringbuf (STRING_STRINGBUF (str));
599
600 if (scm_i_is_narrow_string (str))
601 {
602 char *dst = scm_i_string_writable_chars (str);
603 dst[p] = (char) (unsigned char) chr;
604 }
605 else
606 {
607 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
608 dst[p] = chr;
609 }
610 }
611
612 /* Symbols.
613
614 Basic symbol creation and accessing is done here, the rest is in
615 symbols.[hc]. This has been done to keep stringbufs and the
616 internals of strings and string-like objects confined to this file.
617 */
618
619 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
620
621 SCM
622 scm_i_make_symbol (SCM name, scm_t_bits flags,
623 unsigned long hash, SCM props)
624 {
625 SCM buf;
626 size_t start = STRING_START (name);
627 size_t length = STRING_LENGTH (name);
628
629 if (IS_SH_STRING (name))
630 {
631 name = SH_STRING_STRING (name);
632 start += STRING_START (name);
633 }
634 buf = SYMBOL_STRINGBUF (name);
635
636 if (start == 0 && length == STRINGBUF_LENGTH (buf))
637 {
638 /* reuse buf. */
639 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
640 SET_STRINGBUF_SHARED (buf);
641 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
642 }
643 else
644 {
645 /* make new buf. */
646 if (scm_i_is_narrow_string (name))
647 {
648 SCM new_buf = make_stringbuf (length);
649 memcpy (STRINGBUF_CHARS (new_buf),
650 STRINGBUF_CHARS (buf) + start, length);
651 buf = new_buf;
652 }
653 else
654 {
655 SCM new_buf = make_wide_stringbuf (length);
656 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
657 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
658 length);
659 buf = new_buf;
660 }
661 }
662 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
663 (scm_t_bits) hash, SCM_UNPACK (props));
664 }
665
666 SCM
667 scm_i_c_make_symbol (const char *name, size_t len,
668 scm_t_bits flags, unsigned long hash, SCM props)
669 {
670 SCM buf = make_stringbuf (len);
671 memcpy (STRINGBUF_CHARS (buf), name, len);
672
673 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
674 (scm_t_bits) hash, SCM_UNPACK (props));
675 }
676
677 /* Return a new symbol that uses the LEN bytes pointed to by NAME as its
678 underlying storage. */
679 SCM
680 scm_i_c_take_symbol (char *name, size_t len,
681 scm_t_bits flags, unsigned long hash, SCM props)
682 {
683 SCM buf = scm_i_take_stringbufn (name, len);
684
685 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
686 (scm_t_bits) hash, SCM_UNPACK (props));
687 }
688
689 /* Returns the number of characters in SYM. This may be different
690 from the memory size of SYM. */
691 size_t
692 scm_i_symbol_length (SCM sym)
693 {
694 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
695 }
696
697 size_t
698 scm_c_symbol_length (SCM sym)
699 #define FUNC_NAME "scm_c_symbol_length"
700 {
701 SCM_VALIDATE_SYMBOL (1, sym);
702
703 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
704 }
705 #undef FUNC_NAME
706
707 /* True if the name of SYM is stored as a Latin-1 encoded string.
708 False if it is stored as a 32-bit UCS-4-encoded string. */
709 int
710 scm_i_is_narrow_symbol (SCM sym)
711 {
712 SCM buf;
713
714 buf = SYMBOL_STRINGBUF (sym);
715 return !STRINGBUF_WIDE (buf);
716 }
717
718 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
719 contains the name of SYM. */
720 const char *
721 scm_i_symbol_chars (SCM sym)
722 {
723 SCM buf;
724
725 buf = SYMBOL_STRINGBUF (sym);
726 if (!STRINGBUF_WIDE (buf))
727 return STRINGBUF_CHARS (buf);
728 else
729 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
730 scm_list_1 (sym));
731 }
732
733 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
734 symbol's name. */
735 const scm_t_wchar *
736 scm_i_symbol_wide_chars (SCM sym)
737 {
738 SCM buf;
739
740 buf = SYMBOL_STRINGBUF (sym);
741 if (STRINGBUF_WIDE (buf))
742 return STRINGBUF_WIDE_CHARS (buf);
743 else
744 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
745 scm_list_1 (sym));
746 }
747
748 SCM
749 scm_i_symbol_mark (SCM sym)
750 {
751 scm_gc_mark (SYMBOL_STRINGBUF (sym));
752 return SCM_CELL_OBJECT_3 (sym);
753 }
754
755 void
756 scm_i_symbol_free (SCM sym)
757 {
758 }
759
760 SCM
761 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
762 {
763 SCM buf = SYMBOL_STRINGBUF (sym);
764 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
765 SET_STRINGBUF_SHARED (buf);
766 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
767 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
768 (scm_t_bits)start, (scm_t_bits) end - start);
769 }
770
771 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
772 scm_t_wchar
773 scm_i_symbol_ref (SCM sym, size_t x)
774 {
775 if (scm_i_is_narrow_symbol (sym))
776 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
777 else
778 return scm_i_symbol_wide_chars (sym)[x];
779 }
780
781 /* Debugging
782 */
783
784 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
785 "Returns an association list containing debugging information\n"
786 "for @var{str}. The association list has the following entries."
787 "@table @code\n"
788 "@item string\n"
789 "The string itself.\n"
790 "@item start\n"
791 "The start index of the string into its stringbuf\n"
792 "@item length\n"
793 "The length of the string\n"
794 "@item shared\n"
795 "If this string is a substring, it returns its parent string.\n"
796 "Otherwise, it returns @code{#f}\n"
797 "@item read-only\n"
798 "@code{#t} if the string is read-only\n"
799 "@item stringbuf-chars\n"
800 "A new string containing this string's stringbuf's characters\n"
801 "@item stringbuf-length\n"
802 "The number of characters in this stringbuf\n"
803 "@item stringbuf-shared\n"
804 "@code{#t} if this stringbuf is shared\n"
805 "@item stringbuf-inline\n"
806 "@code{#t} if this stringbuf's characters are stored in the\n"
807 "cell itself, or @code{#f} if they were allocated in memory\n"
808 "@item stringbuf-wide\n"
809 "@code{#t} if this stringbuf's characters are stored in a\n"
810 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
811 "buffer\n"
812 "@end table")
813 #define FUNC_NAME s_scm_sys_string_dump
814 {
815 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9, e10;
816 SCM buf;
817 SCM_VALIDATE_STRING (1, str);
818
819 /* String info */
820 e1 = scm_cons (scm_from_locale_symbol ("string"),
821 str);
822 e2 = scm_cons (scm_from_locale_symbol ("start"),
823 scm_from_size_t (STRING_START (str)));
824 e3 = scm_cons (scm_from_locale_symbol ("length"),
825 scm_from_size_t (STRING_LENGTH (str)));
826
827 if (IS_SH_STRING (str))
828 {
829 e4 = scm_cons (scm_from_locale_symbol ("shared"),
830 SH_STRING_STRING (str));
831 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
832 }
833 else
834 {
835 e4 = scm_cons (scm_from_locale_symbol ("shared"),
836 SCM_BOOL_F);
837 buf = STRING_STRINGBUF (str);
838 }
839
840 if (IS_RO_STRING (str))
841 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
842 SCM_BOOL_T);
843 else
844 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
845 SCM_BOOL_F);
846
847 /* Stringbuf info */
848 if (!STRINGBUF_WIDE (buf))
849 {
850 size_t len = STRINGBUF_LENGTH (buf);
851 char *cbuf;
852 SCM sbc = scm_i_make_string (len, &cbuf);
853 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
854 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
855 sbc);
856 }
857 else
858 {
859 size_t len = STRINGBUF_LENGTH (buf);
860 scm_t_wchar *cbuf;
861 SCM sbc = scm_i_make_wide_string (len, &cbuf);
862 u32_cpy ((scm_t_uint32 *) cbuf,
863 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
864 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
865 sbc);
866 }
867 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
868 scm_from_size_t (STRINGBUF_LENGTH (buf)));
869 if (STRINGBUF_SHARED (buf))
870 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
871 SCM_BOOL_T);
872 else
873 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
874 SCM_BOOL_F);
875 if (STRINGBUF_INLINE (buf))
876 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
877 SCM_BOOL_T);
878 else
879 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
880 SCM_BOOL_F);
881 if (STRINGBUF_WIDE (buf))
882 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
883 SCM_BOOL_T);
884 else
885 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
886 SCM_BOOL_F);
887
888 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, SCM_UNDEFINED);
889 }
890 #undef FUNC_NAME
891
892 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
893 "Returns an association list containing debugging information\n"
894 "for @var{sym}. The association list has the following entries."
895 "@table @code\n"
896 "@item symbol\n"
897 "The symbol itself\n"
898 "@item hash\n"
899 "Its hash value\n"
900 "@item interned\n"
901 "@code{#t} if it is an interned symbol\n"
902 "@item stringbuf-chars\n"
903 "A new string containing this symbols's stringbuf's characters\n"
904 "@item stringbuf-length\n"
905 "The number of characters in this stringbuf\n"
906 "@item stringbuf-shared\n"
907 "@code{#t} if this stringbuf is shared\n"
908 "@item stringbuf-inline\n"
909 "@code{#t} if this stringbuf's characters are stored in the\n"
910 "cell itself, or @code{#f} if they were allocated in memory\n"
911 "@item stringbuf-wide\n"
912 "@code{#t} if this stringbuf's characters are stored in a\n"
913 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
914 "buffer\n"
915 "@end table")
916 #define FUNC_NAME s_scm_sys_symbol_dump
917 {
918 SCM e1, e2, e3, e4, e5, e6, e7, e8;
919 SCM buf;
920 SCM_VALIDATE_SYMBOL (1, sym);
921 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
922 sym);
923 e2 = scm_cons (scm_from_locale_symbol ("hash"),
924 scm_from_ulong (scm_i_symbol_hash (sym)));
925 e3 = scm_cons (scm_from_locale_symbol ("interned"),
926 scm_symbol_interned_p (sym));
927 buf = SYMBOL_STRINGBUF (sym);
928
929 /* Stringbuf info */
930 if (!STRINGBUF_WIDE (buf))
931 {
932 size_t len = STRINGBUF_LENGTH (buf);
933 char *cbuf;
934 SCM sbc = scm_i_make_string (len, &cbuf);
935 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
936 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
937 sbc);
938 }
939 else
940 {
941 size_t len = STRINGBUF_LENGTH (buf);
942 scm_t_wchar *cbuf;
943 SCM sbc = scm_i_make_wide_string (len, &cbuf);
944 u32_cpy ((scm_t_uint32 *) cbuf,
945 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
946 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
947 sbc);
948 }
949 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
950 scm_from_size_t (STRINGBUF_LENGTH (buf)));
951 if (STRINGBUF_SHARED (buf))
952 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
953 SCM_BOOL_T);
954 else
955 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
956 SCM_BOOL_F);
957 if (STRINGBUF_INLINE (buf))
958 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
959 SCM_BOOL_T);
960 else
961 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
962 SCM_BOOL_F);
963 if (STRINGBUF_WIDE (buf))
964 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
965 SCM_BOOL_T);
966 else
967 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
968 SCM_BOOL_F);
969 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, SCM_UNDEFINED);
970
971 }
972 #undef FUNC_NAME
973
974 #if SCM_STRING_LENGTH_HISTOGRAM
975
976 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
977 #define FUNC_NAME s_scm_sys_stringbuf_hist
978 {
979 int i;
980 for (i = 0; i < 1000; i++)
981 if (lenhist[i])
982 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
983 fprintf (stderr, ">999: %u\n", lenhist[1000]);
984 return SCM_UNSPECIFIED;
985 }
986 #undef FUNC_NAME
987
988 #endif
989
990 \f
991
992 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
993 (SCM obj),
994 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
995 #define FUNC_NAME s_scm_string_p
996 {
997 return scm_from_bool (IS_STRING (obj));
998 }
999 #undef FUNC_NAME
1000
1001
1002 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1003
1004 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1005 (SCM chrs),
1006 "@deffnx {Scheme Procedure} list->string chrs\n"
1007 "Return a newly allocated string composed of the arguments,\n"
1008 "@var{chrs}.")
1009 #define FUNC_NAME s_scm_string
1010 {
1011 SCM result;
1012 SCM rest;
1013 size_t len;
1014 size_t p = 0;
1015 long i;
1016
1017 /* Verify that this is a list of chars. */
1018 i = scm_ilength (chrs);
1019 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1020
1021 len = (size_t) i;
1022 rest = chrs;
1023
1024 while (len > 0 && scm_is_pair (rest))
1025 {
1026 SCM elt = SCM_CAR (rest);
1027 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1028 rest = SCM_CDR (rest);
1029 len--;
1030 scm_remember_upto_here_1 (elt);
1031 }
1032
1033 /* Construct a string containing this list of chars. */
1034 len = (size_t) i;
1035 rest = chrs;
1036
1037 result = scm_i_make_string (len, NULL);
1038 result = scm_i_string_start_writing (result);
1039 while (len > 0 && scm_is_pair (rest))
1040 {
1041 SCM elt = SCM_CAR (rest);
1042 scm_i_string_set_x (result, p, SCM_CHAR (elt));
1043 p++;
1044 rest = SCM_CDR (rest);
1045 len--;
1046 scm_remember_upto_here_1 (elt);
1047 }
1048 scm_i_string_stop_writing ();
1049
1050 if (len > 0)
1051 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1052 if (!scm_is_null (rest))
1053 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1054
1055 return result;
1056 }
1057 #undef FUNC_NAME
1058
1059 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1060 (SCM k, SCM chr),
1061 "Return a newly allocated string of\n"
1062 "length @var{k}. If @var{chr} is given, then all elements of\n"
1063 "the string are initialized to @var{chr}, otherwise the contents\n"
1064 "of the @var{string} are unspecified.")
1065 #define FUNC_NAME s_scm_make_string
1066 {
1067 return scm_c_make_string (scm_to_size_t (k), chr);
1068 }
1069 #undef FUNC_NAME
1070
1071 SCM
1072 scm_c_make_string (size_t len, SCM chr)
1073 #define FUNC_NAME NULL
1074 {
1075 size_t p;
1076 SCM res = scm_i_make_string (len, NULL);
1077
1078 if (!SCM_UNBNDP (chr))
1079 {
1080 SCM_VALIDATE_CHAR (0, chr);
1081 res = scm_i_string_start_writing (res);
1082 for (p = 0; p < len; p++)
1083 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1084 scm_i_string_stop_writing ();
1085 }
1086
1087 return res;
1088 }
1089 #undef FUNC_NAME
1090
1091 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1092 (SCM string),
1093 "Return the number of characters in @var{string}.")
1094 #define FUNC_NAME s_scm_string_length
1095 {
1096 SCM_VALIDATE_STRING (1, string);
1097 return scm_from_size_t (STRING_LENGTH (string));
1098 }
1099 #undef FUNC_NAME
1100
1101 SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0,
1102 (SCM string),
1103 "Return the bytes used to represent a character in @var{string}."
1104 "This will return 1 or 4.")
1105 #define FUNC_NAME s_scm_string_width
1106 {
1107 SCM_VALIDATE_STRING (1, string);
1108 if (!scm_i_is_narrow_string (string))
1109 return scm_from_int (4);
1110
1111 return scm_from_int (1);
1112 }
1113 #undef FUNC_NAME
1114
1115 size_t
1116 scm_c_string_length (SCM string)
1117 {
1118 if (!IS_STRING (string))
1119 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1120 return STRING_LENGTH (string);
1121 }
1122
1123 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1124 (SCM str, SCM k),
1125 "Return character @var{k} of @var{str} using zero-origin\n"
1126 "indexing. @var{k} must be a valid index of @var{str}.")
1127 #define FUNC_NAME s_scm_string_ref
1128 {
1129 size_t len;
1130 unsigned long idx;
1131
1132 SCM_VALIDATE_STRING (1, str);
1133
1134 len = scm_i_string_length (str);
1135 if (SCM_LIKELY (len > 0))
1136 idx = scm_to_unsigned_integer (k, 0, len - 1);
1137 else
1138 scm_out_of_range (NULL, k);
1139
1140 if (scm_i_is_narrow_string (str))
1141 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1142 else
1143 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1144 }
1145 #undef FUNC_NAME
1146
1147 SCM
1148 scm_c_string_ref (SCM str, size_t p)
1149 {
1150 if (p >= scm_i_string_length (str))
1151 scm_out_of_range (NULL, scm_from_size_t (p));
1152 if (scm_i_is_narrow_string (str))
1153 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1154 else
1155 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1156
1157 }
1158
1159 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1160 (SCM str, SCM k, SCM chr),
1161 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1162 "an unspecified value. @var{k} must be a valid index of\n"
1163 "@var{str}.")
1164 #define FUNC_NAME s_scm_string_set_x
1165 {
1166 size_t len;
1167 unsigned long idx;
1168
1169 SCM_VALIDATE_STRING (1, str);
1170
1171 len = scm_i_string_length (str);
1172 if (SCM_LIKELY (len > 0))
1173 idx = scm_to_unsigned_integer (k, 0, len - 1);
1174 else
1175 scm_out_of_range (NULL, k);
1176
1177 SCM_VALIDATE_CHAR (3, chr);
1178 str = scm_i_string_start_writing (str);
1179 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1180 scm_i_string_stop_writing ();
1181
1182 return SCM_UNSPECIFIED;
1183 }
1184 #undef FUNC_NAME
1185
1186 void
1187 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1188 {
1189 if (p >= scm_i_string_length (str))
1190 scm_out_of_range (NULL, scm_from_size_t (p));
1191 str = scm_i_string_start_writing (str);
1192 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1193 scm_i_string_stop_writing ();
1194 }
1195
1196 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1197 (SCM str, SCM start, SCM end),
1198 "Return a newly allocated string formed from the characters\n"
1199 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1200 "ending with index @var{end} (exclusive).\n"
1201 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1202 "exact integers satisfying:\n\n"
1203 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1204 #define FUNC_NAME s_scm_substring
1205 {
1206 size_t len, from, to;
1207
1208 SCM_VALIDATE_STRING (1, str);
1209 len = scm_i_string_length (str);
1210 from = scm_to_unsigned_integer (start, 0, len);
1211 if (SCM_UNBNDP (end))
1212 to = len;
1213 else
1214 to = scm_to_unsigned_integer (end, from, len);
1215 return scm_i_substring (str, from, to);
1216 }
1217 #undef FUNC_NAME
1218
1219 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1220 (SCM str, SCM start, SCM end),
1221 "Return a newly allocated string formed from the characters\n"
1222 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1223 "ending with index @var{end} (exclusive).\n"
1224 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1225 "exact integers satisfying:\n"
1226 "\n"
1227 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1228 "\n"
1229 "The returned string is read-only.\n")
1230 #define FUNC_NAME s_scm_substring_read_only
1231 {
1232 size_t len, from, to;
1233
1234 SCM_VALIDATE_STRING (1, str);
1235 len = scm_i_string_length (str);
1236 from = scm_to_unsigned_integer (start, 0, len);
1237 if (SCM_UNBNDP (end))
1238 to = len;
1239 else
1240 to = scm_to_unsigned_integer (end, from, len);
1241 return scm_i_substring_read_only (str, from, to);
1242 }
1243 #undef FUNC_NAME
1244
1245 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1246 (SCM str, SCM start, SCM end),
1247 "Return a newly allocated string formed from the characters\n"
1248 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1249 "ending with index @var{end} (exclusive).\n"
1250 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1251 "exact integers satisfying:\n\n"
1252 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1253 #define FUNC_NAME s_scm_substring_copy
1254 {
1255 /* For the Scheme version, START is mandatory, but for the C
1256 version, it is optional. See scm_string_copy in srfi-13.c for a
1257 rationale.
1258 */
1259
1260 size_t from, to;
1261
1262 SCM_VALIDATE_STRING (1, str);
1263 scm_i_get_substring_spec (scm_i_string_length (str),
1264 start, &from, end, &to);
1265 return scm_i_substring_copy (str, from, to);
1266 }
1267 #undef FUNC_NAME
1268
1269 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1270 (SCM str, SCM start, SCM end),
1271 "Return string that indirectly refers to the characters\n"
1272 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1273 "ending with index @var{end} (exclusive).\n"
1274 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1275 "exact integers satisfying:\n\n"
1276 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1277 #define FUNC_NAME s_scm_substring_shared
1278 {
1279 size_t len, from, to;
1280
1281 SCM_VALIDATE_STRING (1, str);
1282 len = scm_i_string_length (str);
1283 from = scm_to_unsigned_integer (start, 0, len);
1284 if (SCM_UNBNDP (end))
1285 to = len;
1286 else
1287 to = scm_to_unsigned_integer (end, from, len);
1288 return scm_i_substring_shared (str, from, to);
1289 }
1290 #undef FUNC_NAME
1291
1292 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1293 (SCM args),
1294 "Return a newly allocated string whose characters form the\n"
1295 "concatenation of the given strings, @var{args}.")
1296 #define FUNC_NAME s_scm_string_append
1297 {
1298 SCM res;
1299 size_t len = 0;
1300 int wide = 0;
1301 SCM l, s;
1302 size_t i;
1303 union
1304 {
1305 char *narrow;
1306 scm_t_wchar *wide;
1307 } data;
1308
1309 SCM_VALIDATE_REST_ARGUMENT (args);
1310 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1311 {
1312 s = SCM_CAR (l);
1313 SCM_VALIDATE_STRING (SCM_ARGn, s);
1314 len += scm_i_string_length (s);
1315 if (!scm_i_is_narrow_string (s))
1316 wide = 1;
1317 }
1318 data.narrow = NULL;
1319 if (!wide)
1320 res = scm_i_make_string (len, &data.narrow);
1321 else
1322 res = scm_i_make_wide_string (len, &data.wide);
1323
1324 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1325 {
1326 size_t len;
1327 s = SCM_CAR (l);
1328 SCM_VALIDATE_STRING (SCM_ARGn, s);
1329 len = scm_i_string_length (s);
1330 if (!wide)
1331 {
1332 memcpy (data.narrow, scm_i_string_chars (s), len);
1333 data.narrow += len;
1334 }
1335 else
1336 {
1337 if (scm_i_is_narrow_string (s))
1338 {
1339 for (i = 0; i < scm_i_string_length (s); i++)
1340 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1341 }
1342 else
1343 u32_cpy ((scm_t_uint32 *) data.wide,
1344 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1345 data.wide += len;
1346 }
1347 scm_remember_upto_here_1 (s);
1348 }
1349 return res;
1350 }
1351 #undef FUNC_NAME
1352
1353 int
1354 scm_is_string (SCM obj)
1355 {
1356 return IS_STRING (obj);
1357 }
1358
1359 SCM
1360 scm_from_locale_stringn (const char *str, size_t len)
1361 {
1362 SCM res;
1363 char *dst;
1364
1365 if (len == (size_t) -1)
1366 len = strlen (str);
1367 if (len == 0)
1368 return scm_nullstr;
1369
1370 res = scm_i_make_string (len, &dst);
1371 memcpy (dst, str, len);
1372 return res;
1373 }
1374
1375 SCM
1376 scm_from_locale_string (const char *str)
1377 {
1378 if (str == NULL)
1379 return scm_nullstr;
1380
1381 return scm_from_locale_stringn (str, -1);
1382 }
1383
1384 /* Create a new scheme string from the C string STR. The memory of
1385 STR may be used directly as storage for the new string. */
1386 SCM
1387 scm_take_locale_stringn (char *str, size_t len)
1388 {
1389 SCM buf, res;
1390
1391 if (len == (size_t) -1)
1392 len = strlen (str);
1393 else
1394 {
1395 /* Ensure STR is null terminated. A realloc for 1 extra byte should
1396 often be satisfied from the alignment padding after the block, with
1397 no actual data movement. */
1398 str = scm_realloc (str, len + 1);
1399 str[len] = '\0';
1400 }
1401
1402 buf = scm_i_take_stringbufn (str, len);
1403 res = scm_double_cell (STRING_TAG,
1404 SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
1405 return res;
1406 }
1407
1408 SCM
1409 scm_take_locale_string (char *str)
1410 {
1411 return scm_take_locale_stringn (str, -1);
1412 }
1413
1414 /* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1415 and \UXXXXXX. */
1416 static void
1417 unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1418 {
1419 char *before, *after;
1420 size_t i, j;
1421
1422 before = *bufp;
1423 after = *bufp;
1424 i = 0;
1425 j = 0;
1426 while (i < *lenp)
1427 {
1428 if ((i <= *lenp - 6)
1429 && before[i] == '\\'
1430 && before[i + 1] == 'u'
1431 && before[i + 2] == '0' && before[i + 3] == '0')
1432 {
1433 /* Convert \u00NN to \xNN */
1434 after[j] = '\\';
1435 after[j + 1] = 'x';
1436 after[j + 2] = tolower ((int) before[i + 4]);
1437 after[j + 3] = tolower ((int) before[i + 5]);
1438 i += 6;
1439 j += 4;
1440 }
1441 else if ((i <= *lenp - 10)
1442 && before[i] == '\\'
1443 && before[i + 1] == 'U'
1444 && before[i + 2] == '0' && before[i + 3] == '0')
1445 {
1446 /* Convert \U00NNNNNN to \UNNNNNN */
1447 after[j] = '\\';
1448 after[j + 1] = 'U';
1449 after[j + 2] = tolower ((int) before[i + 4]);
1450 after[j + 3] = tolower ((int) before[i + 5]);
1451 after[j + 4] = tolower ((int) before[i + 6]);
1452 after[j + 5] = tolower ((int) before[i + 7]);
1453 after[j + 6] = tolower ((int) before[i + 8]);
1454 after[j + 7] = tolower ((int) before[i + 9]);
1455 i += 10;
1456 j += 8;
1457 }
1458 else
1459 {
1460 after[j] = before[i];
1461 i++;
1462 j++;
1463 }
1464 }
1465 *lenp = j;
1466 after = scm_realloc (after, j);
1467 }
1468
1469 char *
1470 scm_to_locale_stringn (SCM str, size_t * lenp)
1471 {
1472 const char *enc;
1473
1474 /* In the future, enc will hold the port's encoding. */
1475 enc = NULL;
1476
1477 return scm_to_stringn (str, lenp, enc,
1478 SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
1479 }
1480
1481 /* Low-level scheme to C string conversion function. */
1482 char *
1483 scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
1484 scm_t_string_failed_conversion_handler handler)
1485 {
1486 static const char iso[11] = "ISO-8859-1";
1487 char *buf;
1488 size_t ilen, len, i;
1489
1490 if (!scm_is_string (str))
1491 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1492 ilen = scm_i_string_length (str);
1493
1494 if (ilen == 0)
1495 {
1496 buf = scm_malloc (1);
1497 buf[0] = '\0';
1498 if (lenp)
1499 *lenp = 0;
1500 return buf;
1501 }
1502
1503 if (lenp == NULL)
1504 for (i = 0; i < ilen; i++)
1505 if (scm_i_string_ref (str, i) == '\0')
1506 scm_misc_error (NULL,
1507 "string contains #\\nul character: ~S",
1508 scm_list_1 (str));
1509
1510 if (scm_i_is_narrow_string (str))
1511 {
1512 if (lenp)
1513 {
1514 buf = scm_malloc (ilen);
1515 memcpy (buf, scm_i_string_chars (str), ilen);
1516 *lenp = ilen;
1517 return buf;
1518 }
1519 else
1520 {
1521 buf = scm_malloc (ilen + 1);
1522 memcpy (buf, scm_i_string_chars (str), ilen);
1523 buf[ilen] = '\0';
1524 return buf;
1525 }
1526 }
1527
1528
1529 buf = NULL;
1530 len = 0;
1531 buf = u32_conv_to_encoding (iso,
1532 (enum iconv_ilseq_handler) handler,
1533 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1534 ilen, NULL, NULL, &len);
1535 if (buf == NULL)
1536 scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1537 scm_list_2 (scm_from_locale_string (iso), str));
1538
1539 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1540 unistring_escapes_to_guile_escapes (&buf, &len);
1541
1542 if (lenp)
1543 *lenp = len;
1544 else
1545 {
1546 buf = scm_realloc (buf, len + 1);
1547 buf[len] = '\0';
1548 }
1549
1550 scm_remember_upto_here_1 (str);
1551 return buf;
1552 }
1553
1554 char *
1555 scm_to_locale_string (SCM str)
1556 {
1557 return scm_to_locale_stringn (str, NULL);
1558 }
1559
1560 size_t
1561 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1562 {
1563 size_t len;
1564 char *result = NULL;
1565 if (!scm_is_string (str))
1566 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1567 result = scm_to_locale_stringn (str, &len);
1568
1569 memcpy (buf, result, (len > max_len) ? max_len : len);
1570 free (result);
1571
1572 scm_remember_upto_here_1 (str);
1573 return len;
1574 }
1575
1576 /* converts C scm_array of strings to SCM scm_list of strings. */
1577 /* If argc < 0, a null terminated scm_array is assumed. */
1578 SCM
1579 scm_makfromstrs (int argc, char **argv)
1580 {
1581 int i = argc;
1582 SCM lst = SCM_EOL;
1583 if (0 > i)
1584 for (i = 0; argv[i]; i++);
1585 while (i--)
1586 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1587 return lst;
1588 }
1589
1590 /* Return a newly allocated array of char pointers to each of the strings
1591 in args, with a terminating NULL pointer. */
1592
1593 char **
1594 scm_i_allocate_string_pointers (SCM list)
1595 {
1596 char **result;
1597 int len = scm_ilength (list);
1598 int i;
1599
1600 if (len < 0)
1601 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1602
1603 scm_dynwind_begin (0);
1604
1605 result = (char **) scm_malloc ((len + 1) * sizeof (char *));
1606 result[len] = NULL;
1607 scm_dynwind_unwind_handler (free, result, 0);
1608
1609 /* The list might be have been modified in another thread, so
1610 we check LIST before each access.
1611 */
1612 for (i = 0; i < len && scm_is_pair (list); i++)
1613 {
1614 result[i] = scm_to_locale_string (SCM_CAR (list));
1615 list = SCM_CDR (list);
1616 }
1617
1618 scm_dynwind_end ();
1619 return result;
1620 }
1621
1622 void
1623 scm_i_free_string_pointers (char **pointers)
1624 {
1625 int i;
1626
1627 for (i = 0; pointers[i]; i++)
1628 free (pointers[i]);
1629 free (pointers);
1630 }
1631
1632 void
1633 scm_i_get_substring_spec (size_t len,
1634 SCM start, size_t *cstart,
1635 SCM end, size_t *cend)
1636 {
1637 if (SCM_UNBNDP (start))
1638 *cstart = 0;
1639 else
1640 *cstart = scm_to_unsigned_integer (start, 0, len);
1641
1642 if (SCM_UNBNDP (end))
1643 *cend = len;
1644 else
1645 *cend = scm_to_unsigned_integer (end, *cstart, len);
1646 }
1647
1648 #if SCM_ENABLE_DEPRECATED
1649
1650 /* When these definitions are removed, it becomes reasonable to use
1651 read-only strings for string literals. For that, change the reader
1652 to create string literals with scm_c_substring_read_only instead of
1653 with scm_c_substring_copy.
1654 */
1655
1656 int
1657 scm_i_deprecated_stringp (SCM str)
1658 {
1659 scm_c_issue_deprecation_warning
1660 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
1661
1662 return scm_is_string (str);
1663 }
1664
1665 char *
1666 scm_i_deprecated_string_chars (SCM str)
1667 {
1668 char *chars;
1669
1670 scm_c_issue_deprecation_warning
1671 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
1672
1673 /* We don't accept shared substrings here since they are not
1674 null-terminated.
1675 */
1676 if (IS_SH_STRING (str))
1677 scm_misc_error (NULL,
1678 "SCM_STRING_CHARS does not work with shared substrings.",
1679 SCM_EOL);
1680
1681 /* We explicitly test for read-only strings to produce a better
1682 error message.
1683 */
1684
1685 if (IS_RO_STRING (str))
1686 scm_misc_error (NULL,
1687 "SCM_STRING_CHARS does not work with read-only strings.",
1688 SCM_EOL);
1689
1690 /* The following is still wrong, of course...
1691 */
1692 str = scm_i_string_start_writing (str);
1693 chars = scm_i_string_writable_chars (str);
1694 scm_i_string_stop_writing ();
1695 return chars;
1696 }
1697
1698 size_t
1699 scm_i_deprecated_string_length (SCM str)
1700 {
1701 scm_c_issue_deprecation_warning
1702 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
1703 return scm_c_string_length (str);
1704 }
1705
1706 #endif
1707
1708 void
1709 scm_init_strings ()
1710 {
1711 scm_nullstr = scm_i_make_string (0, NULL);
1712
1713 #include "libguile/strings.x"
1714 }
1715
1716
1717 /*
1718 Local Variables:
1719 c-file-style: "gnu"
1720 End:
1721 */