f10c9ebce83c054fa741c202545edce952460e95
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <string.h>
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <unistr.h>
29
30 #include "libguile/_scm.h"
31 #include "libguile/chars.h"
32 #include "libguile/root.h"
33 #include "libguile/strings.h"
34 #include "libguile/deprecation.h"
35 #include "libguile/validate.h"
36 #include "libguile/dynwind.h"
37
38 \f
39
40 /* {Strings}
41 */
42
43
44 /* Stringbufs
45 *
46 * XXX - keeping an accurate refcount during GC seems to be quite
47 * tricky, so we just keep score of whether a stringbuf might be
48 * shared, not whether it definitely is.
49 *
50 * The scheme I (mvo) tried to keep an accurate reference count would
51 * recount all strings that point to a stringbuf during the mark-phase
52 * of the GC. This was done since one cannot access the stringbuf of
53 * a string when that string is freed (in order to decrease the
54 * reference count). The memory of the stringbuf might have been
55 * reused already for something completely different.
56 *
57 * This recounted worked for a small number of threads beating on
58 * cow-strings, but it failed randomly with more than 10 threads, say.
59 * I couldn't figure out what went wrong, so I used the conservative
60 * approach implemented below.
61 *
62 * A stringbuf needs to know its length, but only so that it can be
63 * reported when the stringbuf is freed.
64 *
65 * There are 3 storage strategies for stringbufs: inline, outline, and
66 * wide.
67 *
68 * Inline strings are small 8-bit strings stored within the double
69 * cell itself. Outline strings are larger 8-bit strings with GC
70 * allocated storage. Wide strings are 32-bit strings with allocated
71 * storage.
72 *
73 * There was little value in making wide string inlineable, since
74 * there is only room for three inlined 32-bit characters. Thus wide
75 * stringbufs are never inlined.
76 */
77
78 #define STRINGBUF_F_SHARED 0x100
79 #define STRINGBUF_F_INLINE 0x200
80 #define STRINGBUF_F_WIDE 0x400 /* If true, strings have UCS-4
81 encoding. Otherwise, strings
82 are Latin-1. */
83
84 #define STRINGBUF_TAG scm_tc7_stringbuf
85 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
86 #define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
87 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
88
89 #define STRINGBUF_OUTLINE_CHARS(buf) ((char *)SCM_CELL_WORD_1(buf))
90 #define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
91 #define STRINGBUF_INLINE_CHARS(buf) ((char *)SCM_CELL_OBJECT_LOC(buf,1))
92 #define STRINGBUF_INLINE_LENGTH(buf) (((size_t)SCM_CELL_WORD_0(buf))>>16)
93
94 #define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
95 ? STRINGBUF_INLINE_CHARS (buf) \
96 : STRINGBUF_OUTLINE_CHARS (buf))
97
98 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *)SCM_CELL_WORD_1(buf))
99 #define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
100 ? STRINGBUF_INLINE_LENGTH (buf) \
101 : STRINGBUF_OUTLINE_LENGTH (buf))
102
103 #define STRINGBUF_MAX_INLINE_LEN (3*sizeof(scm_t_bits))
104
105 #define SET_STRINGBUF_SHARED(buf) \
106 (SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED))
107
108 #if SCM_STRING_LENGTH_HISTOGRAM
109 static size_t lenhist[1001];
110 #endif
111
112 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
113 characters. */
114 static SCM
115 make_stringbuf (size_t len)
116 {
117 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
118 scm_i_symbol_chars, all stringbufs are null-terminated. Once
119 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
120 has been changed for scm_i_symbol_chars, this null-termination
121 can be dropped.
122 */
123
124 #if SCM_STRING_LENGTH_HISTOGRAM
125 if (len < 1000)
126 lenhist[len]++;
127 else
128 lenhist[1000]++;
129 #endif
130
131 if (len <= STRINGBUF_MAX_INLINE_LEN-1)
132 {
133 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_INLINE | (len << 16),
134 0, 0, 0);
135 }
136 else
137 {
138 char *mem = scm_gc_malloc (len+1, "string");
139 mem[len] = '\0';
140 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) mem,
141 (scm_t_bits) len, (scm_t_bits) 0);
142 }
143 }
144
145 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
146 characters. */
147 static SCM
148 make_wide_stringbuf (size_t len)
149 {
150 scm_t_wchar *mem;
151 #if SCM_STRING_LENGTH_HISTOGRAM
152 if (len < 1000)
153 lenhist[len]++;
154 else
155 lenhist[1000]++;
156 #endif
157
158 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
159 mem[len] = 0;
160 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
161 (scm_t_bits) len, (scm_t_bits) 0);
162 }
163
164 /* Return a new stringbuf whose underlying storage consists of the LEN+1
165 octets pointed to by STR (the last octet is zero). */
166 SCM
167 scm_i_take_stringbufn (char *str, size_t len)
168 {
169 scm_gc_register_collectable_memory (str, len + 1, "stringbuf");
170
171 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) str,
172 (scm_t_bits) len, (scm_t_bits) 0);
173 }
174
175 SCM
176 scm_i_stringbuf_mark (SCM buf)
177 {
178 return SCM_BOOL_F;
179 }
180
181 void
182 scm_i_stringbuf_free (SCM buf)
183 {
184 if (!STRINGBUF_INLINE (buf))
185 {
186 if (!STRINGBUF_WIDE (buf))
187 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
188 STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string");
189 else
190 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
191 sizeof (scm_t_wchar) * (STRINGBUF_OUTLINE_LENGTH (buf)
192 + 1), "string");
193 }
194
195 }
196
197 /* Convert a stringbuf containing 8-bit Latin-1-encoded characters to
198 one containing 32-bit UCS-4-encoded characters. */
199 static void
200 widen_stringbuf (SCM buf)
201 {
202 size_t i, len;
203 scm_t_wchar *mem;
204
205 if (STRINGBUF_WIDE (buf))
206 return;
207
208 if (STRINGBUF_INLINE (buf))
209 {
210 len = STRINGBUF_INLINE_LENGTH (buf);
211
212 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
213 for (i = 0; i < len; i++)
214 mem[i] =
215 (scm_t_wchar) (unsigned char) STRINGBUF_INLINE_CHARS (buf)[i];
216 mem[len] = 0;
217
218 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
219 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
220 SCM_SET_CELL_WORD_1 (buf, mem);
221 SCM_SET_CELL_WORD_2 (buf, len);
222 }
223 else
224 {
225 len = STRINGBUF_OUTLINE_LENGTH (buf);
226
227 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
228 for (i = 0; i < len; i++)
229 mem[i] =
230 (scm_t_wchar) (unsigned char) STRINGBUF_OUTLINE_CHARS (buf)[i];
231 mem[len] = 0;
232
233 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
234
235 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
236 SCM_SET_CELL_WORD_1 (buf, mem);
237 SCM_SET_CELL_WORD_2 (buf, len);
238 }
239 }
240
241 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
242
243 /* Copy-on-write strings.
244 */
245
246 #define STRING_TAG scm_tc7_string
247
248 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
249 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
250 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
251
252 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
253 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
254
255 #define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
256
257 /* Read-only strings.
258 */
259
260 #define RO_STRING_TAG (scm_tc7_string + 0x200)
261 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
262
263 /* Mutation-sharing substrings
264 */
265
266 #define SH_STRING_TAG (scm_tc7_string + 0x100)
267
268 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
269 /* START and LENGTH as for STRINGs. */
270
271 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
272
273 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
274 characters. CHARSP, if not NULL, will be set to location of the
275 char array. */
276 SCM
277 scm_i_make_string (size_t len, char **charsp)
278 {
279 SCM buf = make_stringbuf (len);
280 SCM res;
281 if (charsp)
282 *charsp = STRINGBUF_CHARS (buf);
283 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
284 (scm_t_bits)0, (scm_t_bits) len);
285 return res;
286 }
287
288 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
289 characters. CHARSP, if not NULL, will be set to location of the
290 character array. */
291 SCM
292 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
293 {
294 SCM buf = make_wide_stringbuf (len);
295 SCM res;
296 if (charsp)
297 *charsp = STRINGBUF_WIDE_CHARS (buf);
298 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
299 (scm_t_bits) 0, (scm_t_bits) len);
300 return res;
301 }
302
303 static void
304 validate_substring_args (SCM str, size_t start, size_t end)
305 {
306 if (!IS_STRING (str))
307 scm_wrong_type_arg_msg (NULL, 0, str, "string");
308 if (start > STRING_LENGTH (str))
309 scm_out_of_range (NULL, scm_from_size_t (start));
310 if (end > STRING_LENGTH (str) || end < start)
311 scm_out_of_range (NULL, scm_from_size_t (end));
312 }
313
314 static inline void
315 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
316 {
317 *start = STRING_START (*str);
318 if (IS_SH_STRING (*str))
319 {
320 *str = SH_STRING_STRING (*str);
321 *start += STRING_START (*str);
322 }
323 *buf = STRING_STRINGBUF (*str);
324 }
325
326 SCM
327 scm_i_substring (SCM str, size_t start, size_t end)
328 {
329 SCM buf;
330 size_t str_start;
331 get_str_buf_start (&str, &buf, &str_start);
332 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
333 SET_STRINGBUF_SHARED (buf);
334 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
335 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
336 (scm_t_bits)str_start + start,
337 (scm_t_bits) end - start);
338 }
339
340 SCM
341 scm_i_substring_read_only (SCM str, size_t start, size_t end)
342 {
343 SCM buf;
344 size_t str_start;
345 get_str_buf_start (&str, &buf, &str_start);
346 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
347 SET_STRINGBUF_SHARED (buf);
348 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
349 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
350 (scm_t_bits)str_start + start,
351 (scm_t_bits) end - start);
352 }
353
354 SCM
355 scm_i_substring_copy (SCM str, size_t start, size_t end)
356 {
357 size_t len = end - start;
358 SCM buf, my_buf;
359 size_t str_start;
360 get_str_buf_start (&str, &buf, &str_start);
361 if (scm_i_is_narrow_string (str))
362 {
363 my_buf = make_stringbuf (len);
364 memcpy (STRINGBUF_CHARS (my_buf),
365 STRINGBUF_CHARS (buf) + str_start + start, len);
366 }
367 else
368 {
369 my_buf = make_wide_stringbuf (len);
370 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
371 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
372 + start), len);
373 /* Even though this string is wide, the substring may be narrow.
374 Consider adding code to narrow the string. */
375 }
376 scm_remember_upto_here_1 (buf);
377 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
378 (scm_t_bits) 0, (scm_t_bits) len);
379 }
380
381 SCM
382 scm_i_substring_shared (SCM str, size_t start, size_t end)
383 {
384 if (start == 0 && end == STRING_LENGTH (str))
385 return str;
386 else
387 {
388 size_t len = end - start;
389 if (IS_SH_STRING (str))
390 {
391 start += STRING_START (str);
392 str = SH_STRING_STRING (str);
393 }
394 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
395 (scm_t_bits)start, (scm_t_bits) len);
396 }
397 }
398
399 SCM
400 scm_c_substring (SCM str, size_t start, size_t end)
401 {
402 validate_substring_args (str, start, end);
403 return scm_i_substring (str, start, end);
404 }
405
406 SCM
407 scm_c_substring_read_only (SCM str, size_t start, size_t end)
408 {
409 validate_substring_args (str, start, end);
410 return scm_i_substring_read_only (str, start, end);
411 }
412
413 SCM
414 scm_c_substring_copy (SCM str, size_t start, size_t end)
415 {
416 validate_substring_args (str, start, end);
417 return scm_i_substring_copy (str, start, end);
418 }
419
420 SCM
421 scm_c_substring_shared (SCM str, size_t start, size_t end)
422 {
423 validate_substring_args (str, start, end);
424 return scm_i_substring_shared (str, start, end);
425 }
426
427 SCM
428 scm_i_string_mark (SCM str)
429 {
430 if (IS_SH_STRING (str))
431 return SH_STRING_STRING (str);
432 else
433 return STRING_STRINGBUF (str);
434 }
435
436 void
437 scm_i_string_free (SCM str)
438 {
439 }
440
441 /* Internal accessors
442 */
443
444 /* Returns the number of characters in STR. This may be different
445 than the memory size of the string storage. */
446 size_t
447 scm_i_string_length (SCM str)
448 {
449 return STRING_LENGTH (str);
450 }
451
452 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
453 encoding. False if it is 'wide', having a 32-bit UCS-4
454 encoding. */
455 int
456 scm_i_is_narrow_string (SCM str)
457 {
458 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
459 }
460
461 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
462 STR. */
463 const char *
464 scm_i_string_chars (SCM str)
465 {
466 SCM buf;
467 size_t start;
468 get_str_buf_start (&str, &buf, &start);
469 if (scm_i_is_narrow_string (str))
470 return STRINGBUF_CHARS (buf) + start;
471 else
472 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
473 scm_list_1 (str));
474 return NULL;
475 }
476
477 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
478 STR. */
479 const scm_t_wchar *
480 scm_i_string_wide_chars (SCM str)
481 {
482 SCM buf;
483 size_t start;
484
485 get_str_buf_start (&str, &buf, &start);
486 if (!scm_i_is_narrow_string (str))
487 return STRINGBUF_WIDE_CHARS (buf) + start;
488 else
489 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
490 scm_list_1 (str));
491 }
492
493 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
494 a new string buffer, so that it can be modified without modifying
495 other strings. Also, lock the string mutex. Later, one must call
496 scm_i_string_stop_writing to unlock the mutex. */
497 SCM
498 scm_i_string_start_writing (SCM orig_str)
499 {
500 SCM buf, str = orig_str;
501 size_t start;
502
503 get_str_buf_start (&str, &buf, &start);
504 if (IS_RO_STRING (str))
505 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
506
507 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
508 if (STRINGBUF_SHARED (buf))
509 {
510 /* Clone the stringbuf. */
511 size_t len = STRING_LENGTH (str);
512 SCM new_buf;
513
514 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
515
516 if (scm_i_is_narrow_string (str))
517 {
518 new_buf = make_stringbuf (len);
519 memcpy (STRINGBUF_CHARS (new_buf),
520 STRINGBUF_CHARS (buf) + STRING_START (str), len);
521
522 }
523 else
524 {
525 new_buf = make_wide_stringbuf (len);
526 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
527 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
528 + STRING_START (str)), len);
529 }
530 scm_i_thread_put_to_sleep ();
531 SET_STRING_STRINGBUF (str, new_buf);
532 start -= STRING_START (str);
533 SET_STRING_START (str, 0);
534 scm_i_thread_wake_up ();
535
536 buf = new_buf;
537
538 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
539 }
540 return orig_str;
541 }
542
543 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
544 char *
545 scm_i_string_writable_chars (SCM str)
546 {
547 SCM buf;
548 size_t start;
549
550 get_str_buf_start (&str, &buf, &start);
551 if (scm_i_is_narrow_string (str))
552 return STRINGBUF_CHARS (buf) + start;
553 else
554 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
555 scm_list_1 (str));
556 return NULL;
557 }
558
559 /* Return a pointer to the UCS-4 codepoints of a string. */
560 static scm_t_wchar *
561 scm_i_string_writable_wide_chars (SCM str)
562 {
563 SCM buf;
564 size_t start;
565
566 get_str_buf_start (&str, &buf, &start);
567 if (!scm_i_is_narrow_string (str))
568 return STRINGBUF_WIDE_CHARS (buf) + start;
569 else
570 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
571 scm_list_1 (str));
572 }
573
574 /* Unlock the string mutex that was locked when
575 scm_i_string_start_writing was called. */
576 void
577 scm_i_string_stop_writing (void)
578 {
579 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
580 }
581
582 /* Return the Xth character of STR as a UCS-4 codepoint. */
583 scm_t_wchar
584 scm_i_string_ref (SCM str, size_t x)
585 {
586 if (scm_i_is_narrow_string (str))
587 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
588 else
589 return scm_i_string_wide_chars (str)[x];
590 }
591
592 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
593 void
594 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
595 {
596 if (chr > 0xFF && scm_i_is_narrow_string (str))
597 widen_stringbuf (STRING_STRINGBUF (str));
598
599 if (scm_i_is_narrow_string (str))
600 {
601 char *dst = scm_i_string_writable_chars (str);
602 dst[p] = (char) (unsigned char) chr;
603 }
604 else
605 {
606 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
607 dst[p] = chr;
608 }
609 }
610
611 /* Symbols.
612
613 Basic symbol creation and accessing is done here, the rest is in
614 symbols.[hc]. This has been done to keep stringbufs and the
615 internals of strings and string-like objects confined to this file.
616 */
617
618 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
619
620 SCM
621 scm_i_make_symbol (SCM name, scm_t_bits flags,
622 unsigned long hash, SCM props)
623 {
624 SCM buf;
625 size_t start = STRING_START (name);
626 size_t length = STRING_LENGTH (name);
627
628 if (IS_SH_STRING (name))
629 {
630 name = SH_STRING_STRING (name);
631 start += STRING_START (name);
632 }
633 buf = SYMBOL_STRINGBUF (name);
634
635 if (start == 0 && length == STRINGBUF_LENGTH (buf))
636 {
637 /* reuse buf. */
638 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
639 SET_STRINGBUF_SHARED (buf);
640 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
641 }
642 else
643 {
644 /* make new buf. */
645 if (scm_i_is_narrow_string (name))
646 {
647 SCM new_buf = make_stringbuf (length);
648 memcpy (STRINGBUF_CHARS (new_buf),
649 STRINGBUF_CHARS (buf) + start, length);
650 buf = new_buf;
651 }
652 else
653 {
654 SCM new_buf = make_wide_stringbuf (length);
655 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
656 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
657 length);
658 buf = new_buf;
659 }
660 }
661 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
662 (scm_t_bits) hash, SCM_UNPACK (props));
663 }
664
665 SCM
666 scm_i_c_make_symbol (const char *name, size_t len,
667 scm_t_bits flags, unsigned long hash, SCM props)
668 {
669 SCM buf = make_stringbuf (len);
670 memcpy (STRINGBUF_CHARS (buf), name, len);
671
672 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
673 (scm_t_bits) hash, SCM_UNPACK (props));
674 }
675
676 /* Return a new symbol that uses the LEN bytes pointed to by NAME as its
677 underlying storage. */
678 SCM
679 scm_i_c_take_symbol (char *name, size_t len,
680 scm_t_bits flags, unsigned long hash, SCM props)
681 {
682 SCM buf = scm_i_take_stringbufn (name, len);
683
684 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
685 (scm_t_bits) hash, SCM_UNPACK (props));
686 }
687
688 /* Returns the number of characters in SYM. This may be different
689 from the memory size of SYM. */
690 size_t
691 scm_i_symbol_length (SCM sym)
692 {
693 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
694 }
695
696 size_t
697 scm_c_symbol_length (SCM sym)
698 #define FUNC_NAME "scm_c_symbol_length"
699 {
700 SCM_VALIDATE_SYMBOL (1, sym);
701
702 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
703 }
704 #undef FUNC_NAME
705
706 /* True if the name of SYM is stored as a Latin-1 encoded string.
707 False if it is stored as a 32-bit UCS-4-encoded string. */
708 int
709 scm_i_is_narrow_symbol (SCM sym)
710 {
711 SCM buf;
712
713 buf = SYMBOL_STRINGBUF (sym);
714 return !STRINGBUF_WIDE (buf);
715 }
716
717 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
718 contains the name of SYM. */
719 const char *
720 scm_i_symbol_chars (SCM sym)
721 {
722 SCM buf;
723
724 buf = SYMBOL_STRINGBUF (sym);
725 if (!STRINGBUF_WIDE (buf))
726 return STRINGBUF_CHARS (buf);
727 else
728 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
729 scm_list_1 (sym));
730 }
731
732 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
733 symbol's name. */
734 const scm_t_wchar *
735 scm_i_symbol_wide_chars (SCM sym)
736 {
737 SCM buf;
738
739 buf = SYMBOL_STRINGBUF (sym);
740 if (STRINGBUF_WIDE (buf))
741 return STRINGBUF_WIDE_CHARS (buf);
742 else
743 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
744 scm_list_1 (sym));
745 }
746
747 SCM
748 scm_i_symbol_mark (SCM sym)
749 {
750 scm_gc_mark (SYMBOL_STRINGBUF (sym));
751 return SCM_CELL_OBJECT_3 (sym);
752 }
753
754 void
755 scm_i_symbol_free (SCM sym)
756 {
757 }
758
759 SCM
760 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
761 {
762 SCM buf = SYMBOL_STRINGBUF (sym);
763 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
764 SET_STRINGBUF_SHARED (buf);
765 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
766 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
767 (scm_t_bits)start, (scm_t_bits) end - start);
768 }
769
770 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
771 scm_t_wchar
772 scm_i_symbol_ref (SCM sym, size_t x)
773 {
774 if (scm_i_is_narrow_symbol (sym))
775 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
776 else
777 return scm_i_symbol_wide_chars (sym)[x];
778 }
779
780 /* Debugging
781 */
782
783 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
784 "Returns an association list containing debugging information\n"
785 "for @var{str}. The association list has the following entries."
786 "@table @code\n"
787 "@item string\n"
788 "The string itself.\n"
789 "@item start\n"
790 "The start index of the string into its stringbuf\n"
791 "@item length\n"
792 "The length of the string\n"
793 "@item shared\n"
794 "If this string is a substring, it returns its parent string.\n"
795 "Otherwise, it returns @code{#f}\n"
796 "@item stringbuf\n"
797 "The string buffer that contains this string's characters\n"
798 "@item stringbuf-chars\n"
799 "A new string containing this string's stringbuf's characters\n"
800 "@item stringbuf-length\n"
801 "The number of characters in this stringbuf\n"
802 "@item stringbuf-shared\n"
803 "@code{#t} if this stringbuf is shared\n"
804 "@item stringbuf-inline\n"
805 "@code{#t} if this stringbuf's characters are stored in the\n"
806 "cell itself, or @code{#f} if they were allocated in memory\n"
807 "@item stringbuf-wide\n"
808 "@code{#t} if this stringbuf's characters are stored in a\n"
809 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
810 "buffer\n"
811 "@end table")
812 #define FUNC_NAME s_scm_sys_string_dump
813 {
814 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9, e10;
815 SCM buf;
816 SCM_VALIDATE_STRING (1, str);
817
818 /* String info */
819 e1 = scm_cons (scm_from_locale_symbol ("string"),
820 str);
821 e2 = scm_cons (scm_from_locale_symbol ("start"),
822 scm_from_size_t (STRING_START (str)));
823 e3 = scm_cons (scm_from_locale_symbol ("length"),
824 scm_from_size_t (STRING_LENGTH (str)));
825
826 if (IS_SH_STRING (str))
827 {
828 e4 = scm_cons (scm_from_locale_symbol ("shared"),
829 SH_STRING_STRING (str));
830 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
831 }
832 else
833 {
834 e4 = scm_cons (scm_from_locale_symbol ("shared"),
835 SCM_BOOL_F);
836 buf = STRING_STRINGBUF (str);
837 }
838
839 /* Stringbuf info */
840 e5 = scm_cons (scm_from_locale_symbol ("stringbuf"),
841 buf);
842
843 if (!STRINGBUF_WIDE (buf))
844 {
845 size_t len = STRINGBUF_LENGTH (buf);
846 char *cbuf;
847 SCM sbc = scm_i_make_string (len, &cbuf);
848 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
849 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
850 sbc);
851 }
852 else
853 {
854 size_t len = STRINGBUF_LENGTH (buf);
855 scm_t_wchar *cbuf;
856 SCM sbc = scm_i_make_wide_string (len, &cbuf);
857 u32_cpy ((scm_t_uint32 *) cbuf,
858 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
859 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
860 sbc);
861 }
862 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
863 scm_from_size_t (STRINGBUF_LENGTH (buf)));
864 if (STRINGBUF_SHARED (buf))
865 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
866 SCM_BOOL_T);
867 else
868 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
869 SCM_BOOL_F);
870 if (STRINGBUF_INLINE (buf))
871 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
872 SCM_BOOL_T);
873 else
874 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
875 SCM_BOOL_F);
876 if (STRINGBUF_WIDE (buf))
877 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
878 SCM_BOOL_T);
879 else
880 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
881 SCM_BOOL_F);
882
883 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, SCM_UNDEFINED);
884 }
885 #undef FUNC_NAME
886
887 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
888 "Returns an association list containing debugging information\n"
889 "for @var{sym}. The association list has the following entries."
890 "@table @code\n"
891 "@item symbol\n"
892 "The symbol itself\n"
893 "@item hash\n"
894 "Its hash value\n"
895 "@item stringbuf\n"
896 "The string buffer that contains this symbol's characters\n"
897 "@item stringbuf-chars\n"
898 "A new string containing this symbols's stringbuf's characters\n"
899 "@item stringbuf-length\n"
900 "The number of characters in this stringbuf\n"
901 "@item stringbuf-shared\n"
902 "@code{#t} if this stringbuf is shared\n"
903 "@item stringbuf-inline\n"
904 "@code{#t} if this stringbuf's characters are stored in the\n"
905 "cell itself, or @code{#f} if they were allocated in memory\n"
906 "@item stringbuf-wide\n"
907 "@code{#t} if this stringbuf's characters are stored in a\n"
908 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
909 "buffer\n"
910 "@end table")
911 #define FUNC_NAME s_scm_sys_symbol_dump
912 {
913 SCM e1, e2, e3, e4, e5, e6, e7, e8;
914 SCM buf;
915 SCM_VALIDATE_SYMBOL (1, sym);
916 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
917 sym);
918 e2 = scm_cons (scm_from_locale_symbol ("hash"),
919 scm_from_ulong (scm_i_symbol_hash (sym)));
920
921 buf = SYMBOL_STRINGBUF (sym);
922
923 /* Stringbuf info */
924 e3 = scm_cons (scm_from_locale_symbol ("stringbuf"),
925 buf);
926
927 if (!STRINGBUF_WIDE (buf))
928 {
929 size_t len = STRINGBUF_LENGTH (buf);
930 char *cbuf;
931 SCM sbc = scm_i_make_string (len, &cbuf);
932 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
933 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
934 sbc);
935 }
936 else
937 {
938 size_t len = STRINGBUF_LENGTH (buf);
939 scm_t_wchar *cbuf;
940 SCM sbc = scm_i_make_wide_string (len, &cbuf);
941 u32_cpy ((scm_t_uint32 *) cbuf,
942 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
943 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
944 sbc);
945 }
946 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
947 scm_from_size_t (STRINGBUF_LENGTH (buf)));
948 if (STRINGBUF_SHARED (buf))
949 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
950 SCM_BOOL_T);
951 else
952 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
953 SCM_BOOL_F);
954 if (STRINGBUF_INLINE (buf))
955 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
956 SCM_BOOL_T);
957 else
958 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
959 SCM_BOOL_F);
960 if (STRINGBUF_WIDE (buf))
961 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
962 SCM_BOOL_T);
963 else
964 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
965 SCM_BOOL_F);
966 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, SCM_UNDEFINED);
967
968 }
969 #undef FUNC_NAME
970
971 #if SCM_STRING_LENGTH_HISTOGRAM
972
973 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
974 #define FUNC_NAME s_scm_sys_stringbuf_hist
975 {
976 int i;
977 for (i = 0; i < 1000; i++)
978 if (lenhist[i])
979 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
980 fprintf (stderr, ">999: %u\n", lenhist[1000]);
981 return SCM_UNSPECIFIED;
982 }
983 #undef FUNC_NAME
984
985 #endif
986
987 \f
988
989 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
990 (SCM obj),
991 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
992 #define FUNC_NAME s_scm_string_p
993 {
994 return scm_from_bool (IS_STRING (obj));
995 }
996 #undef FUNC_NAME
997
998
999 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1000
1001 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1002 (SCM chrs),
1003 "@deffnx {Scheme Procedure} list->string chrs\n"
1004 "Return a newly allocated string composed of the arguments,\n"
1005 "@var{chrs}.")
1006 #define FUNC_NAME s_scm_string
1007 {
1008 SCM result;
1009 SCM rest;
1010 size_t len;
1011 size_t p = 0;
1012 long i;
1013
1014 /* Verify that this is a list of chars. */
1015 i = scm_ilength (chrs);
1016 len = (size_t) i;
1017 rest = chrs;
1018
1019 SCM_ASSERT (len >= 0, chrs, SCM_ARG1, FUNC_NAME);
1020 while (len > 0 && scm_is_pair (rest))
1021 {
1022 SCM elt = SCM_CAR (rest);
1023 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1024 rest = SCM_CDR (rest);
1025 len--;
1026 scm_remember_upto_here_1 (elt);
1027 }
1028
1029 /* Construct a string containing this list of chars. */
1030 len = (size_t) i;
1031 rest = chrs;
1032
1033 result = scm_i_make_string (len, NULL);
1034 result = scm_i_string_start_writing (result);
1035 while (len > 0 && scm_is_pair (rest))
1036 {
1037 SCM elt = SCM_CAR (rest);
1038 scm_i_string_set_x (result, p, SCM_CHAR (elt));
1039 p++;
1040 rest = SCM_CDR (rest);
1041 len--;
1042 scm_remember_upto_here_1 (elt);
1043 }
1044 scm_i_string_stop_writing ();
1045
1046 if (len > 0)
1047 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1048 if (!scm_is_null (rest))
1049 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1050
1051 return result;
1052 }
1053 #undef FUNC_NAME
1054
1055 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1056 (SCM k, SCM chr),
1057 "Return a newly allocated string of\n"
1058 "length @var{k}. If @var{chr} is given, then all elements of\n"
1059 "the string are initialized to @var{chr}, otherwise the contents\n"
1060 "of the @var{string} are unspecified.")
1061 #define FUNC_NAME s_scm_make_string
1062 {
1063 return scm_c_make_string (scm_to_size_t (k), chr);
1064 }
1065 #undef FUNC_NAME
1066
1067 SCM
1068 scm_c_make_string (size_t len, SCM chr)
1069 #define FUNC_NAME NULL
1070 {
1071 size_t p;
1072 SCM res = scm_i_make_string (len, NULL);
1073
1074 if (!SCM_UNBNDP (chr))
1075 {
1076 SCM_VALIDATE_CHAR (0, chr);
1077 res = scm_i_string_start_writing (res);
1078 for (p = 0; p < len; p++)
1079 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1080 scm_i_string_stop_writing ();
1081 }
1082
1083 return res;
1084 }
1085 #undef FUNC_NAME
1086
1087 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1088 (SCM string),
1089 "Return the number of characters in @var{string}.")
1090 #define FUNC_NAME s_scm_string_length
1091 {
1092 SCM_VALIDATE_STRING (1, string);
1093 return scm_from_size_t (STRING_LENGTH (string));
1094 }
1095 #undef FUNC_NAME
1096
1097 SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0,
1098 (SCM string),
1099 "Return the bytes used to represent a character in @var{string}."
1100 "This will return 1 or 4.")
1101 #define FUNC_NAME s_scm_string_width
1102 {
1103 SCM_VALIDATE_STRING (1, string);
1104 if (!scm_i_is_narrow_string (string))
1105 return scm_from_int (4);
1106
1107 return scm_from_int (1);
1108 }
1109 #undef FUNC_NAME
1110
1111 size_t
1112 scm_c_string_length (SCM string)
1113 {
1114 if (!IS_STRING (string))
1115 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1116 return STRING_LENGTH (string);
1117 }
1118
1119 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1120 (SCM str, SCM k),
1121 "Return character @var{k} of @var{str} using zero-origin\n"
1122 "indexing. @var{k} must be a valid index of @var{str}.")
1123 #define FUNC_NAME s_scm_string_ref
1124 {
1125 size_t len;
1126 unsigned long idx;
1127
1128 SCM_VALIDATE_STRING (1, str);
1129
1130 len = scm_i_string_length (str);
1131 if (SCM_LIKELY (len > 0))
1132 idx = scm_to_unsigned_integer (k, 0, len - 1);
1133 else
1134 scm_out_of_range (NULL, k);
1135
1136 if (scm_i_is_narrow_string (str))
1137 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1138 else
1139 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1140 }
1141 #undef FUNC_NAME
1142
1143 SCM
1144 scm_c_string_ref (SCM str, size_t p)
1145 {
1146 if (p >= scm_i_string_length (str))
1147 scm_out_of_range (NULL, scm_from_size_t (p));
1148 if (scm_i_is_narrow_string (str))
1149 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1150 else
1151 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1152
1153 }
1154
1155 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1156 (SCM str, SCM k, SCM chr),
1157 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1158 "an unspecified value. @var{k} must be a valid index of\n"
1159 "@var{str}.")
1160 #define FUNC_NAME s_scm_string_set_x
1161 {
1162 size_t len;
1163 unsigned long idx;
1164
1165 SCM_VALIDATE_STRING (1, str);
1166
1167 len = scm_i_string_length (str);
1168 if (SCM_LIKELY (len > 0))
1169 idx = scm_to_unsigned_integer (k, 0, len - 1);
1170 else
1171 scm_out_of_range (NULL, k);
1172
1173 SCM_VALIDATE_CHAR (3, chr);
1174 str = scm_i_string_start_writing (str);
1175 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1176 scm_i_string_stop_writing ();
1177
1178 return SCM_UNSPECIFIED;
1179 }
1180 #undef FUNC_NAME
1181
1182 void
1183 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1184 {
1185 if (p >= scm_i_string_length (str))
1186 scm_out_of_range (NULL, scm_from_size_t (p));
1187 str = scm_i_string_start_writing (str);
1188 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1189 scm_i_string_stop_writing ();
1190 }
1191
1192 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1193 (SCM str, SCM start, SCM end),
1194 "Return a newly allocated string formed from the characters\n"
1195 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1196 "ending with index @var{end} (exclusive).\n"
1197 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1198 "exact integers satisfying:\n\n"
1199 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1200 #define FUNC_NAME s_scm_substring
1201 {
1202 size_t len, from, to;
1203
1204 SCM_VALIDATE_STRING (1, str);
1205 len = scm_i_string_length (str);
1206 from = scm_to_unsigned_integer (start, 0, len);
1207 if (SCM_UNBNDP (end))
1208 to = len;
1209 else
1210 to = scm_to_unsigned_integer (end, from, len);
1211 return scm_i_substring (str, from, to);
1212 }
1213 #undef FUNC_NAME
1214
1215 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1216 (SCM str, SCM start, SCM end),
1217 "Return a newly allocated string formed from the characters\n"
1218 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1219 "ending with index @var{end} (exclusive).\n"
1220 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1221 "exact integers satisfying:\n"
1222 "\n"
1223 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1224 "\n"
1225 "The returned string is read-only.\n")
1226 #define FUNC_NAME s_scm_substring_read_only
1227 {
1228 size_t len, from, to;
1229
1230 SCM_VALIDATE_STRING (1, str);
1231 len = scm_i_string_length (str);
1232 from = scm_to_unsigned_integer (start, 0, len);
1233 if (SCM_UNBNDP (end))
1234 to = len;
1235 else
1236 to = scm_to_unsigned_integer (end, from, len);
1237 return scm_i_substring_read_only (str, from, to);
1238 }
1239 #undef FUNC_NAME
1240
1241 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1242 (SCM str, SCM start, SCM end),
1243 "Return a newly allocated string formed from the characters\n"
1244 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1245 "ending with index @var{end} (exclusive).\n"
1246 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1247 "exact integers satisfying:\n\n"
1248 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1249 #define FUNC_NAME s_scm_substring_copy
1250 {
1251 /* For the Scheme version, START is mandatory, but for the C
1252 version, it is optional. See scm_string_copy in srfi-13.c for a
1253 rationale.
1254 */
1255
1256 size_t from, to;
1257
1258 SCM_VALIDATE_STRING (1, str);
1259 scm_i_get_substring_spec (scm_i_string_length (str),
1260 start, &from, end, &to);
1261 return scm_i_substring_copy (str, from, to);
1262 }
1263 #undef FUNC_NAME
1264
1265 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1266 (SCM str, SCM start, SCM end),
1267 "Return string that indirectly refers to the characters\n"
1268 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1269 "ending with index @var{end} (exclusive).\n"
1270 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1271 "exact integers satisfying:\n\n"
1272 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1273 #define FUNC_NAME s_scm_substring_shared
1274 {
1275 size_t len, from, to;
1276
1277 SCM_VALIDATE_STRING (1, str);
1278 len = scm_i_string_length (str);
1279 from = scm_to_unsigned_integer (start, 0, len);
1280 if (SCM_UNBNDP (end))
1281 to = len;
1282 else
1283 to = scm_to_unsigned_integer (end, from, len);
1284 return scm_i_substring_shared (str, from, to);
1285 }
1286 #undef FUNC_NAME
1287
1288 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1289 (SCM args),
1290 "Return a newly allocated string whose characters form the\n"
1291 "concatenation of the given strings, @var{args}.")
1292 #define FUNC_NAME s_scm_string_append
1293 {
1294 SCM res;
1295 size_t len = 0;
1296 int wide = 0;
1297 SCM l, s;
1298 char *data;
1299 scm_t_wchar *wdata;
1300 int i;
1301
1302 SCM_VALIDATE_REST_ARGUMENT (args);
1303 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1304 {
1305 s = SCM_CAR (l);
1306 SCM_VALIDATE_STRING (SCM_ARGn, s);
1307 len += scm_i_string_length (s);
1308 if (!scm_i_is_narrow_string (s))
1309 wide = 1;
1310 }
1311 if (!wide)
1312 res = scm_i_make_string (len, &data);
1313 else
1314 res = scm_i_make_wide_string (len, &wdata);
1315
1316 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1317 {
1318 size_t len;
1319 s = SCM_CAR (l);
1320 SCM_VALIDATE_STRING (SCM_ARGn, s);
1321 len = scm_i_string_length (s);
1322 if (!wide)
1323 {
1324 memcpy (data, scm_i_string_chars (s), len);
1325 data += len;
1326 }
1327 else
1328 {
1329 if (scm_i_is_narrow_string (s))
1330 {
1331 for (i = 0; i < scm_i_string_length (s); i++)
1332 wdata[i] = (unsigned char) scm_i_string_chars (s)[i];
1333 }
1334 else
1335 u32_cpy ((scm_t_uint32 *) wdata,
1336 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1337 wdata += len;
1338 }
1339 scm_remember_upto_here_1 (s);
1340 }
1341 return res;
1342 }
1343 #undef FUNC_NAME
1344
1345 int
1346 scm_is_string (SCM obj)
1347 {
1348 return IS_STRING (obj);
1349 }
1350
1351 SCM
1352 scm_from_locale_stringn (const char *str, size_t len)
1353 {
1354 SCM res;
1355 char *dst;
1356
1357 if (len == (size_t) -1)
1358 len = strlen (str);
1359 if (len == 0)
1360 return scm_nullstr;
1361
1362 res = scm_i_make_string (len, &dst);
1363 memcpy (dst, str, len);
1364 return res;
1365 }
1366
1367 SCM
1368 scm_from_locale_string (const char *str)
1369 {
1370 if (str == NULL)
1371 return scm_nullstr;
1372
1373 return scm_from_locale_stringn (str, -1);
1374 }
1375
1376 /* Create a new scheme string from the C string STR. The memory of
1377 STR may be used directly as storage for the new string. */
1378 SCM
1379 scm_take_locale_stringn (char *str, size_t len)
1380 {
1381 SCM buf, res;
1382
1383 if (len == (size_t) -1)
1384 len = strlen (str);
1385 else
1386 {
1387 /* Ensure STR is null terminated. A realloc for 1 extra byte should
1388 often be satisfied from the alignment padding after the block, with
1389 no actual data movement. */
1390 str = scm_realloc (str, len + 1);
1391 str[len] = '\0';
1392 }
1393
1394 buf = scm_i_take_stringbufn (str, len);
1395 res = scm_double_cell (STRING_TAG,
1396 SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
1397 return res;
1398 }
1399
1400 SCM
1401 scm_take_locale_string (char *str)
1402 {
1403 return scm_take_locale_stringn (str, -1);
1404 }
1405
1406 /* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1407 and \UXXXXXX. */
1408 static void
1409 unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1410 {
1411 char *before, *after;
1412 size_t i, j;
1413
1414 before = *bufp;
1415 after = *bufp;
1416 i = 0;
1417 j = 0;
1418 while (i < *lenp)
1419 {
1420 if ((i <= *lenp - 6)
1421 && before[i] == '\\'
1422 && before[i + 1] == 'u'
1423 && before[i + 2] == '0' && before[i + 3] == '0')
1424 {
1425 /* Convert \u00NN to \xNN */
1426 after[j] = '\\';
1427 after[j + 1] = 'x';
1428 after[j + 2] = tolower (before[i + 4]);
1429 after[j + 3] = tolower (before[i + 5]);
1430 i += 6;
1431 j += 4;
1432 }
1433 else if ((i <= *lenp - 10)
1434 && before[i] == '\\'
1435 && before[i + 1] == 'U'
1436 && before[i + 2] == '0' && before[i + 3] == '0')
1437 {
1438 /* Convert \U00NNNNNN to \UNNNNNN */
1439 after[j] = '\\';
1440 after[j + 1] = 'U';
1441 after[j + 2] = tolower (before[i + 4]);
1442 after[j + 3] = tolower (before[i + 5]);
1443 after[j + 4] = tolower (before[i + 6]);
1444 after[j + 5] = tolower (before[i + 7]);
1445 after[j + 6] = tolower (before[i + 8]);
1446 after[j + 7] = tolower (before[i + 9]);
1447 i += 10;
1448 j += 8;
1449 }
1450 else
1451 {
1452 after[j] = before[i];
1453 i++;
1454 j++;
1455 }
1456 }
1457 *lenp = j;
1458 after = scm_realloc (after, j);
1459 }
1460
1461 char *
1462 scm_to_locale_stringn (SCM str, size_t * lenp)
1463 {
1464 const char *enc;
1465
1466 /* In the future, enc will hold the port's encoding. */
1467 enc = NULL;
1468
1469 return scm_to_stringn (str, lenp, enc, iconveh_escape_sequence);
1470 }
1471
1472 /* Low-level scheme to C string conversion function. */
1473 char *
1474 scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
1475 enum iconv_ilseq_handler handler)
1476 {
1477 static const char iso[11] = "ISO-8859-1";
1478 char *buf;
1479 size_t ilen, len, i;
1480
1481 if (!scm_is_string (str))
1482 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1483 ilen = scm_i_string_length (str);
1484
1485 if (ilen == 0)
1486 {
1487 buf = scm_malloc (1);
1488 buf[0] = '\0';
1489 if (lenp)
1490 *lenp = 0;
1491 return buf;
1492 }
1493
1494 if (lenp == NULL)
1495 for (i = 0; i < ilen; i++)
1496 if (scm_i_string_ref (str, i) == '\0')
1497 scm_misc_error (NULL,
1498 "string contains #\\nul character: ~S",
1499 scm_list_1 (str));
1500
1501 if (scm_i_is_narrow_string (str))
1502 {
1503 if (lenp)
1504 {
1505 buf = scm_malloc (ilen);
1506 memcpy (buf, scm_i_string_chars (str), ilen);
1507 *lenp = ilen;
1508 return buf;
1509 }
1510 else
1511 {
1512 buf = scm_malloc (ilen + 1);
1513 memcpy (buf, scm_i_string_chars (str), ilen);
1514 buf[ilen] = '\0';
1515 return buf;
1516 }
1517 }
1518
1519
1520 buf = NULL;
1521 len = 0;
1522 buf = u32_conv_to_encoding (iso,
1523 handler,
1524 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1525 ilen, NULL, NULL, &len);
1526 if (buf == NULL)
1527 scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1528 scm_list_2 (scm_from_locale_string (iso), str));
1529
1530 if (handler == iconveh_escape_sequence)
1531 unistring_escapes_to_guile_escapes (&buf, &len);
1532
1533 if (lenp)
1534 *lenp = len;
1535 else
1536 {
1537 buf = scm_realloc (buf, len + 1);
1538 buf[len] = '\0';
1539 }
1540
1541 scm_remember_upto_here_1 (str);
1542 return buf;
1543 }
1544
1545 char *
1546 scm_to_locale_string (SCM str)
1547 {
1548 return scm_to_locale_stringn (str, NULL);
1549 }
1550
1551 size_t
1552 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1553 {
1554 size_t len;
1555 char *result = NULL;
1556 if (!scm_is_string (str))
1557 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1558 result = scm_to_locale_stringn (str, &len);
1559
1560 memcpy (buf, result, (len > max_len) ? max_len : len);
1561 free (result);
1562
1563 scm_remember_upto_here_1 (str);
1564 return len;
1565 }
1566
1567 /* converts C scm_array of strings to SCM scm_list of strings. */
1568 /* If argc < 0, a null terminated scm_array is assumed. */
1569 SCM
1570 scm_makfromstrs (int argc, char **argv)
1571 {
1572 int i = argc;
1573 SCM lst = SCM_EOL;
1574 if (0 > i)
1575 for (i = 0; argv[i]; i++);
1576 while (i--)
1577 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1578 return lst;
1579 }
1580
1581 /* Return a newly allocated array of char pointers to each of the strings
1582 in args, with a terminating NULL pointer. */
1583
1584 char **
1585 scm_i_allocate_string_pointers (SCM list)
1586 {
1587 char **result;
1588 int len = scm_ilength (list);
1589 int i;
1590
1591 if (len < 0)
1592 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1593
1594 scm_dynwind_begin (0);
1595
1596 result = (char **) scm_malloc ((len + 1) * sizeof (char *));
1597 result[len] = NULL;
1598 scm_dynwind_unwind_handler (free, result, 0);
1599
1600 /* The list might be have been modified in another thread, so
1601 we check LIST before each access.
1602 */
1603 for (i = 0; i < len && scm_is_pair (list); i++)
1604 {
1605 result[i] = scm_to_locale_string (SCM_CAR (list));
1606 list = SCM_CDR (list);
1607 }
1608
1609 scm_dynwind_end ();
1610 return result;
1611 }
1612
1613 void
1614 scm_i_free_string_pointers (char **pointers)
1615 {
1616 int i;
1617
1618 for (i = 0; pointers[i]; i++)
1619 free (pointers[i]);
1620 free (pointers);
1621 }
1622
1623 void
1624 scm_i_get_substring_spec (size_t len,
1625 SCM start, size_t *cstart,
1626 SCM end, size_t *cend)
1627 {
1628 if (SCM_UNBNDP (start))
1629 *cstart = 0;
1630 else
1631 *cstart = scm_to_unsigned_integer (start, 0, len);
1632
1633 if (SCM_UNBNDP (end))
1634 *cend = len;
1635 else
1636 *cend = scm_to_unsigned_integer (end, *cstart, len);
1637 }
1638
1639 #if SCM_ENABLE_DEPRECATED
1640
1641 /* When these definitions are removed, it becomes reasonable to use
1642 read-only strings for string literals. For that, change the reader
1643 to create string literals with scm_c_substring_read_only instead of
1644 with scm_c_substring_copy.
1645 */
1646
1647 int
1648 scm_i_deprecated_stringp (SCM str)
1649 {
1650 scm_c_issue_deprecation_warning
1651 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
1652
1653 return scm_is_string (str);
1654 }
1655
1656 char *
1657 scm_i_deprecated_string_chars (SCM str)
1658 {
1659 char *chars;
1660
1661 scm_c_issue_deprecation_warning
1662 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
1663
1664 /* We don't accept shared substrings here since they are not
1665 null-terminated.
1666 */
1667 if (IS_SH_STRING (str))
1668 scm_misc_error (NULL,
1669 "SCM_STRING_CHARS does not work with shared substrings.",
1670 SCM_EOL);
1671
1672 /* We explicitly test for read-only strings to produce a better
1673 error message.
1674 */
1675
1676 if (IS_RO_STRING (str))
1677 scm_misc_error (NULL,
1678 "SCM_STRING_CHARS does not work with read-only strings.",
1679 SCM_EOL);
1680
1681 /* The following is still wrong, of course...
1682 */
1683 str = scm_i_string_start_writing (str);
1684 chars = scm_i_string_writable_chars (str);
1685 scm_i_string_stop_writing ();
1686 return chars;
1687 }
1688
1689 size_t
1690 scm_i_deprecated_string_length (SCM str)
1691 {
1692 scm_c_issue_deprecation_warning
1693 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
1694 return scm_c_string_length (str);
1695 }
1696
1697 #endif
1698
1699 void
1700 scm_init_strings ()
1701 {
1702 scm_nullstr = scm_i_make_string (0, NULL);
1703
1704 #include "libguile/strings.x"
1705 }
1706
1707
1708 /*
1709 Local Variables:
1710 c-file-style: "gnu"
1711 End:
1712 */