Remove the `scm_tc_free_cell' SMOB type.
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <string.h>
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <unistr.h>
29 #include <uniconv.h>
30
31 #include "libguile/_scm.h"
32 #include "libguile/chars.h"
33 #include "libguile/root.h"
34 #include "libguile/strings.h"
35 #include "libguile/deprecation.h"
36 #include "libguile/validate.h"
37
38 \f
39
40 /* {Strings}
41 */
42
43
44 /* Stringbufs
45 *
46 * XXX - keeping an accurate refcount during GC seems to be quite
47 * tricky, so we just keep score of whether a stringbuf might be
48 * shared, not whether it definitely is.
49 *
50 * The scheme I (mvo) tried to keep an accurate reference count would
51 * recount all strings that point to a stringbuf during the mark-phase
52 * of the GC. This was done since one cannot access the stringbuf of
53 * a string when that string is freed (in order to decrease the
54 * reference count). The memory of the stringbuf might have been
55 * reused already for something completely different.
56 *
57 * This recounted worked for a small number of threads beating on
58 * cow-strings, but it failed randomly with more than 10 threads, say.
59 * I couldn't figure out what went wrong, so I used the conservative
60 * approach implemented below.
61 *
62 * A stringbuf needs to know its length, but only so that it can be
63 * reported when the stringbuf is freed.
64 *
65 * There are 3 storage strategies for stringbufs: inline, outline, and
66 * wide.
67 *
68 * Inline strings are small 8-bit strings stored within the double
69 * cell itself. Outline strings are larger 8-bit strings with GC
70 * allocated storage. Wide strings are 32-bit strings with allocated
71 * storage.
72 *
73 * There was little value in making wide string inlineable, since
74 * there is only room for three inlined 32-bit characters. Thus wide
75 * stringbufs are never inlined.
76 */
77
78 #define STRINGBUF_F_SHARED 0x100
79 #define STRINGBUF_F_INLINE 0x200
80 #define STRINGBUF_F_WIDE 0x400 /* If true, strings have UCS-4
81 encoding. Otherwise, strings
82 are Latin-1. */
83
84 #define STRINGBUF_TAG scm_tc7_stringbuf
85 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
86 #define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
87 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
88
89 #define STRINGBUF_OUTLINE_CHARS(buf) ((char *)SCM_CELL_WORD_1(buf))
90 #define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
91 #define STRINGBUF_INLINE_CHARS(buf) ((char *)SCM_CELL_OBJECT_LOC(buf,1))
92 #define STRINGBUF_INLINE_LENGTH(buf) (((size_t)SCM_CELL_WORD_0(buf))>>16)
93
94 #define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
95 ? STRINGBUF_INLINE_CHARS (buf) \
96 : STRINGBUF_OUTLINE_CHARS (buf))
97
98 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *)SCM_CELL_WORD_1(buf))
99 #define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
100 ? STRINGBUF_INLINE_LENGTH (buf) \
101 : STRINGBUF_OUTLINE_LENGTH (buf))
102
103 #define STRINGBUF_MAX_INLINE_LEN (3*sizeof(scm_t_bits))
104
105 #define SET_STRINGBUF_SHARED(buf) \
106 (SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED))
107
108 #if SCM_STRING_LENGTH_HISTOGRAM
109 static size_t lenhist[1001];
110 #endif
111
112 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
113 characters. */
114 static SCM
115 make_stringbuf (size_t len)
116 {
117 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
118 scm_i_symbol_chars, all stringbufs are null-terminated. Once
119 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
120 has been changed for scm_i_symbol_chars, this null-termination
121 can be dropped.
122 */
123
124 #if SCM_STRING_LENGTH_HISTOGRAM
125 if (len < 1000)
126 lenhist[len]++;
127 else
128 lenhist[1000]++;
129 #endif
130
131 if (len <= STRINGBUF_MAX_INLINE_LEN-1)
132 {
133 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_INLINE | (len << 16),
134 0, 0, 0);
135 }
136 else
137 {
138 char *mem = scm_gc_malloc_pointerless (len + 1, "string");
139 mem[len] = '\0';
140 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) mem,
141 (scm_t_bits) len, (scm_t_bits) 0);
142 }
143 }
144
145 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
146 characters. */
147 static SCM
148 make_wide_stringbuf (size_t len)
149 {
150 scm_t_wchar *mem;
151 #if SCM_STRING_LENGTH_HISTOGRAM
152 if (len < 1000)
153 lenhist[len]++;
154 else
155 lenhist[1000]++;
156 #endif
157
158 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
159 mem[len] = 0;
160 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
161 (scm_t_bits) len, (scm_t_bits) 0);
162 }
163
164 /* Return a new stringbuf whose underlying storage consists of the LEN+1
165 octets pointed to by STR (the last octet is zero). */
166 SCM
167 scm_i_take_stringbufn (char *str, size_t len)
168 {
169 scm_gc_register_collectable_memory (str, len + 1, "stringbuf");
170
171 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) str,
172 (scm_t_bits) len, (scm_t_bits) 0);
173 }
174
175 /* Convert a stringbuf containing 8-bit Latin-1-encoded characters to
176 one containing 32-bit UCS-4-encoded characters. */
177 static void
178 widen_stringbuf (SCM buf)
179 {
180 size_t i, len;
181 scm_t_wchar *mem;
182
183 if (STRINGBUF_WIDE (buf))
184 return;
185
186 if (STRINGBUF_INLINE (buf))
187 {
188 len = STRINGBUF_INLINE_LENGTH (buf);
189
190 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
191 for (i = 0; i < len; i++)
192 mem[i] =
193 (scm_t_wchar) (unsigned char) STRINGBUF_INLINE_CHARS (buf)[i];
194 mem[len] = 0;
195
196 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
197 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
198 SCM_SET_CELL_WORD_1 (buf, mem);
199 SCM_SET_CELL_WORD_2 (buf, len);
200 }
201 else
202 {
203 len = STRINGBUF_OUTLINE_LENGTH (buf);
204
205 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
206 for (i = 0; i < len; i++)
207 mem[i] =
208 (scm_t_wchar) (unsigned char) STRINGBUF_OUTLINE_CHARS (buf)[i];
209 mem[len] = 0;
210
211 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
212
213 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
214 SCM_SET_CELL_WORD_1 (buf, mem);
215 SCM_SET_CELL_WORD_2 (buf, len);
216 }
217 }
218
219 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
220
221 /* Copy-on-write strings.
222 */
223
224 #define STRING_TAG scm_tc7_string
225
226 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
227 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
228 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
229
230 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
231 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
232
233 #define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
234
235 /* Read-only strings.
236 */
237
238 #define RO_STRING_TAG (scm_tc7_string + 0x200)
239 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
240
241 /* Mutation-sharing substrings
242 */
243
244 #define SH_STRING_TAG (scm_tc7_string + 0x100)
245
246 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
247 /* START and LENGTH as for STRINGs. */
248
249 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
250
251 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
252 characters. CHARSP, if not NULL, will be set to location of the
253 char array. */
254 SCM
255 scm_i_make_string (size_t len, char **charsp)
256 {
257 SCM buf = make_stringbuf (len);
258 SCM res;
259 if (charsp)
260 *charsp = STRINGBUF_CHARS (buf);
261 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
262 (scm_t_bits)0, (scm_t_bits) len);
263 return res;
264 }
265
266 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
267 characters. CHARSP, if not NULL, will be set to location of the
268 character array. */
269 SCM
270 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
271 {
272 SCM buf = make_wide_stringbuf (len);
273 SCM res;
274 if (charsp)
275 *charsp = STRINGBUF_WIDE_CHARS (buf);
276 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
277 (scm_t_bits) 0, (scm_t_bits) len);
278 return res;
279 }
280
281 static void
282 validate_substring_args (SCM str, size_t start, size_t end)
283 {
284 if (!IS_STRING (str))
285 scm_wrong_type_arg_msg (NULL, 0, str, "string");
286 if (start > STRING_LENGTH (str))
287 scm_out_of_range (NULL, scm_from_size_t (start));
288 if (end > STRING_LENGTH (str) || end < start)
289 scm_out_of_range (NULL, scm_from_size_t (end));
290 }
291
292 static inline void
293 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
294 {
295 *start = STRING_START (*str);
296 if (IS_SH_STRING (*str))
297 {
298 *str = SH_STRING_STRING (*str);
299 *start += STRING_START (*str);
300 }
301 *buf = STRING_STRINGBUF (*str);
302 }
303
304 SCM
305 scm_i_substring (SCM str, size_t start, size_t end)
306 {
307 SCM buf;
308 size_t str_start;
309 get_str_buf_start (&str, &buf, &str_start);
310 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
311 SET_STRINGBUF_SHARED (buf);
312 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
313 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
314 (scm_t_bits)str_start + start,
315 (scm_t_bits) end - start);
316 }
317
318 SCM
319 scm_i_substring_read_only (SCM str, size_t start, size_t end)
320 {
321 SCM buf;
322 size_t str_start;
323 get_str_buf_start (&str, &buf, &str_start);
324 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
325 SET_STRINGBUF_SHARED (buf);
326 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
327 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
328 (scm_t_bits)str_start + start,
329 (scm_t_bits) end - start);
330 }
331
332 SCM
333 scm_i_substring_copy (SCM str, size_t start, size_t end)
334 {
335 size_t len = end - start;
336 SCM buf, my_buf;
337 size_t str_start;
338 get_str_buf_start (&str, &buf, &str_start);
339 if (scm_i_is_narrow_string (str))
340 {
341 my_buf = make_stringbuf (len);
342 memcpy (STRINGBUF_CHARS (my_buf),
343 STRINGBUF_CHARS (buf) + str_start + start, len);
344 }
345 else
346 {
347 my_buf = make_wide_stringbuf (len);
348 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
349 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
350 + start), len);
351 /* Even though this string is wide, the substring may be narrow.
352 Consider adding code to narrow the string. */
353 }
354 scm_remember_upto_here_1 (buf);
355 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
356 (scm_t_bits) 0, (scm_t_bits) len);
357 }
358
359 SCM
360 scm_i_substring_shared (SCM str, size_t start, size_t end)
361 {
362 if (start == 0 && end == STRING_LENGTH (str))
363 return str;
364 else
365 {
366 size_t len = end - start;
367 if (IS_SH_STRING (str))
368 {
369 start += STRING_START (str);
370 str = SH_STRING_STRING (str);
371 }
372 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
373 (scm_t_bits)start, (scm_t_bits) len);
374 }
375 }
376
377 SCM
378 scm_c_substring (SCM str, size_t start, size_t end)
379 {
380 validate_substring_args (str, start, end);
381 return scm_i_substring (str, start, end);
382 }
383
384 SCM
385 scm_c_substring_read_only (SCM str, size_t start, size_t end)
386 {
387 validate_substring_args (str, start, end);
388 return scm_i_substring_read_only (str, start, end);
389 }
390
391 SCM
392 scm_c_substring_copy (SCM str, size_t start, size_t end)
393 {
394 validate_substring_args (str, start, end);
395 return scm_i_substring_copy (str, start, end);
396 }
397
398 SCM
399 scm_c_substring_shared (SCM str, size_t start, size_t end)
400 {
401 validate_substring_args (str, start, end);
402 return scm_i_substring_shared (str, start, end);
403 }
404
405 \f
406 /* Internal accessors
407 */
408
409 /* Returns the number of characters in STR. This may be different
410 than the memory size of the string storage. */
411 size_t
412 scm_i_string_length (SCM str)
413 {
414 return STRING_LENGTH (str);
415 }
416
417 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
418 encoding. False if it is 'wide', having a 32-bit UCS-4
419 encoding. */
420 int
421 scm_i_is_narrow_string (SCM str)
422 {
423 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
424 }
425
426 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
427 STR. */
428 const char *
429 scm_i_string_chars (SCM str)
430 {
431 SCM buf;
432 size_t start;
433 get_str_buf_start (&str, &buf, &start);
434 if (scm_i_is_narrow_string (str))
435 return STRINGBUF_CHARS (buf) + start;
436 else
437 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
438 scm_list_1 (str));
439 return NULL;
440 }
441
442 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
443 STR. */
444 const scm_t_wchar *
445 scm_i_string_wide_chars (SCM str)
446 {
447 SCM buf;
448 size_t start;
449
450 get_str_buf_start (&str, &buf, &start);
451 if (!scm_i_is_narrow_string (str))
452 return STRINGBUF_WIDE_CHARS (buf) + start;
453 else
454 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
455 scm_list_1 (str));
456 }
457
458 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
459 a new string buffer, so that it can be modified without modifying
460 other strings. Also, lock the string mutex. Later, one must call
461 scm_i_string_stop_writing to unlock the mutex. */
462 SCM
463 scm_i_string_start_writing (SCM orig_str)
464 {
465 SCM buf, str = orig_str;
466 size_t start;
467
468 get_str_buf_start (&str, &buf, &start);
469 if (IS_RO_STRING (str))
470 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
471
472 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
473 if (STRINGBUF_SHARED (buf))
474 {
475 /* Clone the stringbuf. */
476 size_t len = STRING_LENGTH (str);
477 SCM new_buf;
478
479 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
480
481 if (scm_i_is_narrow_string (str))
482 {
483 new_buf = make_stringbuf (len);
484 memcpy (STRINGBUF_CHARS (new_buf),
485 STRINGBUF_CHARS (buf) + STRING_START (str), len);
486
487 }
488 else
489 {
490 new_buf = make_wide_stringbuf (len);
491 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
492 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
493 + STRING_START (str)), len);
494 }
495
496 SET_STRING_STRINGBUF (str, new_buf);
497 start -= STRING_START (str);
498
499 /* FIXME: The following operations are not atomic, so other threads
500 looking at STR may see an inconsistent state. Nevertheless it can't
501 hurt much since (i) accessing STR while it is being mutated can't
502 yield a crash, and (ii) concurrent accesses to STR should be
503 protected by a mutex at the application level. The latter may not
504 apply when STR != ORIG_STR, though. */
505 SET_STRING_START (str, 0);
506 SET_STRING_STRINGBUF (str, new_buf);
507
508 buf = new_buf;
509
510 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
511 }
512 return orig_str;
513 }
514
515 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
516 char *
517 scm_i_string_writable_chars (SCM str)
518 {
519 SCM buf;
520 size_t start;
521
522 get_str_buf_start (&str, &buf, &start);
523 if (scm_i_is_narrow_string (str))
524 return STRINGBUF_CHARS (buf) + start;
525 else
526 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
527 scm_list_1 (str));
528 return NULL;
529 }
530
531 /* Return a pointer to the UCS-4 codepoints of a string. */
532 static scm_t_wchar *
533 scm_i_string_writable_wide_chars (SCM str)
534 {
535 SCM buf;
536 size_t start;
537
538 get_str_buf_start (&str, &buf, &start);
539 if (!scm_i_is_narrow_string (str))
540 return STRINGBUF_WIDE_CHARS (buf) + start;
541 else
542 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
543 scm_list_1 (str));
544 }
545
546 /* Unlock the string mutex that was locked when
547 scm_i_string_start_writing was called. */
548 void
549 scm_i_string_stop_writing (void)
550 {
551 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
552 }
553
554 /* Return the Xth character of STR as a UCS-4 codepoint. */
555 scm_t_wchar
556 scm_i_string_ref (SCM str, size_t x)
557 {
558 if (scm_i_is_narrow_string (str))
559 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
560 else
561 return scm_i_string_wide_chars (str)[x];
562 }
563
564 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
565 void
566 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
567 {
568 if (chr > 0xFF && scm_i_is_narrow_string (str))
569 widen_stringbuf (STRING_STRINGBUF (str));
570
571 if (scm_i_is_narrow_string (str))
572 {
573 char *dst = scm_i_string_writable_chars (str);
574 dst[p] = (char) (unsigned char) chr;
575 }
576 else
577 {
578 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
579 dst[p] = chr;
580 }
581 }
582
583 /* Symbols.
584
585 Basic symbol creation and accessing is done here, the rest is in
586 symbols.[hc]. This has been done to keep stringbufs and the
587 internals of strings and string-like objects confined to this file.
588 */
589
590 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
591
592 SCM
593 scm_i_make_symbol (SCM name, scm_t_bits flags,
594 unsigned long hash, SCM props)
595 {
596 SCM buf;
597 size_t start = STRING_START (name);
598 size_t length = STRING_LENGTH (name);
599
600 if (IS_SH_STRING (name))
601 {
602 name = SH_STRING_STRING (name);
603 start += STRING_START (name);
604 }
605 buf = SYMBOL_STRINGBUF (name);
606
607 if (start == 0 && length == STRINGBUF_LENGTH (buf))
608 {
609 /* reuse buf. */
610 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
611 SET_STRINGBUF_SHARED (buf);
612 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
613 }
614 else
615 {
616 /* make new buf. */
617 if (scm_i_is_narrow_string (name))
618 {
619 SCM new_buf = make_stringbuf (length);
620 memcpy (STRINGBUF_CHARS (new_buf),
621 STRINGBUF_CHARS (buf) + start, length);
622 buf = new_buf;
623 }
624 else
625 {
626 SCM new_buf = make_wide_stringbuf (length);
627 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
628 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
629 length);
630 buf = new_buf;
631 }
632 }
633 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
634 (scm_t_bits) hash, SCM_UNPACK (props));
635 }
636
637 SCM
638 scm_i_c_make_symbol (const char *name, size_t len,
639 scm_t_bits flags, unsigned long hash, SCM props)
640 {
641 SCM buf = make_stringbuf (len);
642 memcpy (STRINGBUF_CHARS (buf), name, len);
643
644 return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
645 (scm_t_bits) hash, SCM_UNPACK (props));
646 }
647
648 /* Return a new symbol that uses the LEN bytes pointed to by NAME as its
649 underlying storage. */
650 SCM
651 scm_i_c_take_symbol (char *name, size_t len,
652 scm_t_bits flags, unsigned long hash, SCM props)
653 {
654 SCM buf = scm_i_take_stringbufn (name, len);
655
656 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
657 (scm_t_bits) hash, SCM_UNPACK (props));
658 }
659
660 /* Returns the number of characters in SYM. This may be different
661 from the memory size of SYM. */
662 size_t
663 scm_i_symbol_length (SCM sym)
664 {
665 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
666 }
667
668 size_t
669 scm_c_symbol_length (SCM sym)
670 #define FUNC_NAME "scm_c_symbol_length"
671 {
672 SCM_VALIDATE_SYMBOL (1, sym);
673
674 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
675 }
676 #undef FUNC_NAME
677
678 /* True if the name of SYM is stored as a Latin-1 encoded string.
679 False if it is stored as a 32-bit UCS-4-encoded string. */
680 int
681 scm_i_is_narrow_symbol (SCM sym)
682 {
683 SCM buf;
684
685 buf = SYMBOL_STRINGBUF (sym);
686 return !STRINGBUF_WIDE (buf);
687 }
688
689 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
690 contains the name of SYM. */
691 const char *
692 scm_i_symbol_chars (SCM sym)
693 {
694 SCM buf;
695
696 buf = SYMBOL_STRINGBUF (sym);
697 if (!STRINGBUF_WIDE (buf))
698 return STRINGBUF_CHARS (buf);
699 else
700 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
701 scm_list_1 (sym));
702 }
703
704 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
705 symbol's name. */
706 const scm_t_wchar *
707 scm_i_symbol_wide_chars (SCM sym)
708 {
709 SCM buf;
710
711 buf = SYMBOL_STRINGBUF (sym);
712 if (STRINGBUF_WIDE (buf))
713 return STRINGBUF_WIDE_CHARS (buf);
714 else
715 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
716 scm_list_1 (sym));
717 }
718
719 SCM
720 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
721 {
722 SCM buf = SYMBOL_STRINGBUF (sym);
723 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
724 SET_STRINGBUF_SHARED (buf);
725 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
726 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
727 (scm_t_bits)start, (scm_t_bits) end - start);
728 }
729
730 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
731 scm_t_wchar
732 scm_i_symbol_ref (SCM sym, size_t x)
733 {
734 if (scm_i_is_narrow_symbol (sym))
735 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
736 else
737 return scm_i_symbol_wide_chars (sym)[x];
738 }
739
740 /* Debugging
741 */
742
743 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
744 "Returns an association list containing debugging information\n"
745 "for @var{str}. The association list has the following entries."
746 "@table @code\n"
747 "@item string\n"
748 "The string itself.\n"
749 "@item start\n"
750 "The start index of the string into its stringbuf\n"
751 "@item length\n"
752 "The length of the string\n"
753 "@item shared\n"
754 "If this string is a substring, it returns its parent string.\n"
755 "Otherwise, it returns @code{#f}\n"
756 "@item read-only\n"
757 "@code{#t} if the string is read-only\n"
758 "@item stringbuf-chars\n"
759 "A new string containing this string's stringbuf's characters\n"
760 "@item stringbuf-length\n"
761 "The number of characters in this stringbuf\n"
762 "@item stringbuf-shared\n"
763 "@code{#t} if this stringbuf is shared\n"
764 "@item stringbuf-inline\n"
765 "@code{#t} if this stringbuf's characters are stored in the\n"
766 "cell itself, or @code{#f} if they were allocated in memory\n"
767 "@item stringbuf-wide\n"
768 "@code{#t} if this stringbuf's characters are stored in a\n"
769 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
770 "buffer\n"
771 "@end table")
772 #define FUNC_NAME s_scm_sys_string_dump
773 {
774 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9, e10;
775 SCM buf;
776 SCM_VALIDATE_STRING (1, str);
777
778 /* String info */
779 e1 = scm_cons (scm_from_locale_symbol ("string"),
780 str);
781 e2 = scm_cons (scm_from_locale_symbol ("start"),
782 scm_from_size_t (STRING_START (str)));
783 e3 = scm_cons (scm_from_locale_symbol ("length"),
784 scm_from_size_t (STRING_LENGTH (str)));
785
786 if (IS_SH_STRING (str))
787 {
788 e4 = scm_cons (scm_from_locale_symbol ("shared"),
789 SH_STRING_STRING (str));
790 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
791 }
792 else
793 {
794 e4 = scm_cons (scm_from_locale_symbol ("shared"),
795 SCM_BOOL_F);
796 buf = STRING_STRINGBUF (str);
797 }
798
799 if (IS_RO_STRING (str))
800 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
801 SCM_BOOL_T);
802 else
803 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
804 SCM_BOOL_F);
805
806 /* Stringbuf info */
807 if (!STRINGBUF_WIDE (buf))
808 {
809 size_t len = STRINGBUF_LENGTH (buf);
810 char *cbuf;
811 SCM sbc = scm_i_make_string (len, &cbuf);
812 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
813 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
814 sbc);
815 }
816 else
817 {
818 size_t len = STRINGBUF_LENGTH (buf);
819 scm_t_wchar *cbuf;
820 SCM sbc = scm_i_make_wide_string (len, &cbuf);
821 u32_cpy ((scm_t_uint32 *) cbuf,
822 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
823 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
824 sbc);
825 }
826 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
827 scm_from_size_t (STRINGBUF_LENGTH (buf)));
828 if (STRINGBUF_SHARED (buf))
829 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
830 SCM_BOOL_T);
831 else
832 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
833 SCM_BOOL_F);
834 if (STRINGBUF_INLINE (buf))
835 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
836 SCM_BOOL_T);
837 else
838 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
839 SCM_BOOL_F);
840 if (STRINGBUF_WIDE (buf))
841 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
842 SCM_BOOL_T);
843 else
844 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
845 SCM_BOOL_F);
846
847 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, SCM_UNDEFINED);
848 }
849 #undef FUNC_NAME
850
851 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
852 "Returns an association list containing debugging information\n"
853 "for @var{sym}. The association list has the following entries."
854 "@table @code\n"
855 "@item symbol\n"
856 "The symbol itself\n"
857 "@item hash\n"
858 "Its hash value\n"
859 "@item interned\n"
860 "@code{#t} if it is an interned symbol\n"
861 "@item stringbuf-chars\n"
862 "A new string containing this symbols's stringbuf's characters\n"
863 "@item stringbuf-length\n"
864 "The number of characters in this stringbuf\n"
865 "@item stringbuf-shared\n"
866 "@code{#t} if this stringbuf is shared\n"
867 "@item stringbuf-inline\n"
868 "@code{#t} if this stringbuf's characters are stored in the\n"
869 "cell itself, or @code{#f} if they were allocated in memory\n"
870 "@item stringbuf-wide\n"
871 "@code{#t} if this stringbuf's characters are stored in a\n"
872 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
873 "buffer\n"
874 "@end table")
875 #define FUNC_NAME s_scm_sys_symbol_dump
876 {
877 SCM e1, e2, e3, e4, e5, e6, e7, e8;
878 SCM buf;
879 SCM_VALIDATE_SYMBOL (1, sym);
880 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
881 sym);
882 e2 = scm_cons (scm_from_locale_symbol ("hash"),
883 scm_from_ulong (scm_i_symbol_hash (sym)));
884 e3 = scm_cons (scm_from_locale_symbol ("interned"),
885 scm_symbol_interned_p (sym));
886 buf = SYMBOL_STRINGBUF (sym);
887
888 /* Stringbuf info */
889 if (!STRINGBUF_WIDE (buf))
890 {
891 size_t len = STRINGBUF_LENGTH (buf);
892 char *cbuf;
893 SCM sbc = scm_i_make_string (len, &cbuf);
894 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
895 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
896 sbc);
897 }
898 else
899 {
900 size_t len = STRINGBUF_LENGTH (buf);
901 scm_t_wchar *cbuf;
902 SCM sbc = scm_i_make_wide_string (len, &cbuf);
903 u32_cpy ((scm_t_uint32 *) cbuf,
904 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
905 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
906 sbc);
907 }
908 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
909 scm_from_size_t (STRINGBUF_LENGTH (buf)));
910 if (STRINGBUF_SHARED (buf))
911 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
912 SCM_BOOL_T);
913 else
914 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
915 SCM_BOOL_F);
916 if (STRINGBUF_INLINE (buf))
917 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
918 SCM_BOOL_T);
919 else
920 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
921 SCM_BOOL_F);
922 if (STRINGBUF_WIDE (buf))
923 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
924 SCM_BOOL_T);
925 else
926 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
927 SCM_BOOL_F);
928 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, SCM_UNDEFINED);
929
930 }
931 #undef FUNC_NAME
932
933 #if SCM_STRING_LENGTH_HISTOGRAM
934
935 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
936 #define FUNC_NAME s_scm_sys_stringbuf_hist
937 {
938 int i;
939 for (i = 0; i < 1000; i++)
940 if (lenhist[i])
941 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
942 fprintf (stderr, ">999: %u\n", lenhist[1000]);
943 return SCM_UNSPECIFIED;
944 }
945 #undef FUNC_NAME
946
947 #endif
948
949 \f
950
951 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
952 (SCM obj),
953 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
954 #define FUNC_NAME s_scm_string_p
955 {
956 return scm_from_bool (IS_STRING (obj));
957 }
958 #undef FUNC_NAME
959
960
961 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
962
963 SCM_DEFINE (scm_string, "string", 0, 0, 1,
964 (SCM chrs),
965 "@deffnx {Scheme Procedure} list->string chrs\n"
966 "Return a newly allocated string composed of the arguments,\n"
967 "@var{chrs}.")
968 #define FUNC_NAME s_scm_string
969 {
970 SCM result;
971 SCM rest;
972 size_t len;
973 size_t p = 0;
974 long i;
975
976 /* Verify that this is a list of chars. */
977 i = scm_ilength (chrs);
978 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
979
980 len = (size_t) i;
981 rest = chrs;
982
983 while (len > 0 && scm_is_pair (rest))
984 {
985 SCM elt = SCM_CAR (rest);
986 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
987 rest = SCM_CDR (rest);
988 len--;
989 scm_remember_upto_here_1 (elt);
990 }
991
992 /* Construct a string containing this list of chars. */
993 len = (size_t) i;
994 rest = chrs;
995
996 result = scm_i_make_string (len, NULL);
997 result = scm_i_string_start_writing (result);
998 while (len > 0 && scm_is_pair (rest))
999 {
1000 SCM elt = SCM_CAR (rest);
1001 scm_i_string_set_x (result, p, SCM_CHAR (elt));
1002 p++;
1003 rest = SCM_CDR (rest);
1004 len--;
1005 scm_remember_upto_here_1 (elt);
1006 }
1007 scm_i_string_stop_writing ();
1008
1009 if (len > 0)
1010 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1011 if (!scm_is_null (rest))
1012 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1013
1014 return result;
1015 }
1016 #undef FUNC_NAME
1017
1018 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1019 (SCM k, SCM chr),
1020 "Return a newly allocated string of\n"
1021 "length @var{k}. If @var{chr} is given, then all elements of\n"
1022 "the string are initialized to @var{chr}, otherwise the contents\n"
1023 "of the @var{string} are unspecified.")
1024 #define FUNC_NAME s_scm_make_string
1025 {
1026 return scm_c_make_string (scm_to_size_t (k), chr);
1027 }
1028 #undef FUNC_NAME
1029
1030 SCM
1031 scm_c_make_string (size_t len, SCM chr)
1032 #define FUNC_NAME NULL
1033 {
1034 size_t p;
1035 SCM res = scm_i_make_string (len, NULL);
1036
1037 if (!SCM_UNBNDP (chr))
1038 {
1039 SCM_VALIDATE_CHAR (0, chr);
1040 res = scm_i_string_start_writing (res);
1041 for (p = 0; p < len; p++)
1042 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1043 scm_i_string_stop_writing ();
1044 }
1045
1046 return res;
1047 }
1048 #undef FUNC_NAME
1049
1050 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1051 (SCM string),
1052 "Return the number of characters in @var{string}.")
1053 #define FUNC_NAME s_scm_string_length
1054 {
1055 SCM_VALIDATE_STRING (1, string);
1056 return scm_from_size_t (STRING_LENGTH (string));
1057 }
1058 #undef FUNC_NAME
1059
1060 SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0,
1061 (SCM string),
1062 "Return the bytes used to represent a character in @var{string}."
1063 "This will return 1 or 4.")
1064 #define FUNC_NAME s_scm_string_width
1065 {
1066 SCM_VALIDATE_STRING (1, string);
1067 if (!scm_i_is_narrow_string (string))
1068 return scm_from_int (4);
1069
1070 return scm_from_int (1);
1071 }
1072 #undef FUNC_NAME
1073
1074 size_t
1075 scm_c_string_length (SCM string)
1076 {
1077 if (!IS_STRING (string))
1078 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1079 return STRING_LENGTH (string);
1080 }
1081
1082 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1083 (SCM str, SCM k),
1084 "Return character @var{k} of @var{str} using zero-origin\n"
1085 "indexing. @var{k} must be a valid index of @var{str}.")
1086 #define FUNC_NAME s_scm_string_ref
1087 {
1088 size_t len;
1089 unsigned long idx;
1090
1091 SCM_VALIDATE_STRING (1, str);
1092
1093 len = scm_i_string_length (str);
1094 if (SCM_LIKELY (len > 0))
1095 idx = scm_to_unsigned_integer (k, 0, len - 1);
1096 else
1097 scm_out_of_range (NULL, k);
1098
1099 if (scm_i_is_narrow_string (str))
1100 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1101 else
1102 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1103 }
1104 #undef FUNC_NAME
1105
1106 SCM
1107 scm_c_string_ref (SCM str, size_t p)
1108 {
1109 if (p >= scm_i_string_length (str))
1110 scm_out_of_range (NULL, scm_from_size_t (p));
1111 if (scm_i_is_narrow_string (str))
1112 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1113 else
1114 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1115
1116 }
1117
1118 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1119 (SCM str, SCM k, SCM chr),
1120 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1121 "an unspecified value. @var{k} must be a valid index of\n"
1122 "@var{str}.")
1123 #define FUNC_NAME s_scm_string_set_x
1124 {
1125 size_t len;
1126 unsigned long idx;
1127
1128 SCM_VALIDATE_STRING (1, str);
1129
1130 len = scm_i_string_length (str);
1131 if (SCM_LIKELY (len > 0))
1132 idx = scm_to_unsigned_integer (k, 0, len - 1);
1133 else
1134 scm_out_of_range (NULL, k);
1135
1136 SCM_VALIDATE_CHAR (3, chr);
1137 str = scm_i_string_start_writing (str);
1138 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1139 scm_i_string_stop_writing ();
1140
1141 return SCM_UNSPECIFIED;
1142 }
1143 #undef FUNC_NAME
1144
1145 void
1146 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1147 {
1148 if (p >= scm_i_string_length (str))
1149 scm_out_of_range (NULL, scm_from_size_t (p));
1150 str = scm_i_string_start_writing (str);
1151 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1152 scm_i_string_stop_writing ();
1153 }
1154
1155 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1156 (SCM str, SCM start, SCM end),
1157 "Return a newly allocated string formed from the characters\n"
1158 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1159 "ending with index @var{end} (exclusive).\n"
1160 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1161 "exact integers satisfying:\n\n"
1162 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1163 #define FUNC_NAME s_scm_substring
1164 {
1165 size_t len, from, to;
1166
1167 SCM_VALIDATE_STRING (1, str);
1168 len = scm_i_string_length (str);
1169 from = scm_to_unsigned_integer (start, 0, len);
1170 if (SCM_UNBNDP (end))
1171 to = len;
1172 else
1173 to = scm_to_unsigned_integer (end, from, len);
1174 return scm_i_substring (str, from, to);
1175 }
1176 #undef FUNC_NAME
1177
1178 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1179 (SCM str, SCM start, SCM end),
1180 "Return a newly allocated string formed from the characters\n"
1181 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1182 "ending with index @var{end} (exclusive).\n"
1183 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1184 "exact integers satisfying:\n"
1185 "\n"
1186 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1187 "\n"
1188 "The returned string is read-only.\n")
1189 #define FUNC_NAME s_scm_substring_read_only
1190 {
1191 size_t len, from, to;
1192
1193 SCM_VALIDATE_STRING (1, str);
1194 len = scm_i_string_length (str);
1195 from = scm_to_unsigned_integer (start, 0, len);
1196 if (SCM_UNBNDP (end))
1197 to = len;
1198 else
1199 to = scm_to_unsigned_integer (end, from, len);
1200 return scm_i_substring_read_only (str, from, to);
1201 }
1202 #undef FUNC_NAME
1203
1204 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1205 (SCM str, SCM start, SCM end),
1206 "Return a newly allocated string formed from the characters\n"
1207 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1208 "ending with index @var{end} (exclusive).\n"
1209 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1210 "exact integers satisfying:\n\n"
1211 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1212 #define FUNC_NAME s_scm_substring_copy
1213 {
1214 /* For the Scheme version, START is mandatory, but for the C
1215 version, it is optional. See scm_string_copy in srfi-13.c for a
1216 rationale.
1217 */
1218
1219 size_t from, to;
1220
1221 SCM_VALIDATE_STRING (1, str);
1222 scm_i_get_substring_spec (scm_i_string_length (str),
1223 start, &from, end, &to);
1224 return scm_i_substring_copy (str, from, to);
1225 }
1226 #undef FUNC_NAME
1227
1228 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1229 (SCM str, SCM start, SCM end),
1230 "Return string that indirectly refers to the characters\n"
1231 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1232 "ending with index @var{end} (exclusive).\n"
1233 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1234 "exact integers satisfying:\n\n"
1235 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1236 #define FUNC_NAME s_scm_substring_shared
1237 {
1238 size_t len, from, to;
1239
1240 SCM_VALIDATE_STRING (1, str);
1241 len = scm_i_string_length (str);
1242 from = scm_to_unsigned_integer (start, 0, len);
1243 if (SCM_UNBNDP (end))
1244 to = len;
1245 else
1246 to = scm_to_unsigned_integer (end, from, len);
1247 return scm_i_substring_shared (str, from, to);
1248 }
1249 #undef FUNC_NAME
1250
1251 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1252 (SCM args),
1253 "Return a newly allocated string whose characters form the\n"
1254 "concatenation of the given strings, @var{args}.")
1255 #define FUNC_NAME s_scm_string_append
1256 {
1257 SCM res;
1258 size_t len = 0;
1259 int wide = 0;
1260 SCM l, s;
1261 size_t i;
1262 union
1263 {
1264 char *narrow;
1265 scm_t_wchar *wide;
1266 } data;
1267
1268 SCM_VALIDATE_REST_ARGUMENT (args);
1269 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1270 {
1271 s = SCM_CAR (l);
1272 SCM_VALIDATE_STRING (SCM_ARGn, s);
1273 len += scm_i_string_length (s);
1274 if (!scm_i_is_narrow_string (s))
1275 wide = 1;
1276 }
1277 data.narrow = NULL;
1278 if (!wide)
1279 res = scm_i_make_string (len, &data.narrow);
1280 else
1281 res = scm_i_make_wide_string (len, &data.wide);
1282
1283 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1284 {
1285 size_t len;
1286 s = SCM_CAR (l);
1287 SCM_VALIDATE_STRING (SCM_ARGn, s);
1288 len = scm_i_string_length (s);
1289 if (!wide)
1290 {
1291 memcpy (data.narrow, scm_i_string_chars (s), len);
1292 data.narrow += len;
1293 }
1294 else
1295 {
1296 if (scm_i_is_narrow_string (s))
1297 {
1298 for (i = 0; i < scm_i_string_length (s); i++)
1299 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1300 }
1301 else
1302 u32_cpy ((scm_t_uint32 *) data.wide,
1303 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1304 data.wide += len;
1305 }
1306 scm_remember_upto_here_1 (s);
1307 }
1308 return res;
1309 }
1310 #undef FUNC_NAME
1311
1312 int
1313 scm_is_string (SCM obj)
1314 {
1315 return IS_STRING (obj);
1316 }
1317
1318 SCM
1319 scm_from_locale_stringn (const char *str, size_t len)
1320 {
1321 SCM res;
1322 char *dst;
1323
1324 if (len == (size_t) -1)
1325 len = strlen (str);
1326 if (len == 0)
1327 return scm_nullstr;
1328
1329 res = scm_i_make_string (len, &dst);
1330 memcpy (dst, str, len);
1331 return res;
1332 }
1333
1334 SCM
1335 scm_from_locale_string (const char *str)
1336 {
1337 if (str == NULL)
1338 return scm_nullstr;
1339
1340 return scm_from_locale_stringn (str, -1);
1341 }
1342
1343 /* Create a new scheme string from the C string STR. The memory of
1344 STR may be used directly as storage for the new string. */
1345 SCM
1346 scm_take_locale_stringn (char *str, size_t len)
1347 {
1348 SCM buf, res;
1349
1350 if (len == (size_t) -1)
1351 len = strlen (str);
1352 else
1353 {
1354 /* Ensure STR is null terminated. A realloc for 1 extra byte should
1355 often be satisfied from the alignment padding after the block, with
1356 no actual data movement. */
1357 str = scm_realloc (str, len + 1);
1358 str[len] = '\0';
1359 }
1360
1361 buf = scm_i_take_stringbufn (str, len);
1362 res = scm_double_cell (STRING_TAG,
1363 SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
1364 return res;
1365 }
1366
1367 SCM
1368 scm_take_locale_string (char *str)
1369 {
1370 return scm_take_locale_stringn (str, -1);
1371 }
1372
1373 /* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1374 and \UXXXXXX. */
1375 static void
1376 unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1377 {
1378 char *before, *after;
1379 size_t i, j;
1380
1381 before = *bufp;
1382 after = *bufp;
1383 i = 0;
1384 j = 0;
1385 while (i < *lenp)
1386 {
1387 if ((i <= *lenp - 6)
1388 && before[i] == '\\'
1389 && before[i + 1] == 'u'
1390 && before[i + 2] == '0' && before[i + 3] == '0')
1391 {
1392 /* Convert \u00NN to \xNN */
1393 after[j] = '\\';
1394 after[j + 1] = 'x';
1395 after[j + 2] = tolower ((int) before[i + 4]);
1396 after[j + 3] = tolower ((int) before[i + 5]);
1397 i += 6;
1398 j += 4;
1399 }
1400 else if ((i <= *lenp - 10)
1401 && before[i] == '\\'
1402 && before[i + 1] == 'U'
1403 && before[i + 2] == '0' && before[i + 3] == '0')
1404 {
1405 /* Convert \U00NNNNNN to \UNNNNNN */
1406 after[j] = '\\';
1407 after[j + 1] = 'U';
1408 after[j + 2] = tolower ((int) before[i + 4]);
1409 after[j + 3] = tolower ((int) before[i + 5]);
1410 after[j + 4] = tolower ((int) before[i + 6]);
1411 after[j + 5] = tolower ((int) before[i + 7]);
1412 after[j + 6] = tolower ((int) before[i + 8]);
1413 after[j + 7] = tolower ((int) before[i + 9]);
1414 i += 10;
1415 j += 8;
1416 }
1417 else
1418 {
1419 after[j] = before[i];
1420 i++;
1421 j++;
1422 }
1423 }
1424 *lenp = j;
1425 after = scm_realloc (after, j);
1426 }
1427
1428 char *
1429 scm_to_locale_stringn (SCM str, size_t * lenp)
1430 {
1431 const char *enc;
1432
1433 /* In the future, enc will hold the port's encoding. */
1434 enc = NULL;
1435
1436 return scm_to_stringn (str, lenp, enc,
1437 SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
1438 }
1439
1440 /* Low-level scheme to C string conversion function. */
1441 char *
1442 scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
1443 scm_t_string_failed_conversion_handler handler)
1444 {
1445 static const char iso[11] = "ISO-8859-1";
1446 char *buf;
1447 size_t ilen, len, i;
1448
1449 if (!scm_is_string (str))
1450 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1451 ilen = scm_i_string_length (str);
1452
1453 if (ilen == 0)
1454 {
1455 buf = scm_malloc (1);
1456 buf[0] = '\0';
1457 if (lenp)
1458 *lenp = 0;
1459 return buf;
1460 }
1461
1462 if (lenp == NULL)
1463 for (i = 0; i < ilen; i++)
1464 if (scm_i_string_ref (str, i) == '\0')
1465 scm_misc_error (NULL,
1466 "string contains #\\nul character: ~S",
1467 scm_list_1 (str));
1468
1469 if (scm_i_is_narrow_string (str))
1470 {
1471 if (lenp)
1472 {
1473 buf = scm_malloc (ilen);
1474 memcpy (buf, scm_i_string_chars (str), ilen);
1475 *lenp = ilen;
1476 return buf;
1477 }
1478 else
1479 {
1480 buf = scm_malloc (ilen + 1);
1481 memcpy (buf, scm_i_string_chars (str), ilen);
1482 buf[ilen] = '\0';
1483 return buf;
1484 }
1485 }
1486
1487
1488 buf = NULL;
1489 len = 0;
1490 buf = u32_conv_to_encoding (iso,
1491 (enum iconv_ilseq_handler) handler,
1492 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1493 ilen, NULL, NULL, &len);
1494 if (buf == NULL)
1495 scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1496 scm_list_2 (scm_from_locale_string (iso), str));
1497
1498 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1499 unistring_escapes_to_guile_escapes (&buf, &len);
1500
1501 if (lenp)
1502 *lenp = len;
1503 else
1504 {
1505 buf = scm_realloc (buf, len + 1);
1506 buf[len] = '\0';
1507 }
1508
1509 scm_remember_upto_here_1 (str);
1510 return buf;
1511 }
1512
1513 char *
1514 scm_to_locale_string (SCM str)
1515 {
1516 return scm_to_locale_stringn (str, NULL);
1517 }
1518
1519 size_t
1520 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1521 {
1522 size_t len;
1523 char *result = NULL;
1524 if (!scm_is_string (str))
1525 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1526 result = scm_to_locale_stringn (str, &len);
1527
1528 memcpy (buf, result, (len > max_len) ? max_len : len);
1529 free (result);
1530
1531 scm_remember_upto_here_1 (str);
1532 return len;
1533 }
1534
1535 /* converts C scm_array of strings to SCM scm_list of strings. */
1536 /* If argc < 0, a null terminated scm_array is assumed. */
1537 SCM
1538 scm_makfromstrs (int argc, char **argv)
1539 {
1540 int i = argc;
1541 SCM lst = SCM_EOL;
1542 if (0 > i)
1543 for (i = 0; argv[i]; i++);
1544 while (i--)
1545 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1546 return lst;
1547 }
1548
1549 /* Return a newly allocated array of char pointers to each of the strings
1550 in args, with a terminating NULL pointer. */
1551
1552 char **
1553 scm_i_allocate_string_pointers (SCM list)
1554 #define FUNC_NAME "scm_i_allocate_string_pointers"
1555 {
1556 char **result;
1557 int len = scm_ilength (list);
1558 int i;
1559
1560 if (len < 0)
1561 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1562
1563 result = scm_gc_malloc ((len + 1) * sizeof (char *),
1564 "string pointers");
1565 result[len] = NULL;
1566
1567 /* The list might be have been modified in another thread, so
1568 we check LIST before each access.
1569 */
1570 for (i = 0; i < len && scm_is_pair (list); i++)
1571 {
1572 SCM str;
1573 size_t len;
1574
1575 str = SCM_CAR (list);
1576 len = scm_c_string_length (str);
1577
1578 result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
1579 memcpy (result[i], scm_i_string_chars (str), len);
1580 result[i][len] = '\0';
1581
1582 list = SCM_CDR (list);
1583 }
1584
1585 return result;
1586 }
1587 #undef FUNC_NAME
1588
1589 void
1590 scm_i_get_substring_spec (size_t len,
1591 SCM start, size_t *cstart,
1592 SCM end, size_t *cend)
1593 {
1594 if (SCM_UNBNDP (start))
1595 *cstart = 0;
1596 else
1597 *cstart = scm_to_unsigned_integer (start, 0, len);
1598
1599 if (SCM_UNBNDP (end))
1600 *cend = len;
1601 else
1602 *cend = scm_to_unsigned_integer (end, *cstart, len);
1603 }
1604
1605 #if SCM_ENABLE_DEPRECATED
1606
1607 /* When these definitions are removed, it becomes reasonable to use
1608 read-only strings for string literals. For that, change the reader
1609 to create string literals with scm_c_substring_read_only instead of
1610 with scm_c_substring_copy.
1611 */
1612
1613 int
1614 scm_i_deprecated_stringp (SCM str)
1615 {
1616 scm_c_issue_deprecation_warning
1617 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
1618
1619 return scm_is_string (str);
1620 }
1621
1622 char *
1623 scm_i_deprecated_string_chars (SCM str)
1624 {
1625 char *chars;
1626
1627 scm_c_issue_deprecation_warning
1628 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
1629
1630 /* We don't accept shared substrings here since they are not
1631 null-terminated.
1632 */
1633 if (IS_SH_STRING (str))
1634 scm_misc_error (NULL,
1635 "SCM_STRING_CHARS does not work with shared substrings.",
1636 SCM_EOL);
1637
1638 /* We explicitly test for read-only strings to produce a better
1639 error message.
1640 */
1641
1642 if (IS_RO_STRING (str))
1643 scm_misc_error (NULL,
1644 "SCM_STRING_CHARS does not work with read-only strings.",
1645 SCM_EOL);
1646
1647 /* The following is still wrong, of course...
1648 */
1649 str = scm_i_string_start_writing (str);
1650 chars = scm_i_string_writable_chars (str);
1651 scm_i_string_stop_writing ();
1652 return chars;
1653 }
1654
1655 size_t
1656 scm_i_deprecated_string_length (SCM str)
1657 {
1658 scm_c_issue_deprecation_warning
1659 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
1660 return scm_c_string_length (str);
1661 }
1662
1663 #endif
1664
1665 void
1666 scm_init_strings ()
1667 {
1668 scm_nullstr = scm_i_make_string (0, NULL);
1669
1670 #include "libguile/strings.x"
1671 }
1672
1673
1674 /*
1675 Local Variables:
1676 c-file-style: "gnu"
1677 End:
1678 */