Merge commit '3d51e57cfb0404db568a6adfde2a346d3fd9907e'
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <alloca.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <uninorm.h>
30 #include <unistr.h>
31 #include <uniconv.h>
32
33 #include "striconveh.h"
34
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
44
45 \f
46
47 /* {Strings}
48 */
49
50
51 /* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
68 *
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
72 */
73
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
76
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
81
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
85
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
91
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
93
94 #define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
103
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist[1001];
106 #endif
107
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
110 static SCM
111 make_stringbuf (size_t len)
112 {
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
120 SCM buf;
121
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127 #endif
128
129 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
138 }
139
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
142 static SCM
143 make_wide_stringbuf (size_t len)
144 {
145 SCM buf;
146 size_t raw_len;
147
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153 #endif
154
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
165 }
166
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169 static SCM
170 wide_stringbuf (SCM buf)
171 {
172 SCM new_buf;
173
174 if (STRINGBUF_WIDE (buf))
175 new_buf = buf;
176 else
177 {
178 size_t i, len;
179 scm_t_wchar *mem;
180
181 len = STRINGBUF_LENGTH (buf);
182
183 new_buf = make_wide_stringbuf (len);
184
185 mem = STRINGBUF_WIDE_CHARS (new_buf);
186 for (i = 0; i < len; i++)
187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
188 mem[len] = 0;
189 }
190
191 return new_buf;
192 }
193
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196 static SCM
197 narrow_stringbuf (SCM buf)
198 {
199 SCM new_buf;
200
201 if (!STRINGBUF_WIDE (buf))
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
208
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
211
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
216
217 new_buf = make_stringbuf (len);
218
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
226 }
227
228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
229
230 \f
231 /* Copy-on-write strings.
232 */
233
234 #define STRING_TAG scm_tc7_string
235
236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
239
240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
244
245 /* Read-only strings.
246 */
247
248 #define RO_STRING_TAG scm_tc7_ro_string
249 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
251 /* Mutation-sharing substrings
252 */
253
254 #define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257 /* START and LENGTH as for STRINGs. */
258
259 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
261 SCM scm_nullstr;
262
263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
267 SCM
268 scm_i_make_string (size_t len, char **charsp, int read_only_p)
269 {
270 static SCM null_stringbuf = SCM_BOOL_F;
271 SCM buf;
272 SCM res;
273
274 if (len == 0)
275 {
276 if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
277 {
278 null_stringbuf = make_stringbuf (0);
279 SET_STRINGBUF_SHARED (null_stringbuf);
280 }
281 buf = null_stringbuf;
282 }
283 else
284 buf = make_stringbuf (len);
285
286 if (charsp)
287 *charsp = (char *) STRINGBUF_CHARS (buf);
288 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
289 SCM_UNPACK (buf),
290 (scm_t_bits) 0, (scm_t_bits) len);
291 return res;
292 }
293
294 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
295 characters. CHARSP, if not NULL, will be set to location of the
296 character array. If READ_ONLY_P, the returned string is read-only;
297 otherwise it is writable. */
298 SCM
299 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
300 {
301 SCM buf = make_wide_stringbuf (len);
302 SCM res;
303 if (charsp)
304 *charsp = STRINGBUF_WIDE_CHARS (buf);
305 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
306 SCM_UNPACK (buf),
307 (scm_t_bits) 0, (scm_t_bits) len);
308 return res;
309 }
310
311 static void
312 validate_substring_args (SCM str, size_t start, size_t end)
313 {
314 if (!IS_STRING (str))
315 scm_wrong_type_arg_msg (NULL, 0, str, "string");
316 if (start > STRING_LENGTH (str))
317 scm_out_of_range (NULL, scm_from_size_t (start));
318 if (end > STRING_LENGTH (str) || end < start)
319 scm_out_of_range (NULL, scm_from_size_t (end));
320 }
321
322 static inline void
323 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
324 {
325 *start = STRING_START (*str);
326 if (IS_SH_STRING (*str))
327 {
328 *str = SH_STRING_STRING (*str);
329 *start += STRING_START (*str);
330 }
331 *buf = STRING_STRINGBUF (*str);
332 }
333
334 SCM
335 scm_i_substring (SCM str, size_t start, size_t end)
336 {
337 if (start == end)
338 return scm_i_make_string (0, NULL, 0);
339 else
340 {
341 SCM buf;
342 size_t str_start;
343 get_str_buf_start (&str, &buf, &str_start);
344 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
345 SET_STRINGBUF_SHARED (buf);
346 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
347 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
348 (scm_t_bits)str_start + start,
349 (scm_t_bits) end - start);
350 }
351 }
352
353 SCM
354 scm_i_substring_read_only (SCM str, size_t start, size_t end)
355 {
356 if (start == end)
357 return scm_i_make_string (0, NULL, 1);
358 else
359 {
360 SCM buf;
361 size_t str_start;
362 get_str_buf_start (&str, &buf, &str_start);
363 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
364 SET_STRINGBUF_SHARED (buf);
365 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
366 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
367 (scm_t_bits)str_start + start,
368 (scm_t_bits) end - start);
369 }
370 }
371
372 SCM
373 scm_i_substring_copy (SCM str, size_t start, size_t end)
374 {
375 size_t len = end - start;
376 SCM buf, my_buf, substr;
377 size_t str_start;
378 int wide = 0;
379 get_str_buf_start (&str, &buf, &str_start);
380 if (scm_i_is_narrow_string (str))
381 {
382 my_buf = make_stringbuf (len);
383 memcpy (STRINGBUF_CHARS (my_buf),
384 STRINGBUF_CHARS (buf) + str_start + start, len);
385 }
386 else
387 {
388 my_buf = make_wide_stringbuf (len);
389 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
390 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
391 + start), len);
392 wide = 1;
393 }
394 scm_remember_upto_here_1 (buf);
395 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
396 (scm_t_bits) 0, (scm_t_bits) len);
397 if (wide)
398 scm_i_try_narrow_string (substr);
399 return substr;
400 }
401
402 SCM
403 scm_i_substring_shared (SCM str, size_t start, size_t end)
404 {
405 if (start == 0 && end == STRING_LENGTH (str))
406 return str;
407 else if (start == end)
408 return scm_i_make_string (0, NULL, 0);
409 else
410 {
411 size_t len = end - start;
412 if (IS_SH_STRING (str))
413 {
414 start += STRING_START (str);
415 str = SH_STRING_STRING (str);
416 }
417 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
418 (scm_t_bits)start, (scm_t_bits) len);
419 }
420 }
421
422 SCM
423 scm_c_substring (SCM str, size_t start, size_t end)
424 {
425 validate_substring_args (str, start, end);
426 return scm_i_substring (str, start, end);
427 }
428
429 SCM
430 scm_c_substring_read_only (SCM str, size_t start, size_t end)
431 {
432 validate_substring_args (str, start, end);
433 return scm_i_substring_read_only (str, start, end);
434 }
435
436 SCM
437 scm_c_substring_copy (SCM str, size_t start, size_t end)
438 {
439 validate_substring_args (str, start, end);
440 return scm_i_substring_copy (str, start, end);
441 }
442
443 SCM
444 scm_c_substring_shared (SCM str, size_t start, size_t end)
445 {
446 validate_substring_args (str, start, end);
447 return scm_i_substring_shared (str, start, end);
448 }
449
450 \f
451 /* Internal accessors
452 */
453
454 /* Returns the number of characters in STR. This may be different
455 than the memory size of the string storage. */
456 size_t
457 scm_i_string_length (SCM str)
458 {
459 return STRING_LENGTH (str);
460 }
461
462 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
463 encoding. False if it is 'wide', having a 32-bit UCS-4
464 encoding. */
465 int
466 scm_i_is_narrow_string (SCM str)
467 {
468 if (IS_SH_STRING (str))
469 str = SH_STRING_STRING (str);
470
471 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
472 }
473
474 /* Try to coerce a string to be narrow. It if is narrow already, do
475 nothing. If it is wide, shrink it to narrow if none of its
476 characters are above 0xFF. Return true if the string is narrow or
477 was made to be narrow. */
478 int
479 scm_i_try_narrow_string (SCM str)
480 {
481 if (IS_SH_STRING (str))
482 str = SH_STRING_STRING (str);
483
484 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
485
486 return scm_i_is_narrow_string (str);
487 }
488
489 /* Return a pointer to the raw data of the string, which can be either Latin-1
490 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
491 const void *
492 scm_i_string_data (SCM str)
493 {
494 SCM buf;
495 size_t start;
496 const char *data;
497
498 get_str_buf_start (&str, &buf, &start);
499
500 data = STRINGBUF_CONTENTS (buf);
501 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
502
503 return data;
504 }
505
506 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
507 STR. */
508 const char *
509 scm_i_string_chars (SCM str)
510 {
511 SCM buf;
512 size_t start;
513 get_str_buf_start (&str, &buf, &start);
514 if (scm_i_is_narrow_string (str))
515 return (const char *) STRINGBUF_CHARS (buf) + start;
516 else
517 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
518 scm_list_1 (str));
519 return NULL;
520 }
521
522 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
523 STR. */
524 const scm_t_wchar *
525 scm_i_string_wide_chars (SCM str)
526 {
527 SCM buf;
528 size_t start;
529
530 get_str_buf_start (&str, &buf, &start);
531 if (!scm_i_is_narrow_string (str))
532 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
533 else
534 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
535 scm_list_1 (str));
536 }
537
538 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
539 a new string buffer, so that it can be modified without modifying
540 other strings. Also, lock the string mutex. Later, one must call
541 scm_i_string_stop_writing to unlock the mutex. */
542 SCM
543 scm_i_string_start_writing (SCM orig_str)
544 {
545 SCM buf, str = orig_str;
546 size_t start;
547
548 get_str_buf_start (&str, &buf, &start);
549 if (IS_RO_STRING (str))
550 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
551
552 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
553 if (STRINGBUF_SHARED (buf))
554 {
555 /* Clone the stringbuf. */
556 size_t len = STRING_LENGTH (str);
557 SCM new_buf;
558
559 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
560
561 if (scm_i_is_narrow_string (str))
562 {
563 new_buf = make_stringbuf (len);
564 memcpy (STRINGBUF_CHARS (new_buf),
565 STRINGBUF_CHARS (buf) + STRING_START (str), len);
566
567 }
568 else
569 {
570 new_buf = make_wide_stringbuf (len);
571 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
572 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
573 + STRING_START (str)), len);
574 }
575
576 SET_STRING_STRINGBUF (str, new_buf);
577 start -= STRING_START (str);
578
579 /* FIXME: The following operations are not atomic, so other threads
580 looking at STR may see an inconsistent state. Nevertheless it can't
581 hurt much since (i) accessing STR while it is being mutated can't
582 yield a crash, and (ii) concurrent accesses to STR should be
583 protected by a mutex at the application level. The latter may not
584 apply when STR != ORIG_STR, though. */
585 SET_STRING_START (str, 0);
586 SET_STRING_STRINGBUF (str, new_buf);
587
588 buf = new_buf;
589
590 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
591 }
592 return orig_str;
593 }
594
595 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
596 char *
597 scm_i_string_writable_chars (SCM str)
598 {
599 SCM buf;
600 size_t start;
601
602 get_str_buf_start (&str, &buf, &start);
603 if (scm_i_is_narrow_string (str))
604 return (char *) STRINGBUF_CHARS (buf) + start;
605 else
606 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
607 scm_list_1 (str));
608 return NULL;
609 }
610
611 /* Return a pointer to the UCS-4 codepoints of a string. */
612 static scm_t_wchar *
613 scm_i_string_writable_wide_chars (SCM str)
614 {
615 SCM buf;
616 size_t start;
617
618 get_str_buf_start (&str, &buf, &start);
619 if (!scm_i_is_narrow_string (str))
620 return STRINGBUF_WIDE_CHARS (buf) + start;
621 else
622 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
623 scm_list_1 (str));
624 }
625
626 /* Unlock the string mutex that was locked when
627 scm_i_string_start_writing was called. */
628 void
629 scm_i_string_stop_writing (void)
630 {
631 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
632 }
633
634 /* Return the Xth character of STR as a UCS-4 codepoint. */
635 scm_t_wchar
636 scm_i_string_ref (SCM str, size_t x)
637 {
638 if (scm_i_is_narrow_string (str))
639 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
640 else
641 return scm_i_string_wide_chars (str)[x];
642 }
643
644 /* Returns index+1 of the first char in STR that matches C, or
645 0 if the char is not found. */
646 int
647 scm_i_string_contains_char (SCM str, char ch)
648 {
649 size_t i;
650 size_t len = scm_i_string_length (str);
651
652 i = 0;
653 if (scm_i_is_narrow_string (str))
654 {
655 while (i < len)
656 {
657 if (scm_i_string_chars (str)[i] == ch)
658 return i+1;
659 i++;
660 }
661 }
662 else
663 {
664 while (i < len)
665 {
666 if (scm_i_string_wide_chars (str)[i]
667 == (unsigned char) ch)
668 return i+1;
669 i++;
670 }
671 }
672 return 0;
673 }
674
675 int
676 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
677 {
678 if (scm_i_is_narrow_string (sstr))
679 {
680 const char *a = scm_i_string_chars (sstr) + start_x;
681 const char *b = cstr;
682 return strncmp (a, b, strlen(b));
683 }
684 else
685 {
686 size_t i;
687 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
688 const char *b = cstr;
689 for (i = 0; i < strlen (b); i++)
690 {
691 if (a[i] != (unsigned char) b[i])
692 return 1;
693 }
694 }
695 return 0;
696 }
697
698 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
699 void
700 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
701 {
702 if (IS_SH_STRING (str))
703 {
704 p += STRING_START (str);
705 str = SH_STRING_STRING (str);
706 }
707
708 if (chr > 0xFF && scm_i_is_narrow_string (str))
709 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
710
711 if (scm_i_is_narrow_string (str))
712 {
713 char *dst = scm_i_string_writable_chars (str);
714 dst[p] = chr;
715 }
716 else
717 {
718 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
719 dst[p] = chr;
720 }
721 }
722
723 \f
724 /* Symbols.
725
726 Basic symbol creation and accessing is done here, the rest is in
727 symbols.[hc]. This has been done to keep stringbufs and the
728 internals of strings and string-like objects confined to this file.
729 */
730
731 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
732
733 SCM
734 scm_i_make_symbol (SCM name, scm_t_bits flags,
735 unsigned long hash, SCM props)
736 {
737 SCM buf;
738 size_t start = STRING_START (name);
739 size_t length = STRING_LENGTH (name);
740
741 if (IS_SH_STRING (name))
742 {
743 name = SH_STRING_STRING (name);
744 start += STRING_START (name);
745 }
746 buf = SYMBOL_STRINGBUF (name);
747
748 if (start == 0 && length == STRINGBUF_LENGTH (buf))
749 {
750 /* reuse buf. */
751 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
752 SET_STRINGBUF_SHARED (buf);
753 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
754 }
755 else
756 {
757 /* make new buf. */
758 if (scm_i_is_narrow_string (name))
759 {
760 SCM new_buf = make_stringbuf (length);
761 memcpy (STRINGBUF_CHARS (new_buf),
762 STRINGBUF_CHARS (buf) + start, length);
763 buf = new_buf;
764 }
765 else
766 {
767 SCM new_buf = make_wide_stringbuf (length);
768 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
769 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
770 length);
771 buf = new_buf;
772 }
773 }
774 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
775 (scm_t_bits) hash, SCM_UNPACK (props));
776 }
777
778 SCM
779 scm_i_c_make_symbol (const char *name, size_t len,
780 scm_t_bits flags, unsigned long hash, SCM props)
781 {
782 SCM buf = make_stringbuf (len);
783 memcpy (STRINGBUF_CHARS (buf), name, len);
784
785 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
786 (scm_t_bits) hash, SCM_UNPACK (props));
787 }
788
789 /* Returns the number of characters in SYM. This may be different
790 from the memory size of SYM. */
791 size_t
792 scm_i_symbol_length (SCM sym)
793 {
794 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
795 }
796
797 size_t
798 scm_c_symbol_length (SCM sym)
799 #define FUNC_NAME "scm_c_symbol_length"
800 {
801 SCM_VALIDATE_SYMBOL (1, sym);
802
803 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
804 }
805 #undef FUNC_NAME
806
807 /* True if the name of SYM is stored as a Latin-1 encoded string.
808 False if it is stored as a 32-bit UCS-4-encoded string. */
809 int
810 scm_i_is_narrow_symbol (SCM sym)
811 {
812 SCM buf;
813
814 buf = SYMBOL_STRINGBUF (sym);
815 return !STRINGBUF_WIDE (buf);
816 }
817
818 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
819 contains the name of SYM. */
820 const char *
821 scm_i_symbol_chars (SCM sym)
822 {
823 SCM buf;
824
825 buf = SYMBOL_STRINGBUF (sym);
826 if (!STRINGBUF_WIDE (buf))
827 return (const char *) STRINGBUF_CHARS (buf);
828 else
829 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
830 scm_list_1 (sym));
831 }
832
833 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
834 symbol's name. */
835 const scm_t_wchar *
836 scm_i_symbol_wide_chars (SCM sym)
837 {
838 SCM buf;
839
840 buf = SYMBOL_STRINGBUF (sym);
841 if (STRINGBUF_WIDE (buf))
842 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
843 else
844 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
845 scm_list_1 (sym));
846 }
847
848 SCM
849 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
850 {
851 SCM buf = SYMBOL_STRINGBUF (sym);
852 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
853 SET_STRINGBUF_SHARED (buf);
854 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
855 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
856 (scm_t_bits)start, (scm_t_bits) end - start);
857 }
858
859 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
860 scm_t_wchar
861 scm_i_symbol_ref (SCM sym, size_t x)
862 {
863 if (scm_i_is_narrow_symbol (sym))
864 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
865 else
866 return scm_i_symbol_wide_chars (sym)[x];
867 }
868
869 /* Debugging
870 */
871
872 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
873 "Returns an association list containing debugging information\n"
874 "for @var{str}. The association list has the following entries."
875 "@table @code\n"
876 "@item string\n"
877 "The string itself.\n"
878 "@item start\n"
879 "The start index of the string into its stringbuf\n"
880 "@item length\n"
881 "The length of the string\n"
882 "@item shared\n"
883 "If this string is a substring, it returns its parent string.\n"
884 "Otherwise, it returns @code{#f}\n"
885 "@item read-only\n"
886 "@code{#t} if the string is read-only\n"
887 "@item stringbuf-chars\n"
888 "A new string containing this string's stringbuf's characters\n"
889 "@item stringbuf-length\n"
890 "The number of characters in this stringbuf\n"
891 "@item stringbuf-shared\n"
892 "@code{#t} if this stringbuf is shared\n"
893 "@item stringbuf-wide\n"
894 "@code{#t} if this stringbuf's characters are stored in a\n"
895 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
896 "buffer\n"
897 "@end table")
898 #define FUNC_NAME s_scm_sys_string_dump
899 {
900 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
901 SCM buf;
902 SCM_VALIDATE_STRING (1, str);
903
904 /* String info */
905 e1 = scm_cons (scm_from_latin1_symbol ("string"),
906 str);
907 e2 = scm_cons (scm_from_latin1_symbol ("start"),
908 scm_from_size_t (STRING_START (str)));
909 e3 = scm_cons (scm_from_latin1_symbol ("length"),
910 scm_from_size_t (STRING_LENGTH (str)));
911
912 if (IS_SH_STRING (str))
913 {
914 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
915 SH_STRING_STRING (str));
916 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
917 }
918 else
919 {
920 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
921 SCM_BOOL_F);
922 buf = STRING_STRINGBUF (str);
923 }
924
925 if (IS_RO_STRING (str))
926 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
927 SCM_BOOL_T);
928 else
929 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
930 SCM_BOOL_F);
931
932 /* Stringbuf info */
933 if (!STRINGBUF_WIDE (buf))
934 {
935 size_t len = STRINGBUF_LENGTH (buf);
936 char *cbuf;
937 SCM sbc = scm_i_make_string (len, &cbuf, 0);
938 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
939 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
940 sbc);
941 }
942 else
943 {
944 size_t len = STRINGBUF_LENGTH (buf);
945 scm_t_wchar *cbuf;
946 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
947 u32_cpy ((scm_t_uint32 *) cbuf,
948 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
949 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
950 sbc);
951 }
952 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
953 scm_from_size_t (STRINGBUF_LENGTH (buf)));
954 if (STRINGBUF_SHARED (buf))
955 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
956 SCM_BOOL_T);
957 else
958 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
959 SCM_BOOL_F);
960 if (STRINGBUF_WIDE (buf))
961 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
962 SCM_BOOL_T);
963 else
964 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
965 SCM_BOOL_F);
966
967 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
968 }
969 #undef FUNC_NAME
970
971 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
972 "Returns an association list containing debugging information\n"
973 "for @var{sym}. The association list has the following entries."
974 "@table @code\n"
975 "@item symbol\n"
976 "The symbol itself\n"
977 "@item hash\n"
978 "Its hash value\n"
979 "@item interned\n"
980 "@code{#t} if it is an interned symbol\n"
981 "@item stringbuf-chars\n"
982 "A new string containing this symbols's stringbuf's characters\n"
983 "@item stringbuf-length\n"
984 "The number of characters in this stringbuf\n"
985 "@item stringbuf-shared\n"
986 "@code{#t} if this stringbuf is shared\n"
987 "@item stringbuf-wide\n"
988 "@code{#t} if this stringbuf's characters are stored in a\n"
989 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
990 "buffer\n"
991 "@end table")
992 #define FUNC_NAME s_scm_sys_symbol_dump
993 {
994 SCM e1, e2, e3, e4, e5, e6, e7;
995 SCM buf;
996 SCM_VALIDATE_SYMBOL (1, sym);
997 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
998 sym);
999 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1000 scm_from_ulong (scm_i_symbol_hash (sym)));
1001 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1002 scm_symbol_interned_p (sym));
1003 buf = SYMBOL_STRINGBUF (sym);
1004
1005 /* Stringbuf info */
1006 if (!STRINGBUF_WIDE (buf))
1007 {
1008 size_t len = STRINGBUF_LENGTH (buf);
1009 char *cbuf;
1010 SCM sbc = scm_i_make_string (len, &cbuf, 0);
1011 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1012 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1013 sbc);
1014 }
1015 else
1016 {
1017 size_t len = STRINGBUF_LENGTH (buf);
1018 scm_t_wchar *cbuf;
1019 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1020 u32_cpy ((scm_t_uint32 *) cbuf,
1021 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1022 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1023 sbc);
1024 }
1025 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1026 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1027 if (STRINGBUF_SHARED (buf))
1028 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1029 SCM_BOOL_T);
1030 else
1031 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1032 SCM_BOOL_F);
1033 if (STRINGBUF_WIDE (buf))
1034 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1035 SCM_BOOL_T);
1036 else
1037 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1038 SCM_BOOL_F);
1039 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1040
1041 }
1042 #undef FUNC_NAME
1043
1044 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1045
1046 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1047 #define FUNC_NAME s_scm_sys_stringbuf_hist
1048 {
1049 int i;
1050 for (i = 0; i < 1000; i++)
1051 if (lenhist[i])
1052 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1053 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1054 return SCM_UNSPECIFIED;
1055 }
1056 #undef FUNC_NAME
1057
1058 #endif
1059
1060 \f
1061
1062 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1063 (SCM obj),
1064 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1065 #define FUNC_NAME s_scm_string_p
1066 {
1067 return scm_from_bool (IS_STRING (obj));
1068 }
1069 #undef FUNC_NAME
1070
1071
1072 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1073
1074 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1075 (SCM chrs),
1076 "@deffnx {Scheme Procedure} list->string chrs\n"
1077 "Return a newly allocated string composed of the arguments,\n"
1078 "@var{chrs}.")
1079 #define FUNC_NAME s_scm_string
1080 {
1081 SCM result = SCM_BOOL_F;
1082 SCM rest;
1083 size_t len;
1084 size_t p = 0;
1085 long i;
1086 int wide = 0;
1087
1088 /* Verify that this is a list of chars. */
1089 i = scm_ilength (chrs);
1090 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1091
1092 len = (size_t) i;
1093 rest = chrs;
1094
1095 while (len > 0 && scm_is_pair (rest))
1096 {
1097 SCM elt = SCM_CAR (rest);
1098 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1099 if (SCM_CHAR (elt) > 0xFF)
1100 wide = 1;
1101 rest = SCM_CDR (rest);
1102 len--;
1103 scm_remember_upto_here_1 (elt);
1104 }
1105
1106 /* Construct a string containing this list of chars. */
1107 len = (size_t) i;
1108 rest = chrs;
1109
1110 if (wide == 0)
1111 {
1112 char *buf;
1113
1114 result = scm_i_make_string (len, NULL, 0);
1115 result = scm_i_string_start_writing (result);
1116 buf = scm_i_string_writable_chars (result);
1117 while (len > 0 && scm_is_pair (rest))
1118 {
1119 SCM elt = SCM_CAR (rest);
1120 buf[p] = (unsigned char) SCM_CHAR (elt);
1121 p++;
1122 rest = SCM_CDR (rest);
1123 len--;
1124 scm_remember_upto_here_1 (elt);
1125 }
1126 }
1127 else
1128 {
1129 scm_t_wchar *buf;
1130
1131 result = scm_i_make_wide_string (len, NULL, 0);
1132 result = scm_i_string_start_writing (result);
1133 buf = scm_i_string_writable_wide_chars (result);
1134 while (len > 0 && scm_is_pair (rest))
1135 {
1136 SCM elt = SCM_CAR (rest);
1137 buf[p] = SCM_CHAR (elt);
1138 p++;
1139 rest = SCM_CDR (rest);
1140 len--;
1141 scm_remember_upto_here_1 (elt);
1142 }
1143 }
1144 scm_i_string_stop_writing ();
1145
1146 if (len > 0)
1147 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1148 if (!scm_is_null (rest))
1149 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1150
1151 return result;
1152 }
1153 #undef FUNC_NAME
1154
1155 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1156 (SCM k, SCM chr),
1157 "Return a newly allocated string of\n"
1158 "length @var{k}. If @var{chr} is given, then all elements of\n"
1159 "the string are initialized to @var{chr}, otherwise the contents\n"
1160 "of the @var{string} are all set to @var{#\nul}.")
1161 #define FUNC_NAME s_scm_make_string
1162 {
1163 return scm_c_make_string (scm_to_size_t (k), chr);
1164 }
1165 #undef FUNC_NAME
1166
1167 SCM
1168 scm_c_make_string (size_t len, SCM chr)
1169 #define FUNC_NAME NULL
1170 {
1171 size_t p;
1172 char *contents = NULL;
1173 SCM res = scm_i_make_string (len, &contents, 0);
1174
1175 /* If no char is given, initialize string contents to NULL. */
1176 if (SCM_UNBNDP (chr))
1177 memset (contents, 0, len);
1178 else
1179 {
1180 SCM_VALIDATE_CHAR (0, chr);
1181 res = scm_i_string_start_writing (res);
1182 for (p = 0; p < len; p++)
1183 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1184 scm_i_string_stop_writing ();
1185 }
1186
1187 return res;
1188 }
1189 #undef FUNC_NAME
1190
1191 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1192 (SCM string),
1193 "Return the number of characters in @var{string}.")
1194 #define FUNC_NAME s_scm_string_length
1195 {
1196 SCM_VALIDATE_STRING (1, string);
1197 return scm_from_size_t (STRING_LENGTH (string));
1198 }
1199 #undef FUNC_NAME
1200
1201 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1202 (SCM string),
1203 "Return the bytes used to represent a character in @var{string}."
1204 "This will return 1 or 4.")
1205 #define FUNC_NAME s_scm_string_bytes_per_char
1206 {
1207 SCM_VALIDATE_STRING (1, string);
1208 if (!scm_i_is_narrow_string (string))
1209 return scm_from_int (4);
1210
1211 return scm_from_int (1);
1212 }
1213 #undef FUNC_NAME
1214
1215 size_t
1216 scm_c_string_length (SCM string)
1217 {
1218 if (!IS_STRING (string))
1219 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1220 return STRING_LENGTH (string);
1221 }
1222
1223 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1224 (SCM str, SCM k),
1225 "Return character @var{k} of @var{str} using zero-origin\n"
1226 "indexing. @var{k} must be a valid index of @var{str}.")
1227 #define FUNC_NAME s_scm_string_ref
1228 {
1229 size_t len;
1230 unsigned long idx;
1231
1232 SCM_VALIDATE_STRING (1, str);
1233
1234 len = scm_i_string_length (str);
1235 if (SCM_LIKELY (len > 0))
1236 idx = scm_to_unsigned_integer (k, 0, len - 1);
1237 else
1238 scm_out_of_range (NULL, k);
1239
1240 if (scm_i_is_narrow_string (str))
1241 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1242 else
1243 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1244 }
1245 #undef FUNC_NAME
1246
1247 SCM
1248 scm_c_string_ref (SCM str, size_t p)
1249 {
1250 if (p >= scm_i_string_length (str))
1251 scm_out_of_range (NULL, scm_from_size_t (p));
1252 if (scm_i_is_narrow_string (str))
1253 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1254 else
1255 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1256
1257 }
1258
1259 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1260 (SCM str, SCM k, SCM chr),
1261 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1262 "an unspecified value. @var{k} must be a valid index of\n"
1263 "@var{str}.")
1264 #define FUNC_NAME s_scm_string_set_x
1265 {
1266 size_t len;
1267 unsigned long idx;
1268
1269 SCM_VALIDATE_STRING (1, str);
1270
1271 len = scm_i_string_length (str);
1272 if (SCM_LIKELY (len > 0))
1273 idx = scm_to_unsigned_integer (k, 0, len - 1);
1274 else
1275 scm_out_of_range (NULL, k);
1276
1277 SCM_VALIDATE_CHAR (3, chr);
1278 str = scm_i_string_start_writing (str);
1279 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1280 scm_i_string_stop_writing ();
1281
1282 return SCM_UNSPECIFIED;
1283 }
1284 #undef FUNC_NAME
1285
1286 void
1287 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1288 {
1289 if (p >= scm_i_string_length (str))
1290 scm_out_of_range (NULL, scm_from_size_t (p));
1291 str = scm_i_string_start_writing (str);
1292 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1293 scm_i_string_stop_writing ();
1294 }
1295
1296 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1297 (SCM str, SCM start, SCM end),
1298 "Return a newly allocated string formed from the characters\n"
1299 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1300 "ending with index @var{end} (exclusive).\n"
1301 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1302 "exact integers satisfying:\n\n"
1303 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1304 #define FUNC_NAME s_scm_substring
1305 {
1306 size_t len, from, to;
1307
1308 SCM_VALIDATE_STRING (1, str);
1309 len = scm_i_string_length (str);
1310 from = scm_to_unsigned_integer (start, 0, len);
1311 if (SCM_UNBNDP (end))
1312 to = len;
1313 else
1314 to = scm_to_unsigned_integer (end, from, len);
1315 return scm_i_substring (str, from, to);
1316 }
1317 #undef FUNC_NAME
1318
1319 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1320 (SCM str, SCM start, SCM end),
1321 "Return a newly allocated string formed from the characters\n"
1322 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1323 "ending with index @var{end} (exclusive).\n"
1324 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1325 "exact integers satisfying:\n"
1326 "\n"
1327 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1328 "\n"
1329 "The returned string is read-only.\n")
1330 #define FUNC_NAME s_scm_substring_read_only
1331 {
1332 size_t len, from, to;
1333
1334 SCM_VALIDATE_STRING (1, str);
1335 len = scm_i_string_length (str);
1336 from = scm_to_unsigned_integer (start, 0, len);
1337 if (SCM_UNBNDP (end))
1338 to = len;
1339 else
1340 to = scm_to_unsigned_integer (end, from, len);
1341 return scm_i_substring_read_only (str, from, to);
1342 }
1343 #undef FUNC_NAME
1344
1345 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1346 (SCM str, SCM start, SCM end),
1347 "Return a newly allocated string formed from the characters\n"
1348 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1349 "ending with index @var{end} (exclusive).\n"
1350 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1351 "exact integers satisfying:\n\n"
1352 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1353 #define FUNC_NAME s_scm_substring_copy
1354 {
1355 /* For the Scheme version, START is mandatory, but for the C
1356 version, it is optional. See scm_string_copy in srfi-13.c for a
1357 rationale.
1358 */
1359
1360 size_t from, to;
1361
1362 SCM_VALIDATE_STRING (1, str);
1363 scm_i_get_substring_spec (scm_i_string_length (str),
1364 start, &from, end, &to);
1365 return scm_i_substring_copy (str, from, to);
1366 }
1367 #undef FUNC_NAME
1368
1369 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1370 (SCM str, SCM start, SCM end),
1371 "Return string that indirectly refers to the characters\n"
1372 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1373 "ending with index @var{end} (exclusive).\n"
1374 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1375 "exact integers satisfying:\n\n"
1376 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1377 #define FUNC_NAME s_scm_substring_shared
1378 {
1379 size_t len, from, to;
1380
1381 SCM_VALIDATE_STRING (1, str);
1382 len = scm_i_string_length (str);
1383 from = scm_to_unsigned_integer (start, 0, len);
1384 if (SCM_UNBNDP (end))
1385 to = len;
1386 else
1387 to = scm_to_unsigned_integer (end, from, len);
1388 return scm_i_substring_shared (str, from, to);
1389 }
1390 #undef FUNC_NAME
1391
1392 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1393 (SCM args),
1394 "Return a newly allocated string whose characters form the\n"
1395 "concatenation of the given strings, @var{args}.")
1396 #define FUNC_NAME s_scm_string_append
1397 {
1398 SCM res;
1399 size_t len = 0;
1400 int wide = 0;
1401 SCM l, s;
1402 size_t i;
1403 union
1404 {
1405 char *narrow;
1406 scm_t_wchar *wide;
1407 } data;
1408
1409 SCM_VALIDATE_REST_ARGUMENT (args);
1410 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1411 {
1412 s = SCM_CAR (l);
1413 SCM_VALIDATE_STRING (SCM_ARGn, s);
1414 len += scm_i_string_length (s);
1415 if (!scm_i_is_narrow_string (s))
1416 wide = 1;
1417 }
1418 data.narrow = NULL;
1419 if (!wide)
1420 res = scm_i_make_string (len, &data.narrow, 0);
1421 else
1422 res = scm_i_make_wide_string (len, &data.wide, 0);
1423
1424 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1425 {
1426 size_t len;
1427 s = SCM_CAR (l);
1428 SCM_VALIDATE_STRING (SCM_ARGn, s);
1429 len = scm_i_string_length (s);
1430 if (!wide)
1431 {
1432 memcpy (data.narrow, scm_i_string_chars (s), len);
1433 data.narrow += len;
1434 }
1435 else
1436 {
1437 if (scm_i_is_narrow_string (s))
1438 {
1439 for (i = 0; i < scm_i_string_length (s); i++)
1440 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1441 }
1442 else
1443 u32_cpy ((scm_t_uint32 *) data.wide,
1444 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1445 data.wide += len;
1446 }
1447 scm_remember_upto_here_1 (s);
1448 }
1449 return res;
1450 }
1451 #undef FUNC_NAME
1452
1453
1454 \f
1455 /* Charset conversion error handling. */
1456
1457 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1458 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1459
1460 /* Raise an exception informing that character CHR could not be written
1461 to PORT in its current encoding. */
1462 void
1463 scm_encoding_error (const char *subr, int err, const char *message,
1464 SCM port, SCM chr)
1465 {
1466 scm_throw (scm_encoding_error_key,
1467 scm_list_n (scm_from_latin1_string (subr),
1468 scm_from_latin1_string (message),
1469 scm_from_int (err),
1470 port, chr,
1471 SCM_UNDEFINED));
1472 }
1473
1474 /* Raise an exception informing of an encoding error on PORT. This
1475 means that a character could not be written in PORT's encoding. */
1476 void
1477 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1478 {
1479 scm_throw (scm_decoding_error_key,
1480 scm_list_n (scm_from_latin1_string (subr),
1481 scm_from_latin1_string (message),
1482 scm_from_int (err),
1483 port,
1484 SCM_UNDEFINED));
1485 }
1486
1487 \f
1488 /* String conversion to/from C. */
1489
1490 static void
1491 decoding_error (const char *func_name, int errno_save,
1492 const char *str, size_t len)
1493 {
1494 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1495 handler. */
1496 SCM bv;
1497 signed char *buf;
1498
1499 buf = scm_gc_malloc_pointerless (len, "bytevector");
1500 memcpy (buf, str, len);
1501 bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1502
1503 scm_decoding_error (func_name, errno_save,
1504 "input locale conversion error", bv);
1505 }
1506
1507 SCM
1508 scm_from_stringn (const char *str, size_t len, const char *encoding,
1509 scm_t_string_failed_conversion_handler handler)
1510 {
1511 size_t u32len, i;
1512 scm_t_wchar *u32;
1513 int wide = 0;
1514 SCM res;
1515
1516 /* The order of these checks is important. */
1517 if (!str && len != 0)
1518 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1519 if (len == (size_t) -1)
1520 len = strlen (str);
1521
1522 if (encoding == NULL || len == 0)
1523 {
1524 /* If encoding is null (or the string is empty), use Latin-1. */
1525 char *buf;
1526 res = scm_i_make_string (len, &buf, 0);
1527 memcpy (buf, str, len);
1528 return res;
1529 }
1530
1531 u32len = 0;
1532 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1533 (enum iconv_ilseq_handler)
1534 handler,
1535 str, len,
1536 NULL,
1537 NULL, &u32len);
1538
1539 if (SCM_UNLIKELY (u32 == NULL))
1540 decoding_error (__func__, errno, str, len);
1541
1542 i = 0;
1543 while (i < u32len)
1544 if (u32[i++] > 0xFF)
1545 {
1546 wide = 1;
1547 break;
1548 }
1549
1550 if (!wide)
1551 {
1552 char *dst;
1553 res = scm_i_make_string (u32len, &dst, 0);
1554 for (i = 0; i < u32len; i ++)
1555 dst[i] = (unsigned char) u32[i];
1556 dst[u32len] = '\0';
1557 }
1558 else
1559 {
1560 scm_t_wchar *wdst;
1561 res = scm_i_make_wide_string (u32len, &wdst, 0);
1562 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1563 wdst[u32len] = 0;
1564 }
1565
1566 free (u32);
1567 return res;
1568 }
1569
1570 SCM
1571 scm_from_locale_string (const char *str)
1572 {
1573 return scm_from_locale_stringn (str, -1);
1574 }
1575
1576 SCM
1577 scm_from_locale_stringn (const char *str, size_t len)
1578 {
1579 return scm_from_stringn (str, len, locale_charset (),
1580 scm_i_get_conversion_strategy (SCM_BOOL_F));
1581 }
1582
1583 SCM
1584 scm_from_latin1_string (const char *str)
1585 {
1586 return scm_from_latin1_stringn (str, -1);
1587 }
1588
1589 SCM
1590 scm_from_latin1_stringn (const char *str, size_t len)
1591 {
1592 char *buf;
1593 SCM result;
1594
1595 if (len == (size_t) -1)
1596 len = strlen (str);
1597
1598 /* Make a narrow string and copy STR as is. */
1599 result = scm_i_make_string (len, &buf, 0);
1600 memcpy (buf, str, len);
1601
1602 return result;
1603 }
1604
1605 SCM
1606 scm_from_utf8_string (const char *str)
1607 {
1608 return scm_from_utf8_stringn (str, -1);
1609 }
1610
1611 SCM
1612 scm_from_utf8_stringn (const char *str, size_t len)
1613 {
1614 size_t i, char_len;
1615 const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1616 int ascii = 1, narrow = 1;
1617 SCM res;
1618
1619 if (len == (size_t) -1)
1620 len = strlen (str);
1621
1622 i = 0;
1623 char_len = 0;
1624
1625 while (i < len)
1626 {
1627 if (ustr[i] <= 127)
1628 {
1629 char_len++;
1630 i++;
1631 }
1632 else
1633 {
1634 ucs4_t c;
1635 int nbytes;
1636
1637 ascii = 0;
1638
1639 nbytes = u8_mbtouc (&c, ustr + i, len - i);
1640
1641 if (nbytes < 0)
1642 /* Bad UTF-8. */
1643 decoding_error (__func__, errno, str, len);
1644
1645 if (c > 255)
1646 narrow = 0;
1647
1648 char_len++;
1649 i += nbytes;
1650 }
1651 }
1652
1653 if (ascii)
1654 {
1655 char *dst;
1656 res = scm_i_make_string (char_len, &dst, 0);
1657 memcpy (dst, str, len);
1658 }
1659 else if (narrow)
1660 {
1661 char *dst;
1662 size_t j;
1663 ucs4_t c;
1664
1665 res = scm_i_make_string (char_len, &dst, 0);
1666
1667 for (i = 0, j = 0; i < len; i++, j++)
1668 {
1669 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1670 dst[j] = (signed char) c;
1671 }
1672 }
1673 else
1674 {
1675 scm_t_wchar *dst;
1676 size_t j;
1677 ucs4_t c;
1678
1679 res = scm_i_make_wide_string (char_len, &dst, 0);
1680
1681 for (i = 0, j = 0; i < len; i++, j++)
1682 {
1683 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1684 dst[j] = c;
1685 }
1686 }
1687
1688 return res;
1689 }
1690
1691 SCM
1692 scm_from_utf32_string (const scm_t_wchar *str)
1693 {
1694 return scm_from_utf32_stringn (str, -1);
1695 }
1696
1697 SCM
1698 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1699 {
1700 SCM result;
1701 scm_t_wchar *buf;
1702
1703 if (len == (size_t) -1)
1704 len = u32_strlen ((uint32_t *) str);
1705
1706 result = scm_i_make_wide_string (len, &buf, 0);
1707 memcpy (buf, str, len * sizeof (scm_t_wchar));
1708 scm_i_try_narrow_string (result);
1709
1710 return result;
1711 }
1712
1713 /* Create a new scheme string from the C string STR. The memory of
1714 STR may be used directly as storage for the new string. */
1715 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1716 would be to register a finalizer to eventually free(3) STR, which isn't
1717 worth it. Should we just deprecate the `scm_take_' functions? */
1718 SCM
1719 scm_take_locale_stringn (char *str, size_t len)
1720 {
1721 SCM res;
1722
1723 res = scm_from_locale_stringn (str, len);
1724 free (str);
1725
1726 return res;
1727 }
1728
1729 SCM
1730 scm_take_locale_string (char *str)
1731 {
1732 return scm_take_locale_stringn (str, -1);
1733 }
1734
1735 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1736 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1737 Set *LENP to the size of the resulting string.
1738
1739 FIXME: This is a hack we should get rid of. See
1740 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1741 for details. */
1742 static void
1743 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1744 {
1745 char *before, *after;
1746 size_t i, j;
1747
1748 before = buf;
1749 after = buf;
1750 i = 0;
1751 j = 0;
1752 while (i < *lenp)
1753 {
1754 if ((i <= *lenp - 6)
1755 && before[i] == '\\'
1756 && before[i + 1] == 'u'
1757 && before[i + 2] == '0' && before[i + 3] == '0')
1758 {
1759 /* Convert \u00NN to \xNN */
1760 after[j] = '\\';
1761 after[j + 1] = 'x';
1762 after[j + 2] = tolower ((int) before[i + 4]);
1763 after[j + 3] = tolower ((int) before[i + 5]);
1764 i += 6;
1765 j += 4;
1766 }
1767 else if ((i <= *lenp - 10)
1768 && before[i] == '\\'
1769 && before[i + 1] == 'U'
1770 && before[i + 2] == '0' && before[i + 3] == '0')
1771 {
1772 /* Convert \U00NNNNNN to \UNNNNNN */
1773 after[j] = '\\';
1774 after[j + 1] = 'U';
1775 after[j + 2] = tolower ((int) before[i + 4]);
1776 after[j + 3] = tolower ((int) before[i + 5]);
1777 after[j + 4] = tolower ((int) before[i + 6]);
1778 after[j + 5] = tolower ((int) before[i + 7]);
1779 after[j + 6] = tolower ((int) before[i + 8]);
1780 after[j + 7] = tolower ((int) before[i + 9]);
1781 i += 10;
1782 j += 8;
1783 }
1784 else
1785 {
1786 after[j] = before[i];
1787 i++;
1788 j++;
1789 }
1790 }
1791 *lenp = j;
1792 }
1793
1794 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1795 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1796 of the resulting string. BUF must be large enough to handle the
1797 worst case when `\uXXXX' escapes (6 characters) are replaced by
1798 `\xXXXX;' (7 characters). */
1799 static void
1800 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1801 {
1802 char *before, *after;
1803 size_t i, j;
1804 /* The worst case is if the input string contains all 4-digit hex escapes.
1805 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1806 size_t max_out_len = (*lenp * 7) / 6 + 1;
1807 size_t nzeros, ndigits;
1808
1809 before = buf;
1810 after = alloca (max_out_len);
1811 i = 0;
1812 j = 0;
1813 while (i < *lenp)
1814 {
1815 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1816 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1817 {
1818 if (before[i + 1] == 'u')
1819 ndigits = 4;
1820 else if (before[i + 1] == 'U')
1821 ndigits = 8;
1822 else
1823 abort ();
1824
1825 /* Add the R6RS hex escape initial sequence. */
1826 after[j] = '\\';
1827 after[j + 1] = 'x';
1828
1829 /* Move string positions to the start of the hex numbers. */
1830 i += 2;
1831 j += 2;
1832
1833 /* Find the number of initial zeros in this hex number. */
1834 nzeros = 0;
1835 while (before[i + nzeros] == '0' && nzeros < ndigits)
1836 nzeros++;
1837
1838 /* Copy the number, skipping initial zeros, and then move the string
1839 positions. */
1840 if (nzeros == ndigits)
1841 {
1842 after[j] = '0';
1843 i += ndigits;
1844 j += 1;
1845 }
1846 else
1847 {
1848 int pos;
1849 for (pos = 0; pos < ndigits - nzeros; pos++)
1850 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1851 i += ndigits;
1852 j += (ndigits - nzeros);
1853 }
1854
1855 /* Add terminating semicolon. */
1856 after[j] = ';';
1857 j++;
1858 }
1859 else
1860 {
1861 after[j] = before[i];
1862 i++;
1863 j++;
1864 }
1865 }
1866 *lenp = j;
1867 memcpy (before, after, j);
1868 }
1869
1870 char *
1871 scm_to_locale_string (SCM str)
1872 {
1873 return scm_to_locale_stringn (str, NULL);
1874 }
1875
1876 char *
1877 scm_to_locale_stringn (SCM str, size_t *lenp)
1878 {
1879 return scm_to_stringn (str, lenp,
1880 locale_charset (),
1881 scm_i_get_conversion_strategy (SCM_BOOL_F));
1882 }
1883
1884 char *
1885 scm_to_latin1_string (SCM str)
1886 {
1887 return scm_to_latin1_stringn (str, NULL);
1888 }
1889
1890 char *
1891 scm_to_latin1_stringn (SCM str, size_t *lenp)
1892 #define FUNC_NAME "scm_to_latin1_stringn"
1893 {
1894 char *result;
1895
1896 SCM_VALIDATE_STRING (1, str);
1897
1898 if (scm_i_is_narrow_string (str))
1899 {
1900 size_t len = scm_i_string_length (str);
1901
1902 if (lenp)
1903 *lenp = len;
1904
1905 result = scm_strndup (scm_i_string_data (str), len);
1906 }
1907 else
1908 result = scm_to_stringn (str, lenp, NULL,
1909 SCM_FAILED_CONVERSION_ERROR);
1910
1911 return result;
1912 }
1913 #undef FUNC_NAME
1914
1915 char *
1916 scm_to_utf8_string (SCM str)
1917 {
1918 return scm_to_utf8_stringn (str, NULL);
1919 }
1920
1921 char *
1922 scm_to_utf8_stringn (SCM str, size_t *lenp)
1923 {
1924 return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1925 }
1926
1927 scm_t_wchar *
1928 scm_to_utf32_string (SCM str)
1929 {
1930 return scm_to_utf32_stringn (str, NULL);
1931 }
1932
1933 scm_t_wchar *
1934 scm_to_utf32_stringn (SCM str, size_t *lenp)
1935 #define FUNC_NAME "scm_to_utf32_stringn"
1936 {
1937 scm_t_wchar *result;
1938
1939 SCM_VALIDATE_STRING (1, str);
1940
1941 if (scm_i_is_narrow_string (str))
1942 result = (scm_t_wchar *)
1943 scm_to_stringn (str, lenp, "UTF-32",
1944 SCM_FAILED_CONVERSION_ERROR);
1945 else
1946 {
1947 size_t len;
1948
1949 len = scm_i_string_length (str);
1950 if (lenp)
1951 *lenp = len;
1952
1953 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1954 memcpy (result, scm_i_string_wide_chars (str),
1955 len * sizeof (scm_t_wchar));
1956 result[len] = 0;
1957 }
1958
1959 return result;
1960 }
1961 #undef FUNC_NAME
1962
1963 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1964 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1965 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1966 defined by HANDLER. */
1967 char *
1968 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1969 scm_t_string_failed_conversion_handler handler)
1970 {
1971 char *buf;
1972 size_t ilen, len, i;
1973 int ret;
1974 const char *enc;
1975
1976 if (!scm_is_string (str))
1977 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1978 ilen = scm_i_string_length (str);
1979
1980 if (ilen == 0)
1981 {
1982 buf = scm_malloc (1);
1983 buf[0] = '\0';
1984 if (lenp)
1985 *lenp = 0;
1986 return buf;
1987 }
1988
1989 if (lenp == NULL)
1990 for (i = 0; i < ilen; i++)
1991 if (scm_i_string_ref (str, i) == '\0')
1992 scm_misc_error (NULL,
1993 "string contains #\\nul character: ~S",
1994 scm_list_1 (str));
1995
1996 if (scm_i_is_narrow_string (str) && (encoding == NULL))
1997 {
1998 /* If using native Latin-1 encoding, just copy the string
1999 contents. */
2000 if (lenp)
2001 {
2002 buf = scm_malloc (ilen);
2003 memcpy (buf, scm_i_string_chars (str), ilen);
2004 *lenp = ilen;
2005 return buf;
2006 }
2007 else
2008 {
2009 buf = scm_malloc (ilen + 1);
2010 memcpy (buf, scm_i_string_chars (str), ilen);
2011 buf[ilen] = '\0';
2012 return buf;
2013 }
2014 }
2015
2016
2017 buf = NULL;
2018 len = 0;
2019 enc = encoding;
2020 if (enc == NULL)
2021 enc = "ISO-8859-1";
2022 if (scm_i_is_narrow_string (str))
2023 {
2024 ret = mem_iconveh (scm_i_string_chars (str), ilen,
2025 "ISO-8859-1", enc,
2026 (enum iconv_ilseq_handler) handler, NULL,
2027 &buf, &len);
2028
2029 if (ret != 0)
2030 scm_encoding_error (__func__, errno,
2031 "cannot convert narrow string to output locale",
2032 SCM_BOOL_F,
2033 /* FIXME: Faulty character unknown. */
2034 SCM_BOOL_F);
2035 }
2036 else
2037 {
2038 buf = u32_conv_to_encoding (enc,
2039 (enum iconv_ilseq_handler) handler,
2040 (scm_t_uint32 *) scm_i_string_wide_chars (str),
2041 ilen,
2042 NULL,
2043 NULL, &len);
2044 if (buf == NULL)
2045 scm_encoding_error (__func__, errno,
2046 "cannot convert wide string to output locale",
2047 SCM_BOOL_F,
2048 /* FIXME: Faulty character unknown. */
2049 SCM_BOOL_F);
2050 }
2051 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2052 {
2053 if (SCM_R6RS_ESCAPES_P)
2054 {
2055 /* The worst case is if the input string contains all 4-digit
2056 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2057 (seven characters). Make BUF large enough to hold
2058 that. */
2059 buf = scm_realloc (buf, (len * 7) / 6 + 1);
2060 unistring_escapes_to_r6rs_escapes (buf, &len);
2061 }
2062 else
2063 unistring_escapes_to_guile_escapes (buf, &len);
2064
2065 buf = scm_realloc (buf, len);
2066 }
2067 if (lenp)
2068 *lenp = len;
2069 else
2070 {
2071 buf = scm_realloc (buf, len + 1);
2072 buf[len] = '\0';
2073 }
2074
2075 scm_remember_upto_here_1 (str);
2076 return buf;
2077 }
2078
2079 size_t
2080 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2081 {
2082 size_t len;
2083 char *result = NULL;
2084 if (!scm_is_string (str))
2085 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2086 result = scm_to_locale_stringn (str, &len);
2087
2088 memcpy (buf, result, (len > max_len) ? max_len : len);
2089 free (result);
2090
2091 scm_remember_upto_here_1 (str);
2092 return len;
2093 }
2094
2095 \f
2096 /* Unicode string normalization. */
2097
2098 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2099 libguile/i18n.c. It would be useful to have this factored out into a more
2100 convenient location, but its use of alloca makes that tricky to do. */
2101
2102 static SCM
2103 normalize_str (SCM string, uninorm_t form)
2104 {
2105 SCM ret;
2106 scm_t_uint32 *w_str;
2107 scm_t_wchar *cbuf;
2108 size_t rlen, len = scm_i_string_length (string);
2109
2110 if (scm_i_is_narrow_string (string))
2111 {
2112 size_t i;
2113 const char *buf = scm_i_string_chars (string);
2114
2115 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2116
2117 for (i = 0; i < len; i ++)
2118 w_str[i] = (unsigned char) buf[i];
2119 w_str[len] = 0;
2120 }
2121 else
2122 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2123
2124 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2125
2126 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2127 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2128 free (w_str);
2129
2130 scm_i_try_narrow_string (ret);
2131
2132 return ret;
2133 }
2134
2135 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2136 (SCM string),
2137 "Returns the NFC normalized form of @var{string}.")
2138 #define FUNC_NAME s_scm_string_normalize_nfc
2139 {
2140 SCM_VALIDATE_STRING (1, string);
2141 return normalize_str (string, UNINORM_NFC);
2142 }
2143 #undef FUNC_NAME
2144
2145 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2146 (SCM string),
2147 "Returns the NFD normalized form of @var{string}.")
2148 #define FUNC_NAME s_scm_string_normalize_nfd
2149 {
2150 SCM_VALIDATE_STRING (1, string);
2151 return normalize_str (string, UNINORM_NFD);
2152 }
2153 #undef FUNC_NAME
2154
2155 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2156 (SCM string),
2157 "Returns the NFKC normalized form of @var{string}.")
2158 #define FUNC_NAME s_scm_string_normalize_nfkc
2159 {
2160 SCM_VALIDATE_STRING (1, string);
2161 return normalize_str (string, UNINORM_NFKC);
2162 }
2163 #undef FUNC_NAME
2164
2165 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2166 (SCM string),
2167 "Returns the NFKD normalized form of @var{string}.")
2168 #define FUNC_NAME s_scm_string_normalize_nfkd
2169 {
2170 SCM_VALIDATE_STRING (1, string);
2171 return normalize_str (string, UNINORM_NFKD);
2172 }
2173 #undef FUNC_NAME
2174
2175 /* converts C scm_array of strings to SCM scm_list of strings.
2176 If argc < 0, a null terminated scm_array is assumed.
2177 The current locale encoding is assumed */
2178 SCM
2179 scm_makfromstrs (int argc, char **argv)
2180 {
2181 int i = argc;
2182 SCM lst = SCM_EOL;
2183 if (0 > i)
2184 for (i = 0; argv[i]; i++);
2185 while (i--)
2186 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2187 return lst;
2188 }
2189
2190 /* Return a newly allocated array of char pointers to each of the strings
2191 in args, with a terminating NULL pointer. The strings are encoded using
2192 the current locale. */
2193
2194 char **
2195 scm_i_allocate_string_pointers (SCM list)
2196 #define FUNC_NAME "scm_i_allocate_string_pointers"
2197 {
2198 char **result;
2199 int list_len = scm_ilength (list);
2200 int i;
2201
2202 if (list_len < 0)
2203 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2204
2205 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2206 "string pointers");
2207 result[list_len] = NULL;
2208
2209 /* The list might have been modified in another thread, so
2210 we check LIST before each access.
2211 */
2212 for (i = 0; i < list_len && scm_is_pair (list); i++)
2213 {
2214 SCM str = SCM_CAR (list);
2215 size_t len; /* String length in bytes */
2216 char *c_str = scm_to_locale_stringn (str, &len);
2217
2218 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2219 scm_malloc to allocate the returned string, which must be
2220 explicitly deallocated. This forces us to copy the string a
2221 second time into a new buffer. Ideally there would be variants
2222 of scm_to_*_stringn that can return garbage-collected buffers. */
2223
2224 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2225 memcpy (result[i], c_str, len);
2226 result[i][len] = '\0';
2227 free (c_str);
2228
2229 list = SCM_CDR (list);
2230 }
2231
2232 return result;
2233 }
2234 #undef FUNC_NAME
2235
2236 void
2237 scm_i_get_substring_spec (size_t len,
2238 SCM start, size_t *cstart,
2239 SCM end, size_t *cend)
2240 {
2241 if (SCM_UNBNDP (start))
2242 *cstart = 0;
2243 else
2244 *cstart = scm_to_unsigned_integer (start, 0, len);
2245
2246 if (SCM_UNBNDP (end))
2247 *cend = len;
2248 else
2249 *cend = scm_to_unsigned_integer (end, *cstart, len);
2250 }
2251
2252 static SCM
2253 string_handle_ref (scm_t_array_handle *h, size_t index)
2254 {
2255 return scm_c_string_ref (h->array, index);
2256 }
2257
2258 static void
2259 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2260 {
2261 scm_c_string_set_x (h->array, index, val);
2262 }
2263
2264 static void
2265 string_get_handle (SCM v, scm_t_array_handle *h)
2266 {
2267 h->array = v;
2268 h->ndims = 1;
2269 h->dims = &h->dim0;
2270 h->dim0.lbnd = 0;
2271 h->dim0.ubnd = scm_c_string_length (v) - 1;
2272 h->dim0.inc = 1;
2273 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2274 h->elements = h->writable_elements = NULL;
2275 }
2276
2277 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2278 string_handle_ref, string_handle_set,
2279 string_get_handle)
2280 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2281
2282 void
2283 scm_init_strings ()
2284 {
2285 scm_nullstr = scm_i_make_string (0, NULL, 0);
2286
2287 #include "libguile/strings.x"
2288 }
2289
2290
2291 /*
2292 Local Variables:
2293 c-file-style: "gnu"
2294 End:
2295 */