fix bug where scm_from_utf8_stringn would not detect bad utf-8
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <alloca.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <uninorm.h>
30 #include <unistr.h>
31 #include <uniconv.h>
32
33 #include "striconveh.h"
34
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
44
45 \f
46
47 /* {Strings}
48 */
49
50
51 /* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
68 *
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
72 */
73
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
76
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
81
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
85
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
91
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
93
94 #define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
103
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist[1001];
106 #endif
107
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
110 static SCM
111 make_stringbuf (size_t len)
112 {
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
120 SCM buf;
121
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127 #endif
128
129 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
138 }
139
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
142 static SCM
143 make_wide_stringbuf (size_t len)
144 {
145 SCM buf;
146 size_t raw_len;
147
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153 #endif
154
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
165 }
166
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169 static SCM
170 wide_stringbuf (SCM buf)
171 {
172 SCM new_buf;
173
174 if (STRINGBUF_WIDE (buf))
175 new_buf = buf;
176 else
177 {
178 size_t i, len;
179 scm_t_wchar *mem;
180
181 len = STRINGBUF_LENGTH (buf);
182
183 new_buf = make_wide_stringbuf (len);
184
185 mem = STRINGBUF_WIDE_CHARS (new_buf);
186 for (i = 0; i < len; i++)
187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
188 mem[len] = 0;
189 }
190
191 return new_buf;
192 }
193
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196 static SCM
197 narrow_stringbuf (SCM buf)
198 {
199 SCM new_buf;
200
201 if (!STRINGBUF_WIDE (buf))
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
208
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
211
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
216
217 new_buf = make_stringbuf (len);
218
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
226 }
227
228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
229
230 \f
231 /* Copy-on-write strings.
232 */
233
234 #define STRING_TAG scm_tc7_string
235
236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
239
240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
244
245 /* Read-only strings.
246 */
247
248 #define RO_STRING_TAG scm_tc7_ro_string
249 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
251 /* Mutation-sharing substrings
252 */
253
254 #define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257 /* START and LENGTH as for STRINGs. */
258
259 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
261 SCM scm_nullstr;
262
263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
267 SCM
268 scm_i_make_string (size_t len, char **charsp, int read_only_p)
269 {
270 static SCM null_stringbuf = SCM_BOOL_F;
271 SCM buf;
272 SCM res;
273
274 if (len == 0)
275 {
276 if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
277 {
278 null_stringbuf = make_stringbuf (0);
279 SET_STRINGBUF_SHARED (null_stringbuf);
280 }
281 buf = null_stringbuf;
282 }
283 else
284 buf = make_stringbuf (len);
285
286 if (charsp)
287 *charsp = (char *) STRINGBUF_CHARS (buf);
288 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
289 SCM_UNPACK (buf),
290 (scm_t_bits) 0, (scm_t_bits) len);
291 return res;
292 }
293
294 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
295 characters. CHARSP, if not NULL, will be set to location of the
296 character array. If READ_ONLY_P, the returned string is read-only;
297 otherwise it is writable. */
298 SCM
299 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
300 {
301 SCM buf = make_wide_stringbuf (len);
302 SCM res;
303 if (charsp)
304 *charsp = STRINGBUF_WIDE_CHARS (buf);
305 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
306 SCM_UNPACK (buf),
307 (scm_t_bits) 0, (scm_t_bits) len);
308 return res;
309 }
310
311 static void
312 validate_substring_args (SCM str, size_t start, size_t end)
313 {
314 if (!IS_STRING (str))
315 scm_wrong_type_arg_msg (NULL, 0, str, "string");
316 if (start > STRING_LENGTH (str))
317 scm_out_of_range (NULL, scm_from_size_t (start));
318 if (end > STRING_LENGTH (str) || end < start)
319 scm_out_of_range (NULL, scm_from_size_t (end));
320 }
321
322 static inline void
323 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
324 {
325 *start = STRING_START (*str);
326 if (IS_SH_STRING (*str))
327 {
328 *str = SH_STRING_STRING (*str);
329 *start += STRING_START (*str);
330 }
331 *buf = STRING_STRINGBUF (*str);
332 }
333
334 SCM
335 scm_i_substring (SCM str, size_t start, size_t end)
336 {
337 if (start == end)
338 return scm_i_make_string (0, NULL, 0);
339 else
340 {
341 SCM buf;
342 size_t str_start;
343 get_str_buf_start (&str, &buf, &str_start);
344 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
345 SET_STRINGBUF_SHARED (buf);
346 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
347 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
348 (scm_t_bits)str_start + start,
349 (scm_t_bits) end - start);
350 }
351 }
352
353 SCM
354 scm_i_substring_read_only (SCM str, size_t start, size_t end)
355 {
356 if (start == end)
357 return scm_i_make_string (0, NULL, 1);
358 else
359 {
360 SCM buf;
361 size_t str_start;
362 get_str_buf_start (&str, &buf, &str_start);
363 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
364 SET_STRINGBUF_SHARED (buf);
365 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
366 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
367 (scm_t_bits)str_start + start,
368 (scm_t_bits) end - start);
369 }
370 }
371
372 SCM
373 scm_i_substring_copy (SCM str, size_t start, size_t end)
374 {
375 if (start == end)
376 return scm_i_make_string (0, NULL, 0);
377 else
378 {
379 size_t len = end - start;
380 SCM buf, my_buf, substr;
381 size_t str_start;
382 int wide = 0;
383 get_str_buf_start (&str, &buf, &str_start);
384 if (scm_i_is_narrow_string (str))
385 {
386 my_buf = make_stringbuf (len);
387 memcpy (STRINGBUF_CHARS (my_buf),
388 STRINGBUF_CHARS (buf) + str_start + start, len);
389 }
390 else
391 {
392 my_buf = make_wide_stringbuf (len);
393 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
394 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
395 + start), len);
396 wide = 1;
397 }
398 scm_remember_upto_here_1 (buf);
399 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
400 (scm_t_bits) 0, (scm_t_bits) len);
401 if (wide)
402 scm_i_try_narrow_string (substr);
403 return substr;
404 }
405 }
406
407 SCM
408 scm_i_substring_shared (SCM str, size_t start, size_t end)
409 {
410 if (start == 0 && end == STRING_LENGTH (str))
411 return str;
412 else if (start == end)
413 return scm_i_make_string (0, NULL, 0);
414 else
415 {
416 size_t len = end - start;
417 if (IS_SH_STRING (str))
418 {
419 start += STRING_START (str);
420 str = SH_STRING_STRING (str);
421 }
422 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
423 (scm_t_bits)start, (scm_t_bits) len);
424 }
425 }
426
427 SCM
428 scm_c_substring (SCM str, size_t start, size_t end)
429 {
430 validate_substring_args (str, start, end);
431 return scm_i_substring (str, start, end);
432 }
433
434 SCM
435 scm_c_substring_read_only (SCM str, size_t start, size_t end)
436 {
437 validate_substring_args (str, start, end);
438 return scm_i_substring_read_only (str, start, end);
439 }
440
441 SCM
442 scm_c_substring_copy (SCM str, size_t start, size_t end)
443 {
444 validate_substring_args (str, start, end);
445 return scm_i_substring_copy (str, start, end);
446 }
447
448 SCM
449 scm_c_substring_shared (SCM str, size_t start, size_t end)
450 {
451 validate_substring_args (str, start, end);
452 return scm_i_substring_shared (str, start, end);
453 }
454
455 \f
456 /* Internal accessors
457 */
458
459 /* Returns the number of characters in STR. This may be different
460 than the memory size of the string storage. */
461 size_t
462 scm_i_string_length (SCM str)
463 {
464 return STRING_LENGTH (str);
465 }
466
467 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
468 encoding. False if it is 'wide', having a 32-bit UCS-4
469 encoding. */
470 int
471 scm_i_is_narrow_string (SCM str)
472 {
473 if (IS_SH_STRING (str))
474 str = SH_STRING_STRING (str);
475
476 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
477 }
478
479 /* Try to coerce a string to be narrow. It if is narrow already, do
480 nothing. If it is wide, shrink it to narrow if none of its
481 characters are above 0xFF. Return true if the string is narrow or
482 was made to be narrow. */
483 int
484 scm_i_try_narrow_string (SCM str)
485 {
486 if (IS_SH_STRING (str))
487 str = SH_STRING_STRING (str);
488
489 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
490
491 return scm_i_is_narrow_string (str);
492 }
493
494 /* Return a pointer to the raw data of the string, which can be either Latin-1
495 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
496 const void *
497 scm_i_string_data (SCM str)
498 {
499 SCM buf;
500 size_t start;
501 const char *data;
502
503 get_str_buf_start (&str, &buf, &start);
504
505 data = STRINGBUF_CONTENTS (buf);
506 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
507
508 return data;
509 }
510
511 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
512 STR. */
513 const char *
514 scm_i_string_chars (SCM str)
515 {
516 SCM buf;
517 size_t start;
518 get_str_buf_start (&str, &buf, &start);
519 if (scm_i_is_narrow_string (str))
520 return (const char *) STRINGBUF_CHARS (buf) + start;
521 else
522 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
523 scm_list_1 (str));
524 return NULL;
525 }
526
527 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
528 STR. */
529 const scm_t_wchar *
530 scm_i_string_wide_chars (SCM str)
531 {
532 SCM buf;
533 size_t start;
534
535 get_str_buf_start (&str, &buf, &start);
536 if (!scm_i_is_narrow_string (str))
537 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
538 else
539 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
540 scm_list_1 (str));
541 }
542
543 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
544 a new string buffer, so that it can be modified without modifying
545 other strings. Also, lock the string mutex. Later, one must call
546 scm_i_string_stop_writing to unlock the mutex. */
547 SCM
548 scm_i_string_start_writing (SCM orig_str)
549 {
550 SCM buf, str = orig_str;
551 size_t start;
552
553 get_str_buf_start (&str, &buf, &start);
554 if (IS_RO_STRING (str))
555 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
556
557 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
558 if (STRINGBUF_SHARED (buf))
559 {
560 /* Clone the stringbuf. */
561 size_t len = STRING_LENGTH (str);
562 SCM new_buf;
563
564 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
565
566 if (scm_i_is_narrow_string (str))
567 {
568 new_buf = make_stringbuf (len);
569 memcpy (STRINGBUF_CHARS (new_buf),
570 STRINGBUF_CHARS (buf) + STRING_START (str), len);
571
572 }
573 else
574 {
575 new_buf = make_wide_stringbuf (len);
576 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
577 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
578 + STRING_START (str)), len);
579 }
580
581 SET_STRING_STRINGBUF (str, new_buf);
582 start -= STRING_START (str);
583
584 /* FIXME: The following operations are not atomic, so other threads
585 looking at STR may see an inconsistent state. Nevertheless it can't
586 hurt much since (i) accessing STR while it is being mutated can't
587 yield a crash, and (ii) concurrent accesses to STR should be
588 protected by a mutex at the application level. The latter may not
589 apply when STR != ORIG_STR, though. */
590 SET_STRING_START (str, 0);
591 SET_STRING_STRINGBUF (str, new_buf);
592
593 buf = new_buf;
594
595 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
596 }
597 return orig_str;
598 }
599
600 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
601 char *
602 scm_i_string_writable_chars (SCM str)
603 {
604 SCM buf;
605 size_t start;
606
607 get_str_buf_start (&str, &buf, &start);
608 if (scm_i_is_narrow_string (str))
609 return (char *) STRINGBUF_CHARS (buf) + start;
610 else
611 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
612 scm_list_1 (str));
613 return NULL;
614 }
615
616 /* Return a pointer to the UCS-4 codepoints of a string. */
617 static scm_t_wchar *
618 scm_i_string_writable_wide_chars (SCM str)
619 {
620 SCM buf;
621 size_t start;
622
623 get_str_buf_start (&str, &buf, &start);
624 if (!scm_i_is_narrow_string (str))
625 return STRINGBUF_WIDE_CHARS (buf) + start;
626 else
627 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
628 scm_list_1 (str));
629 }
630
631 /* Unlock the string mutex that was locked when
632 scm_i_string_start_writing was called. */
633 void
634 scm_i_string_stop_writing (void)
635 {
636 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
637 }
638
639 /* Return the Xth character of STR as a UCS-4 codepoint. */
640 scm_t_wchar
641 scm_i_string_ref (SCM str, size_t x)
642 {
643 if (scm_i_is_narrow_string (str))
644 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
645 else
646 return scm_i_string_wide_chars (str)[x];
647 }
648
649 /* Returns index+1 of the first char in STR that matches C, or
650 0 if the char is not found. */
651 int
652 scm_i_string_contains_char (SCM str, char ch)
653 {
654 size_t i;
655 size_t len = scm_i_string_length (str);
656
657 i = 0;
658 if (scm_i_is_narrow_string (str))
659 {
660 while (i < len)
661 {
662 if (scm_i_string_chars (str)[i] == ch)
663 return i+1;
664 i++;
665 }
666 }
667 else
668 {
669 while (i < len)
670 {
671 if (scm_i_string_wide_chars (str)[i]
672 == (unsigned char) ch)
673 return i+1;
674 i++;
675 }
676 }
677 return 0;
678 }
679
680 int
681 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
682 {
683 if (scm_i_is_narrow_string (sstr))
684 {
685 const char *a = scm_i_string_chars (sstr) + start_x;
686 const char *b = cstr;
687 return strncmp (a, b, strlen(b));
688 }
689 else
690 {
691 size_t i;
692 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
693 const char *b = cstr;
694 for (i = 0; i < strlen (b); i++)
695 {
696 if (a[i] != (unsigned char) b[i])
697 return 1;
698 }
699 }
700 return 0;
701 }
702
703 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
704 void
705 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
706 {
707 if (IS_SH_STRING (str))
708 {
709 p += STRING_START (str);
710 str = SH_STRING_STRING (str);
711 }
712
713 if (chr > 0xFF && scm_i_is_narrow_string (str))
714 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
715
716 if (scm_i_is_narrow_string (str))
717 {
718 char *dst = scm_i_string_writable_chars (str);
719 dst[p] = chr;
720 }
721 else
722 {
723 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
724 dst[p] = chr;
725 }
726 }
727
728 \f
729 /* Symbols.
730
731 Basic symbol creation and accessing is done here, the rest is in
732 symbols.[hc]. This has been done to keep stringbufs and the
733 internals of strings and string-like objects confined to this file.
734 */
735
736 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
737
738 SCM
739 scm_i_make_symbol (SCM name, scm_t_bits flags,
740 unsigned long hash, SCM props)
741 {
742 SCM buf;
743 size_t start = STRING_START (name);
744 size_t length = STRING_LENGTH (name);
745
746 if (IS_SH_STRING (name))
747 {
748 name = SH_STRING_STRING (name);
749 start += STRING_START (name);
750 }
751 buf = STRING_STRINGBUF (name);
752
753 if (start == 0 && length == STRINGBUF_LENGTH (buf))
754 {
755 /* reuse buf. */
756 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
757 SET_STRINGBUF_SHARED (buf);
758 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
759 }
760 else
761 {
762 /* make new buf. */
763 if (scm_i_is_narrow_string (name))
764 {
765 SCM new_buf = make_stringbuf (length);
766 memcpy (STRINGBUF_CHARS (new_buf),
767 STRINGBUF_CHARS (buf) + start, length);
768 buf = new_buf;
769 }
770 else
771 {
772 SCM new_buf = make_wide_stringbuf (length);
773 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
774 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
775 length);
776 buf = new_buf;
777 }
778 }
779 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
780 (scm_t_bits) hash, SCM_UNPACK (props));
781 }
782
783 SCM
784 scm_i_c_make_symbol (const char *name, size_t len,
785 scm_t_bits flags, unsigned long hash, SCM props)
786 {
787 SCM buf = make_stringbuf (len);
788 memcpy (STRINGBUF_CHARS (buf), name, len);
789
790 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
791 (scm_t_bits) hash, SCM_UNPACK (props));
792 }
793
794 /* Returns the number of characters in SYM. This may be different
795 from the memory size of SYM. */
796 size_t
797 scm_i_symbol_length (SCM sym)
798 {
799 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
800 }
801
802 size_t
803 scm_c_symbol_length (SCM sym)
804 #define FUNC_NAME "scm_c_symbol_length"
805 {
806 SCM_VALIDATE_SYMBOL (1, sym);
807
808 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
809 }
810 #undef FUNC_NAME
811
812 /* True if the name of SYM is stored as a Latin-1 encoded string.
813 False if it is stored as a 32-bit UCS-4-encoded string. */
814 int
815 scm_i_is_narrow_symbol (SCM sym)
816 {
817 SCM buf;
818
819 buf = SYMBOL_STRINGBUF (sym);
820 return !STRINGBUF_WIDE (buf);
821 }
822
823 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
824 contains the name of SYM. */
825 const char *
826 scm_i_symbol_chars (SCM sym)
827 {
828 SCM buf;
829
830 buf = SYMBOL_STRINGBUF (sym);
831 if (!STRINGBUF_WIDE (buf))
832 return (const char *) STRINGBUF_CHARS (buf);
833 else
834 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
835 scm_list_1 (sym));
836 }
837
838 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
839 symbol's name. */
840 const scm_t_wchar *
841 scm_i_symbol_wide_chars (SCM sym)
842 {
843 SCM buf;
844
845 buf = SYMBOL_STRINGBUF (sym);
846 if (STRINGBUF_WIDE (buf))
847 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
848 else
849 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
850 scm_list_1 (sym));
851 }
852
853 SCM
854 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
855 {
856 SCM buf = SYMBOL_STRINGBUF (sym);
857 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
858 SET_STRINGBUF_SHARED (buf);
859 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
860 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
861 (scm_t_bits)start, (scm_t_bits) end - start);
862 }
863
864 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
865 scm_t_wchar
866 scm_i_symbol_ref (SCM sym, size_t x)
867 {
868 if (scm_i_is_narrow_symbol (sym))
869 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
870 else
871 return scm_i_symbol_wide_chars (sym)[x];
872 }
873
874 /* Debugging
875 */
876
877 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
878 "Returns an association list containing debugging information\n"
879 "for @var{str}. The association list has the following entries."
880 "@table @code\n"
881 "@item string\n"
882 "The string itself.\n"
883 "@item start\n"
884 "The start index of the string into its stringbuf\n"
885 "@item length\n"
886 "The length of the string\n"
887 "@item shared\n"
888 "If this string is a substring, it returns its parent string.\n"
889 "Otherwise, it returns @code{#f}\n"
890 "@item read-only\n"
891 "@code{#t} if the string is read-only\n"
892 "@item stringbuf-chars\n"
893 "A new string containing this string's stringbuf's characters\n"
894 "@item stringbuf-length\n"
895 "The number of characters in this stringbuf\n"
896 "@item stringbuf-shared\n"
897 "@code{#t} if this stringbuf is shared\n"
898 "@item stringbuf-wide\n"
899 "@code{#t} if this stringbuf's characters are stored in a\n"
900 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
901 "buffer\n"
902 "@end table")
903 #define FUNC_NAME s_scm_sys_string_dump
904 {
905 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
906 SCM buf;
907 SCM_VALIDATE_STRING (1, str);
908
909 /* String info */
910 e1 = scm_cons (scm_from_latin1_symbol ("string"),
911 str);
912 e2 = scm_cons (scm_from_latin1_symbol ("start"),
913 scm_from_size_t (STRING_START (str)));
914 e3 = scm_cons (scm_from_latin1_symbol ("length"),
915 scm_from_size_t (STRING_LENGTH (str)));
916
917 if (IS_SH_STRING (str))
918 {
919 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
920 SH_STRING_STRING (str));
921 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
922 }
923 else
924 {
925 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
926 SCM_BOOL_F);
927 buf = STRING_STRINGBUF (str);
928 }
929
930 if (IS_RO_STRING (str))
931 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
932 SCM_BOOL_T);
933 else
934 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
935 SCM_BOOL_F);
936
937 /* Stringbuf info */
938 if (!STRINGBUF_WIDE (buf))
939 {
940 size_t len = STRINGBUF_LENGTH (buf);
941 char *cbuf;
942 SCM sbc = scm_i_make_string (len, &cbuf, 0);
943 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
944 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
945 sbc);
946 }
947 else
948 {
949 size_t len = STRINGBUF_LENGTH (buf);
950 scm_t_wchar *cbuf;
951 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
952 u32_cpy ((scm_t_uint32 *) cbuf,
953 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
954 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
955 sbc);
956 }
957 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
958 scm_from_size_t (STRINGBUF_LENGTH (buf)));
959 if (STRINGBUF_SHARED (buf))
960 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
961 SCM_BOOL_T);
962 else
963 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
964 SCM_BOOL_F);
965 if (STRINGBUF_WIDE (buf))
966 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
967 SCM_BOOL_T);
968 else
969 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
970 SCM_BOOL_F);
971
972 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
973 }
974 #undef FUNC_NAME
975
976 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
977 "Returns an association list containing debugging information\n"
978 "for @var{sym}. The association list has the following entries."
979 "@table @code\n"
980 "@item symbol\n"
981 "The symbol itself\n"
982 "@item hash\n"
983 "Its hash value\n"
984 "@item interned\n"
985 "@code{#t} if it is an interned symbol\n"
986 "@item stringbuf-chars\n"
987 "A new string containing this symbols's stringbuf's characters\n"
988 "@item stringbuf-length\n"
989 "The number of characters in this stringbuf\n"
990 "@item stringbuf-shared\n"
991 "@code{#t} if this stringbuf is shared\n"
992 "@item stringbuf-wide\n"
993 "@code{#t} if this stringbuf's characters are stored in a\n"
994 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
995 "buffer\n"
996 "@end table")
997 #define FUNC_NAME s_scm_sys_symbol_dump
998 {
999 SCM e1, e2, e3, e4, e5, e6, e7;
1000 SCM buf;
1001 SCM_VALIDATE_SYMBOL (1, sym);
1002 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1003 sym);
1004 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1005 scm_from_ulong (scm_i_symbol_hash (sym)));
1006 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1007 scm_symbol_interned_p (sym));
1008 buf = SYMBOL_STRINGBUF (sym);
1009
1010 /* Stringbuf info */
1011 if (!STRINGBUF_WIDE (buf))
1012 {
1013 size_t len = STRINGBUF_LENGTH (buf);
1014 char *cbuf;
1015 SCM sbc = scm_i_make_string (len, &cbuf, 0);
1016 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1017 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1018 sbc);
1019 }
1020 else
1021 {
1022 size_t len = STRINGBUF_LENGTH (buf);
1023 scm_t_wchar *cbuf;
1024 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1025 u32_cpy ((scm_t_uint32 *) cbuf,
1026 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1027 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1028 sbc);
1029 }
1030 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1031 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1032 if (STRINGBUF_SHARED (buf))
1033 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1034 SCM_BOOL_T);
1035 else
1036 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1037 SCM_BOOL_F);
1038 if (STRINGBUF_WIDE (buf))
1039 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1040 SCM_BOOL_T);
1041 else
1042 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1043 SCM_BOOL_F);
1044 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1045
1046 }
1047 #undef FUNC_NAME
1048
1049 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1050
1051 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1052 #define FUNC_NAME s_scm_sys_stringbuf_hist
1053 {
1054 int i;
1055 for (i = 0; i < 1000; i++)
1056 if (lenhist[i])
1057 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1058 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1059 return SCM_UNSPECIFIED;
1060 }
1061 #undef FUNC_NAME
1062
1063 #endif
1064
1065 \f
1066
1067 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1068 (SCM obj),
1069 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1070 #define FUNC_NAME s_scm_string_p
1071 {
1072 return scm_from_bool (IS_STRING (obj));
1073 }
1074 #undef FUNC_NAME
1075
1076
1077 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1078
1079 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1080 (SCM chrs),
1081 "@deffnx {Scheme Procedure} list->string chrs\n"
1082 "Return a newly allocated string composed of the arguments,\n"
1083 "@var{chrs}.")
1084 #define FUNC_NAME s_scm_string
1085 {
1086 SCM result = SCM_BOOL_F;
1087 SCM rest;
1088 size_t len;
1089 size_t p = 0;
1090 long i;
1091 int wide = 0;
1092
1093 /* Verify that this is a list of chars. */
1094 i = scm_ilength (chrs);
1095 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1096
1097 len = (size_t) i;
1098 rest = chrs;
1099
1100 while (len > 0 && scm_is_pair (rest))
1101 {
1102 SCM elt = SCM_CAR (rest);
1103 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1104 if (SCM_CHAR (elt) > 0xFF)
1105 wide = 1;
1106 rest = SCM_CDR (rest);
1107 len--;
1108 scm_remember_upto_here_1 (elt);
1109 }
1110
1111 /* Construct a string containing this list of chars. */
1112 len = (size_t) i;
1113 rest = chrs;
1114
1115 if (wide == 0)
1116 {
1117 char *buf;
1118
1119 result = scm_i_make_string (len, NULL, 0);
1120 result = scm_i_string_start_writing (result);
1121 buf = scm_i_string_writable_chars (result);
1122 while (len > 0 && scm_is_pair (rest))
1123 {
1124 SCM elt = SCM_CAR (rest);
1125 buf[p] = (unsigned char) SCM_CHAR (elt);
1126 p++;
1127 rest = SCM_CDR (rest);
1128 len--;
1129 scm_remember_upto_here_1 (elt);
1130 }
1131 }
1132 else
1133 {
1134 scm_t_wchar *buf;
1135
1136 result = scm_i_make_wide_string (len, NULL, 0);
1137 result = scm_i_string_start_writing (result);
1138 buf = scm_i_string_writable_wide_chars (result);
1139 while (len > 0 && scm_is_pair (rest))
1140 {
1141 SCM elt = SCM_CAR (rest);
1142 buf[p] = SCM_CHAR (elt);
1143 p++;
1144 rest = SCM_CDR (rest);
1145 len--;
1146 scm_remember_upto_here_1 (elt);
1147 }
1148 }
1149 scm_i_string_stop_writing ();
1150
1151 if (len > 0)
1152 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1153 if (!scm_is_null (rest))
1154 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1155
1156 return result;
1157 }
1158 #undef FUNC_NAME
1159
1160 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1161 (SCM k, SCM chr),
1162 "Return a newly allocated string of\n"
1163 "length @var{k}. If @var{chr} is given, then all elements of\n"
1164 "the string are initialized to @var{chr}, otherwise the contents\n"
1165 "of the string are all set to @code{#\nul}.")
1166 #define FUNC_NAME s_scm_make_string
1167 {
1168 return scm_c_make_string (scm_to_size_t (k), chr);
1169 }
1170 #undef FUNC_NAME
1171
1172 SCM
1173 scm_c_make_string (size_t len, SCM chr)
1174 #define FUNC_NAME NULL
1175 {
1176 size_t p;
1177 char *contents = NULL;
1178 SCM res = scm_i_make_string (len, &contents, 0);
1179
1180 /* If no char is given, initialize string contents to NULL. */
1181 if (SCM_UNBNDP (chr))
1182 memset (contents, 0, len);
1183 else
1184 {
1185 SCM_VALIDATE_CHAR (0, chr);
1186 res = scm_i_string_start_writing (res);
1187 for (p = 0; p < len; p++)
1188 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1189 scm_i_string_stop_writing ();
1190 }
1191
1192 return res;
1193 }
1194 #undef FUNC_NAME
1195
1196 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1197 (SCM string),
1198 "Return the number of characters in @var{string}.")
1199 #define FUNC_NAME s_scm_string_length
1200 {
1201 SCM_VALIDATE_STRING (1, string);
1202 return scm_from_size_t (STRING_LENGTH (string));
1203 }
1204 #undef FUNC_NAME
1205
1206 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1207 (SCM string),
1208 "Return the bytes used to represent a character in @var{string}."
1209 "This will return 1 or 4.")
1210 #define FUNC_NAME s_scm_string_bytes_per_char
1211 {
1212 SCM_VALIDATE_STRING (1, string);
1213 if (!scm_i_is_narrow_string (string))
1214 return scm_from_int (4);
1215
1216 return scm_from_int (1);
1217 }
1218 #undef FUNC_NAME
1219
1220 size_t
1221 scm_c_string_length (SCM string)
1222 {
1223 if (!IS_STRING (string))
1224 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1225 return STRING_LENGTH (string);
1226 }
1227
1228 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1229 (SCM str, SCM k),
1230 "Return character @var{k} of @var{str} using zero-origin\n"
1231 "indexing. @var{k} must be a valid index of @var{str}.")
1232 #define FUNC_NAME s_scm_string_ref
1233 {
1234 size_t len;
1235 unsigned long idx;
1236
1237 SCM_VALIDATE_STRING (1, str);
1238
1239 len = scm_i_string_length (str);
1240 if (SCM_LIKELY (len > 0))
1241 idx = scm_to_unsigned_integer (k, 0, len - 1);
1242 else
1243 scm_out_of_range (NULL, k);
1244
1245 if (scm_i_is_narrow_string (str))
1246 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1247 else
1248 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1249 }
1250 #undef FUNC_NAME
1251
1252 SCM
1253 scm_c_string_ref (SCM str, size_t p)
1254 {
1255 if (p >= scm_i_string_length (str))
1256 scm_out_of_range (NULL, scm_from_size_t (p));
1257 if (scm_i_is_narrow_string (str))
1258 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1259 else
1260 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1261
1262 }
1263
1264 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1265 (SCM str, SCM k, SCM chr),
1266 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1267 "an unspecified value. @var{k} must be a valid index of\n"
1268 "@var{str}.")
1269 #define FUNC_NAME s_scm_string_set_x
1270 {
1271 size_t len;
1272 unsigned long idx;
1273
1274 SCM_VALIDATE_STRING (1, str);
1275
1276 len = scm_i_string_length (str);
1277 if (SCM_LIKELY (len > 0))
1278 idx = scm_to_unsigned_integer (k, 0, len - 1);
1279 else
1280 scm_out_of_range (NULL, k);
1281
1282 SCM_VALIDATE_CHAR (3, chr);
1283 str = scm_i_string_start_writing (str);
1284 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1285 scm_i_string_stop_writing ();
1286
1287 return SCM_UNSPECIFIED;
1288 }
1289 #undef FUNC_NAME
1290
1291 void
1292 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1293 {
1294 if (p >= scm_i_string_length (str))
1295 scm_out_of_range (NULL, scm_from_size_t (p));
1296 str = scm_i_string_start_writing (str);
1297 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1298 scm_i_string_stop_writing ();
1299 }
1300
1301 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1302 (SCM str, SCM start, SCM end),
1303 "Return a newly allocated string formed from the characters\n"
1304 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1305 "ending with index @var{end} (exclusive).\n"
1306 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1307 "exact integers satisfying:\n\n"
1308 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1309 #define FUNC_NAME s_scm_substring
1310 {
1311 size_t len, from, to;
1312
1313 SCM_VALIDATE_STRING (1, str);
1314 len = scm_i_string_length (str);
1315 from = scm_to_unsigned_integer (start, 0, len);
1316 if (SCM_UNBNDP (end))
1317 to = len;
1318 else
1319 to = scm_to_unsigned_integer (end, from, len);
1320 return scm_i_substring (str, from, to);
1321 }
1322 #undef FUNC_NAME
1323
1324 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1325 (SCM str, SCM start, SCM end),
1326 "Return a newly allocated string formed from the characters\n"
1327 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1328 "ending with index @var{end} (exclusive).\n"
1329 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1330 "exact integers satisfying:\n"
1331 "\n"
1332 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1333 "\n"
1334 "The returned string is read-only.\n")
1335 #define FUNC_NAME s_scm_substring_read_only
1336 {
1337 size_t len, from, to;
1338
1339 SCM_VALIDATE_STRING (1, str);
1340 len = scm_i_string_length (str);
1341 from = scm_to_unsigned_integer (start, 0, len);
1342 if (SCM_UNBNDP (end))
1343 to = len;
1344 else
1345 to = scm_to_unsigned_integer (end, from, len);
1346 return scm_i_substring_read_only (str, from, to);
1347 }
1348 #undef FUNC_NAME
1349
1350 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1351 (SCM str, SCM start, SCM end),
1352 "Return a newly allocated string formed from the characters\n"
1353 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1354 "ending with index @var{end} (exclusive).\n"
1355 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1356 "exact integers satisfying:\n\n"
1357 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1358 #define FUNC_NAME s_scm_substring_copy
1359 {
1360 /* For the Scheme version, START is mandatory, but for the C
1361 version, it is optional. See scm_string_copy in srfi-13.c for a
1362 rationale.
1363 */
1364
1365 size_t from, to;
1366
1367 SCM_VALIDATE_STRING (1, str);
1368 scm_i_get_substring_spec (scm_i_string_length (str),
1369 start, &from, end, &to);
1370 return scm_i_substring_copy (str, from, to);
1371 }
1372 #undef FUNC_NAME
1373
1374 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1375 (SCM str, SCM start, SCM end),
1376 "Return string that indirectly refers to the characters\n"
1377 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1378 "ending with index @var{end} (exclusive).\n"
1379 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1380 "exact integers satisfying:\n\n"
1381 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1382 #define FUNC_NAME s_scm_substring_shared
1383 {
1384 size_t len, from, to;
1385
1386 SCM_VALIDATE_STRING (1, str);
1387 len = scm_i_string_length (str);
1388 from = scm_to_unsigned_integer (start, 0, len);
1389 if (SCM_UNBNDP (end))
1390 to = len;
1391 else
1392 to = scm_to_unsigned_integer (end, from, len);
1393 return scm_i_substring_shared (str, from, to);
1394 }
1395 #undef FUNC_NAME
1396
1397 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1398 (SCM args),
1399 "Return a newly allocated string whose characters form the\n"
1400 "concatenation of the given strings, @var{args}.")
1401 #define FUNC_NAME s_scm_string_append
1402 {
1403 SCM res;
1404 size_t len = 0;
1405 int wide = 0;
1406 SCM l, s;
1407 size_t i;
1408 union
1409 {
1410 char *narrow;
1411 scm_t_wchar *wide;
1412 } data;
1413
1414 SCM_VALIDATE_REST_ARGUMENT (args);
1415 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1416 {
1417 s = SCM_CAR (l);
1418 SCM_VALIDATE_STRING (SCM_ARGn, s);
1419 len += scm_i_string_length (s);
1420 if (!scm_i_is_narrow_string (s))
1421 wide = 1;
1422 }
1423 data.narrow = NULL;
1424 if (!wide)
1425 res = scm_i_make_string (len, &data.narrow, 0);
1426 else
1427 res = scm_i_make_wide_string (len, &data.wide, 0);
1428
1429 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1430 {
1431 size_t len;
1432 s = SCM_CAR (l);
1433 SCM_VALIDATE_STRING (SCM_ARGn, s);
1434 len = scm_i_string_length (s);
1435 if (!wide)
1436 {
1437 memcpy (data.narrow, scm_i_string_chars (s), len);
1438 data.narrow += len;
1439 }
1440 else
1441 {
1442 if (scm_i_is_narrow_string (s))
1443 {
1444 for (i = 0; i < scm_i_string_length (s); i++)
1445 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1446 }
1447 else
1448 u32_cpy ((scm_t_uint32 *) data.wide,
1449 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1450 data.wide += len;
1451 }
1452 scm_remember_upto_here_1 (s);
1453 }
1454 return res;
1455 }
1456 #undef FUNC_NAME
1457
1458
1459 \f
1460 /* Charset conversion error handling. */
1461
1462 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1463 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1464
1465 /* Raise an exception informing that character CHR could not be written
1466 to PORT in its current encoding. */
1467 void
1468 scm_encoding_error (const char *subr, int err, const char *message,
1469 SCM port, SCM chr)
1470 {
1471 scm_throw (scm_encoding_error_key,
1472 scm_list_n (scm_from_latin1_string (subr),
1473 scm_from_latin1_string (message),
1474 scm_from_int (err),
1475 port, chr,
1476 SCM_UNDEFINED));
1477 }
1478
1479 /* Raise an exception informing of an encoding error on PORT. This
1480 means that a character could not be written in PORT's encoding. */
1481 void
1482 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1483 {
1484 scm_throw (scm_decoding_error_key,
1485 scm_list_n (scm_from_latin1_string (subr),
1486 scm_from_latin1_string (message),
1487 scm_from_int (err),
1488 port,
1489 SCM_UNDEFINED));
1490 }
1491
1492 \f
1493 /* String conversion to/from C. */
1494
1495 static void
1496 decoding_error (const char *func_name, int errno_save,
1497 const char *str, size_t len)
1498 {
1499 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1500 handler. */
1501 SCM bv;
1502 signed char *buf;
1503
1504 buf = scm_gc_malloc_pointerless (len, "bytevector");
1505 memcpy (buf, str, len);
1506 bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1507
1508 scm_decoding_error (func_name, errno_save,
1509 "input locale conversion error", bv);
1510 }
1511
1512 SCM
1513 scm_from_stringn (const char *str, size_t len, const char *encoding,
1514 scm_t_string_failed_conversion_handler handler)
1515 {
1516 size_t u32len, i;
1517 scm_t_wchar *u32;
1518 int wide = 0;
1519 SCM res;
1520
1521 /* The order of these checks is important. */
1522 if (!str && len != 0)
1523 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1524 if (len == (size_t) -1)
1525 len = strlen (str);
1526
1527 if (encoding == NULL || len == 0)
1528 return scm_from_latin1_stringn (str, len);
1529 else if (strcmp (encoding, "UTF-8") == 0
1530 && handler == SCM_FAILED_CONVERSION_ERROR)
1531 return scm_from_utf8_stringn (str, len);
1532
1533 u32len = 0;
1534 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1535 (enum iconv_ilseq_handler)
1536 handler,
1537 str, len,
1538 NULL,
1539 NULL, &u32len);
1540
1541 if (SCM_UNLIKELY (u32 == NULL))
1542 decoding_error (__func__, errno, str, len);
1543
1544 i = 0;
1545 while (i < u32len)
1546 if (u32[i++] > 0xFF)
1547 {
1548 wide = 1;
1549 break;
1550 }
1551
1552 if (!wide)
1553 {
1554 char *dst;
1555 res = scm_i_make_string (u32len, &dst, 0);
1556 for (i = 0; i < u32len; i ++)
1557 dst[i] = (unsigned char) u32[i];
1558 dst[u32len] = '\0';
1559 }
1560 else
1561 {
1562 scm_t_wchar *wdst;
1563 res = scm_i_make_wide_string (u32len, &wdst, 0);
1564 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1565 wdst[u32len] = 0;
1566 }
1567
1568 free (u32);
1569 return res;
1570 }
1571
1572 SCM
1573 scm_from_locale_string (const char *str)
1574 {
1575 return scm_from_locale_stringn (str, -1);
1576 }
1577
1578 SCM
1579 scm_from_locale_stringn (const char *str, size_t len)
1580 {
1581 return scm_from_stringn (str, len, locale_charset (),
1582 scm_i_default_port_conversion_handler ());
1583 }
1584
1585 SCM
1586 scm_from_latin1_string (const char *str)
1587 {
1588 return scm_from_latin1_stringn (str, -1);
1589 }
1590
1591 SCM
1592 scm_from_latin1_stringn (const char *str, size_t len)
1593 {
1594 char *buf;
1595 SCM result;
1596
1597 if (len == (size_t) -1)
1598 len = strlen (str);
1599
1600 /* Make a narrow string and copy STR as is. */
1601 result = scm_i_make_string (len, &buf, 0);
1602 memcpy (buf, str, len);
1603
1604 return result;
1605 }
1606
1607 SCM
1608 scm_from_utf8_string (const char *str)
1609 {
1610 return scm_from_utf8_stringn (str, -1);
1611 }
1612
1613 SCM
1614 scm_from_utf8_stringn (const char *str, size_t len)
1615 {
1616 size_t i, char_len;
1617 const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1618 int ascii = 1, narrow = 1;
1619 SCM res;
1620
1621 if (len == (size_t) -1)
1622 len = strlen (str);
1623
1624 i = 0;
1625 char_len = 0;
1626
1627 while (i < len)
1628 {
1629 if (ustr[i] <= 127)
1630 {
1631 char_len++;
1632 i++;
1633 }
1634 else
1635 {
1636 ucs4_t c;
1637 int nbytes;
1638
1639 ascii = 0;
1640
1641 nbytes = u8_mbtouc (&c, ustr + i, len - i);
1642
1643 if (c == 0xfffd)
1644 /* Bad UTF-8. */
1645 decoding_error (__func__, errno, str, len);
1646
1647 if (c > 255)
1648 narrow = 0;
1649
1650 char_len++;
1651 i += nbytes;
1652 }
1653 }
1654
1655 if (ascii)
1656 {
1657 char *dst;
1658 res = scm_i_make_string (char_len, &dst, 0);
1659 memcpy (dst, str, len);
1660 }
1661 else if (narrow)
1662 {
1663 char *dst;
1664 size_t j;
1665 ucs4_t c;
1666
1667 res = scm_i_make_string (char_len, &dst, 0);
1668
1669 for (i = 0, j = 0; i < len; j++)
1670 {
1671 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1672 dst[j] = (signed char) c;
1673 }
1674 }
1675 else
1676 {
1677 scm_t_wchar *dst;
1678 size_t j;
1679 ucs4_t c;
1680
1681 res = scm_i_make_wide_string (char_len, &dst, 0);
1682
1683 for (i = 0, j = 0; i < len; j++)
1684 {
1685 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1686 dst[j] = c;
1687 }
1688 }
1689
1690 return res;
1691 }
1692
1693 SCM
1694 scm_from_utf32_string (const scm_t_wchar *str)
1695 {
1696 return scm_from_utf32_stringn (str, -1);
1697 }
1698
1699 SCM
1700 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1701 {
1702 SCM result;
1703 scm_t_wchar *buf;
1704
1705 if (len == (size_t) -1)
1706 len = u32_strlen ((uint32_t *) str);
1707
1708 result = scm_i_make_wide_string (len, &buf, 0);
1709 memcpy (buf, str, len * sizeof (scm_t_wchar));
1710 scm_i_try_narrow_string (result);
1711
1712 return result;
1713 }
1714
1715 /* Create a new scheme string from the C string STR. The memory of
1716 STR may be used directly as storage for the new string. */
1717 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1718 would be to register a finalizer to eventually free(3) STR, which isn't
1719 worth it. Should we just deprecate the `scm_take_' functions? */
1720 SCM
1721 scm_take_locale_stringn (char *str, size_t len)
1722 {
1723 SCM res;
1724
1725 res = scm_from_locale_stringn (str, len);
1726 free (str);
1727
1728 return res;
1729 }
1730
1731 SCM
1732 scm_take_locale_string (char *str)
1733 {
1734 return scm_take_locale_stringn (str, -1);
1735 }
1736
1737 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1738 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1739 Set *LENP to the size of the resulting string.
1740
1741 FIXME: This is a hack we should get rid of. See
1742 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1743 for details. */
1744 static void
1745 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1746 {
1747 char *before, *after;
1748 size_t i, j;
1749
1750 before = buf;
1751 after = buf;
1752 i = 0;
1753 j = 0;
1754 while (i < *lenp)
1755 {
1756 if ((i <= *lenp - 6)
1757 && before[i] == '\\'
1758 && before[i + 1] == 'u'
1759 && before[i + 2] == '0' && before[i + 3] == '0')
1760 {
1761 /* Convert \u00NN to \xNN */
1762 after[j] = '\\';
1763 after[j + 1] = 'x';
1764 after[j + 2] = tolower ((int) before[i + 4]);
1765 after[j + 3] = tolower ((int) before[i + 5]);
1766 i += 6;
1767 j += 4;
1768 }
1769 else if ((i <= *lenp - 10)
1770 && before[i] == '\\'
1771 && before[i + 1] == 'U'
1772 && before[i + 2] == '0' && before[i + 3] == '0')
1773 {
1774 /* Convert \U00NNNNNN to \UNNNNNN */
1775 after[j] = '\\';
1776 after[j + 1] = 'U';
1777 after[j + 2] = tolower ((int) before[i + 4]);
1778 after[j + 3] = tolower ((int) before[i + 5]);
1779 after[j + 4] = tolower ((int) before[i + 6]);
1780 after[j + 5] = tolower ((int) before[i + 7]);
1781 after[j + 6] = tolower ((int) before[i + 8]);
1782 after[j + 7] = tolower ((int) before[i + 9]);
1783 i += 10;
1784 j += 8;
1785 }
1786 else
1787 {
1788 after[j] = before[i];
1789 i++;
1790 j++;
1791 }
1792 }
1793 *lenp = j;
1794 }
1795
1796 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1797 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1798 of the resulting string. BUF must be large enough to handle the
1799 worst case when `\uXXXX' escapes (6 characters) are replaced by
1800 `\xXXXX;' (7 characters). */
1801 static void
1802 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1803 {
1804 char *before, *after;
1805 size_t i, j;
1806 /* The worst case is if the input string contains all 4-digit hex escapes.
1807 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1808 size_t max_out_len = (*lenp * 7) / 6 + 1;
1809 size_t nzeros, ndigits;
1810
1811 before = buf;
1812 after = alloca (max_out_len);
1813 i = 0;
1814 j = 0;
1815 while (i < *lenp)
1816 {
1817 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1818 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1819 {
1820 if (before[i + 1] == 'u')
1821 ndigits = 4;
1822 else if (before[i + 1] == 'U')
1823 ndigits = 8;
1824 else
1825 abort ();
1826
1827 /* Add the R6RS hex escape initial sequence. */
1828 after[j] = '\\';
1829 after[j + 1] = 'x';
1830
1831 /* Move string positions to the start of the hex numbers. */
1832 i += 2;
1833 j += 2;
1834
1835 /* Find the number of initial zeros in this hex number. */
1836 nzeros = 0;
1837 while (before[i + nzeros] == '0' && nzeros < ndigits)
1838 nzeros++;
1839
1840 /* Copy the number, skipping initial zeros, and then move the string
1841 positions. */
1842 if (nzeros == ndigits)
1843 {
1844 after[j] = '0';
1845 i += ndigits;
1846 j += 1;
1847 }
1848 else
1849 {
1850 int pos;
1851 for (pos = 0; pos < ndigits - nzeros; pos++)
1852 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1853 i += ndigits;
1854 j += (ndigits - nzeros);
1855 }
1856
1857 /* Add terminating semicolon. */
1858 after[j] = ';';
1859 j++;
1860 }
1861 else
1862 {
1863 after[j] = before[i];
1864 i++;
1865 j++;
1866 }
1867 }
1868 *lenp = j;
1869 memcpy (before, after, j);
1870 }
1871
1872 char *
1873 scm_to_locale_string (SCM str)
1874 {
1875 return scm_to_locale_stringn (str, NULL);
1876 }
1877
1878 char *
1879 scm_to_locale_stringn (SCM str, size_t *lenp)
1880 {
1881 return scm_to_stringn (str, lenp,
1882 locale_charset (),
1883 scm_i_default_port_conversion_handler ());
1884 }
1885
1886 char *
1887 scm_to_latin1_string (SCM str)
1888 {
1889 return scm_to_latin1_stringn (str, NULL);
1890 }
1891
1892 char *
1893 scm_to_latin1_stringn (SCM str, size_t *lenp)
1894 #define FUNC_NAME "scm_to_latin1_stringn"
1895 {
1896 char *result;
1897
1898 SCM_VALIDATE_STRING (1, str);
1899
1900 if (scm_i_is_narrow_string (str))
1901 {
1902 size_t len = scm_i_string_length (str);
1903
1904 if (lenp)
1905 *lenp = len;
1906
1907 result = scm_strndup (scm_i_string_data (str), len);
1908 }
1909 else
1910 result = scm_to_stringn (str, lenp, NULL,
1911 SCM_FAILED_CONVERSION_ERROR);
1912
1913 return result;
1914 }
1915 #undef FUNC_NAME
1916
1917 char *
1918 scm_to_utf8_string (SCM str)
1919 {
1920 return scm_to_utf8_stringn (str, NULL);
1921 }
1922
1923 static size_t
1924 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1925 {
1926 size_t ret, i;
1927 for (i = 0, ret = 0; i < len; i++)
1928 ret += (str[i] < 128) ? 1 : 2;
1929 return ret;
1930 }
1931
1932 static scm_t_uint8*
1933 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1934 scm_t_uint8 *u8_result, size_t *u8_lenp)
1935 {
1936 size_t i, n;
1937 size_t u8_len = latin1_u8_strlen (str, latin_len);
1938
1939 if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1940 u8_result = scm_malloc (u8_len + 1);
1941 if (u8_lenp)
1942 *u8_lenp = u8_len;
1943
1944 for (i = 0, n = 0; i < latin_len; i++)
1945 n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1946 if (n != u8_len)
1947 abort ();
1948 u8_result[n] = 0;
1949
1950 return u8_result;
1951 }
1952
1953 /* UTF-8 code table
1954
1955 (Note that this includes code points that are not allowed by Unicode,
1956 but since this function has no way to report an error, and its
1957 purpose is to determine the size of destination buffers for
1958 libunicode conversion functions, we err on the safe side and handle
1959 everything that libunicode might conceivably handle, now or in the
1960 future.)
1961
1962 Char. number range | UTF-8 octet sequence
1963 (hexadecimal) | (binary)
1964 --------------------+------------------------------------------------------
1965 0000 0000-0000 007F | 0xxxxxxx
1966 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
1967 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1968 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1969 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1970 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1971 */
1972
1973 static size_t
1974 u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
1975 {
1976 size_t ret, i;
1977
1978 for (i = 0, ret = 0; i < len; i++)
1979 {
1980 scm_t_uint32 c = str[i];
1981
1982 if (c <= 0x7f)
1983 ret += 1;
1984 else if (c <= 0x7ff)
1985 ret += 2;
1986 else if (c <= 0xffff)
1987 ret += 3;
1988 else if (c <= 0x1fffff)
1989 ret += 4;
1990 else if (c <= 0x3ffffff)
1991 ret += 5;
1992 else
1993 ret += 6;
1994 }
1995
1996 return ret;
1997 }
1998
1999 char *
2000 scm_to_utf8_stringn (SCM str, size_t *lenp)
2001 #define FUNC_NAME "scm_to_utf8_stringn"
2002 {
2003 SCM_VALIDATE_STRING (1, str);
2004
2005 if (scm_i_is_narrow_string (str))
2006 return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
2007 scm_i_string_length (str),
2008 NULL, lenp);
2009 else
2010 {
2011 scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2012 scm_t_uint8 *buf, *ret;
2013 size_t num_chars = scm_i_string_length (str);
2014 size_t num_bytes_predicted, num_bytes_actual;
2015
2016 num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2017
2018 if (lenp)
2019 {
2020 *lenp = num_bytes_predicted;
2021 buf = scm_malloc (num_bytes_predicted);
2022 }
2023 else
2024 {
2025 buf = scm_malloc (num_bytes_predicted + 1);
2026 buf[num_bytes_predicted] = 0;
2027 }
2028
2029 num_bytes_actual = num_bytes_predicted;
2030 ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
2031
2032 if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
2033 return (char *) ret;
2034
2035 /* An error: a bad codepoint. */
2036 {
2037 int saved_errno = errno;
2038
2039 free (buf);
2040 if (!saved_errno)
2041 abort ();
2042
2043 scm_decoding_error ("scm_to_utf8_stringn", errno,
2044 "invalid codepoint in string", str);
2045
2046 /* Not reached. */
2047 return NULL;
2048 }
2049 }
2050 }
2051 #undef FUNC_NAME
2052
2053 scm_t_wchar *
2054 scm_to_utf32_string (SCM str)
2055 {
2056 return scm_to_utf32_stringn (str, NULL);
2057 }
2058
2059 scm_t_wchar *
2060 scm_to_utf32_stringn (SCM str, size_t *lenp)
2061 #define FUNC_NAME "scm_to_utf32_stringn"
2062 {
2063 scm_t_wchar *result;
2064
2065 SCM_VALIDATE_STRING (1, str);
2066
2067 if (scm_i_is_narrow_string (str))
2068 {
2069 scm_t_uint8 *codepoints;
2070 size_t i, len;
2071
2072 codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2073 len = scm_i_string_length (str);
2074 if (lenp)
2075 *lenp = len;
2076
2077 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2078 for (i = 0; i < len; i++)
2079 result[i] = codepoints[i];
2080 result[len] = 0;
2081 }
2082 else
2083 {
2084 size_t len;
2085
2086 len = scm_i_string_length (str);
2087 if (lenp)
2088 *lenp = len;
2089
2090 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2091 memcpy (result, scm_i_string_wide_chars (str),
2092 len * sizeof (scm_t_wchar));
2093 result[len] = 0;
2094 }
2095
2096 return result;
2097 }
2098 #undef FUNC_NAME
2099
2100 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2101 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2102 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2103 defined by HANDLER. */
2104 char *
2105 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2106 scm_t_string_failed_conversion_handler handler)
2107 {
2108 char *buf;
2109 size_t ilen, len, i;
2110 int ret;
2111 const char *enc;
2112
2113 if (!scm_is_string (str))
2114 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2115 ilen = scm_i_string_length (str);
2116
2117 if (ilen == 0)
2118 {
2119 buf = scm_malloc (1);
2120 buf[0] = '\0';
2121 if (lenp)
2122 *lenp = 0;
2123 return buf;
2124 }
2125
2126 if (lenp == NULL)
2127 for (i = 0; i < ilen; i++)
2128 if (scm_i_string_ref (str, i) == '\0')
2129 scm_misc_error (NULL,
2130 "string contains #\\nul character: ~S",
2131 scm_list_1 (str));
2132
2133 if (scm_i_is_narrow_string (str) && (encoding == NULL))
2134 {
2135 /* If using native Latin-1 encoding, just copy the string
2136 contents. */
2137 if (lenp)
2138 {
2139 buf = scm_malloc (ilen);
2140 memcpy (buf, scm_i_string_chars (str), ilen);
2141 *lenp = ilen;
2142 return buf;
2143 }
2144 else
2145 {
2146 buf = scm_malloc (ilen + 1);
2147 memcpy (buf, scm_i_string_chars (str), ilen);
2148 buf[ilen] = '\0';
2149 return buf;
2150 }
2151 }
2152
2153
2154 buf = NULL;
2155 len = 0;
2156 enc = encoding;
2157 if (enc == NULL)
2158 enc = "ISO-8859-1";
2159 if (scm_i_is_narrow_string (str))
2160 {
2161 ret = mem_iconveh (scm_i_string_chars (str), ilen,
2162 "ISO-8859-1", enc,
2163 (enum iconv_ilseq_handler) handler, NULL,
2164 &buf, &len);
2165
2166 if (ret != 0)
2167 scm_encoding_error (__func__, errno,
2168 "cannot convert narrow string to output locale",
2169 SCM_BOOL_F,
2170 /* FIXME: Faulty character unknown. */
2171 SCM_BOOL_F);
2172 }
2173 else
2174 {
2175 buf = u32_conv_to_encoding (enc,
2176 (enum iconv_ilseq_handler) handler,
2177 (scm_t_uint32 *) scm_i_string_wide_chars (str),
2178 ilen,
2179 NULL,
2180 NULL, &len);
2181 if (buf == NULL)
2182 scm_encoding_error (__func__, errno,
2183 "cannot convert wide string to output locale",
2184 SCM_BOOL_F,
2185 /* FIXME: Faulty character unknown. */
2186 SCM_BOOL_F);
2187 }
2188 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2189 {
2190 if (SCM_R6RS_ESCAPES_P)
2191 {
2192 /* The worst case is if the input string contains all 4-digit
2193 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2194 (seven characters). Make BUF large enough to hold
2195 that. */
2196 buf = scm_realloc (buf, (len * 7) / 6 + 1);
2197 unistring_escapes_to_r6rs_escapes (buf, &len);
2198 }
2199 else
2200 unistring_escapes_to_guile_escapes (buf, &len);
2201
2202 buf = scm_realloc (buf, len);
2203 }
2204 if (lenp)
2205 *lenp = len;
2206 else
2207 {
2208 buf = scm_realloc (buf, len + 1);
2209 buf[len] = '\0';
2210 }
2211
2212 scm_remember_upto_here_1 (str);
2213 return buf;
2214 }
2215
2216 size_t
2217 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2218 {
2219 size_t len;
2220 char *result = NULL;
2221 if (!scm_is_string (str))
2222 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2223 result = scm_to_locale_stringn (str, &len);
2224
2225 memcpy (buf, result, (len > max_len) ? max_len : len);
2226 free (result);
2227
2228 scm_remember_upto_here_1 (str);
2229 return len;
2230 }
2231
2232 \f
2233 /* Unicode string normalization. */
2234
2235 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2236 libguile/i18n.c. It would be useful to have this factored out into a more
2237 convenient location, but its use of alloca makes that tricky to do. */
2238
2239 static SCM
2240 normalize_str (SCM string, uninorm_t form)
2241 {
2242 SCM ret;
2243 scm_t_uint32 *w_str;
2244 scm_t_wchar *cbuf;
2245 size_t rlen, len = scm_i_string_length (string);
2246
2247 if (scm_i_is_narrow_string (string))
2248 {
2249 size_t i;
2250 const char *buf = scm_i_string_chars (string);
2251
2252 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2253
2254 for (i = 0; i < len; i ++)
2255 w_str[i] = (unsigned char) buf[i];
2256 w_str[len] = 0;
2257 }
2258 else
2259 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2260
2261 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2262
2263 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2264 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2265 free (w_str);
2266
2267 scm_i_try_narrow_string (ret);
2268
2269 return ret;
2270 }
2271
2272 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2273 (SCM string),
2274 "Returns the NFC normalized form of @var{string}.")
2275 #define FUNC_NAME s_scm_string_normalize_nfc
2276 {
2277 SCM_VALIDATE_STRING (1, string);
2278 return normalize_str (string, UNINORM_NFC);
2279 }
2280 #undef FUNC_NAME
2281
2282 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2283 (SCM string),
2284 "Returns the NFD normalized form of @var{string}.")
2285 #define FUNC_NAME s_scm_string_normalize_nfd
2286 {
2287 SCM_VALIDATE_STRING (1, string);
2288 return normalize_str (string, UNINORM_NFD);
2289 }
2290 #undef FUNC_NAME
2291
2292 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2293 (SCM string),
2294 "Returns the NFKC normalized form of @var{string}.")
2295 #define FUNC_NAME s_scm_string_normalize_nfkc
2296 {
2297 SCM_VALIDATE_STRING (1, string);
2298 return normalize_str (string, UNINORM_NFKC);
2299 }
2300 #undef FUNC_NAME
2301
2302 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2303 (SCM string),
2304 "Returns the NFKD normalized form of @var{string}.")
2305 #define FUNC_NAME s_scm_string_normalize_nfkd
2306 {
2307 SCM_VALIDATE_STRING (1, string);
2308 return normalize_str (string, UNINORM_NFKD);
2309 }
2310 #undef FUNC_NAME
2311
2312 /* converts C scm_array of strings to SCM scm_list of strings.
2313 If argc < 0, a null terminated scm_array is assumed.
2314 The current locale encoding is assumed */
2315 SCM
2316 scm_makfromstrs (int argc, char **argv)
2317 {
2318 int i = argc;
2319 SCM lst = SCM_EOL;
2320 if (0 > i)
2321 for (i = 0; argv[i]; i++);
2322 while (i--)
2323 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2324 return lst;
2325 }
2326
2327 /* Return a newly allocated array of char pointers to each of the strings
2328 in args, with a terminating NULL pointer. The strings are encoded using
2329 the current locale. */
2330
2331 char **
2332 scm_i_allocate_string_pointers (SCM list)
2333 #define FUNC_NAME "scm_i_allocate_string_pointers"
2334 {
2335 char **result;
2336 int list_len = scm_ilength (list);
2337 int i;
2338
2339 if (list_len < 0)
2340 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2341
2342 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2343 "string pointers");
2344 result[list_len] = NULL;
2345
2346 /* The list might have been modified in another thread, so
2347 we check LIST before each access.
2348 */
2349 for (i = 0; i < list_len && scm_is_pair (list); i++)
2350 {
2351 SCM str = SCM_CAR (list);
2352 size_t len; /* String length in bytes */
2353 char *c_str = scm_to_locale_stringn (str, &len);
2354
2355 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2356 scm_malloc to allocate the returned string, which must be
2357 explicitly deallocated. This forces us to copy the string a
2358 second time into a new buffer. Ideally there would be variants
2359 of scm_to_*_stringn that can return garbage-collected buffers. */
2360
2361 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2362 memcpy (result[i], c_str, len);
2363 result[i][len] = '\0';
2364 free (c_str);
2365
2366 list = SCM_CDR (list);
2367 }
2368
2369 return result;
2370 }
2371 #undef FUNC_NAME
2372
2373 void
2374 scm_i_get_substring_spec (size_t len,
2375 SCM start, size_t *cstart,
2376 SCM end, size_t *cend)
2377 {
2378 if (SCM_UNBNDP (start))
2379 *cstart = 0;
2380 else
2381 *cstart = scm_to_unsigned_integer (start, 0, len);
2382
2383 if (SCM_UNBNDP (end))
2384 *cend = len;
2385 else
2386 *cend = scm_to_unsigned_integer (end, *cstart, len);
2387 }
2388
2389 static SCM
2390 string_handle_ref (scm_t_array_handle *h, size_t index)
2391 {
2392 return scm_c_string_ref (h->array, index);
2393 }
2394
2395 static void
2396 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2397 {
2398 scm_c_string_set_x (h->array, index, val);
2399 }
2400
2401 static void
2402 string_get_handle (SCM v, scm_t_array_handle *h)
2403 {
2404 h->array = v;
2405 h->ndims = 1;
2406 h->dims = &h->dim0;
2407 h->dim0.lbnd = 0;
2408 h->dim0.ubnd = scm_c_string_length (v) - 1;
2409 h->dim0.inc = 1;
2410 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2411 h->elements = h->writable_elements = NULL;
2412 }
2413
2414 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2415 string_handle_ref, string_handle_set,
2416 string_get_handle)
2417 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2418
2419 void
2420 scm_init_strings ()
2421 {
2422 scm_nullstr = scm_i_make_string (0, NULL, 0);
2423
2424 #include "libguile/strings.x"
2425 }
2426
2427
2428 /*
2429 Local Variables:
2430 c-file-style: "gnu"
2431 End:
2432 */