Merge remote-tracking branch 'origin/stable-2.0'
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <alloca.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <uninorm.h>
30 #include <unistr.h>
31 #include <uniconv.h>
32 #include <c-strcase.h>
33
34 #include "striconveh.h"
35
36 #include "libguile/_scm.h"
37 #include "libguile/chars.h"
38 #include "libguile/root.h"
39 #include "libguile/strings.h"
40 #include "libguile/ports.h"
41 #include "libguile/ports-internal.h"
42 #include "libguile/error.h"
43 #include "libguile/generalized-vectors.h"
44 #include "libguile/deprecation.h"
45 #include "libguile/validate.h"
46 #include "libguile/private-options.h"
47
48 \f
49
50 /* {Strings}
51 */
52
53
54 /* Stringbufs
55 *
56 * XXX - keeping an accurate refcount during GC seems to be quite
57 * tricky, so we just keep score of whether a stringbuf might be
58 * shared, not whether it definitely is.
59 *
60 * The scheme I (mvo) tried to keep an accurate reference count would
61 * recount all strings that point to a stringbuf during the mark-phase
62 * of the GC. This was done since one cannot access the stringbuf of
63 * a string when that string is freed (in order to decrease the
64 * reference count). The memory of the stringbuf might have been
65 * reused already for something completely different.
66 *
67 * This recounted worked for a small number of threads beating on
68 * cow-strings, but it failed randomly with more than 10 threads, say.
69 * I couldn't figure out what went wrong, so I used the conservative
70 * approach implemented below.
71 *
72 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
73 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
74 * strings.
75 */
76
77 /* The size in words of the stringbuf header (type tag + size). */
78 #define STRINGBUF_HEADER_SIZE 2U
79
80 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
81
82 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
83 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
84
85 #define STRINGBUF_TAG scm_tc7_stringbuf
86 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
87 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
88
89 #define STRINGBUF_CONTENTS(buf) ((void *) \
90 SCM_CELL_OBJECT_LOC (buf, \
91 STRINGBUF_HEADER_SIZE))
92 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
93 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
94
95 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
96
97 #define SET_STRINGBUF_SHARED(buf) \
98 do \
99 { \
100 /* Don't modify BUF if it's already marked as shared since it might be \
101 a read-only, statically allocated stringbuf. */ \
102 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
103 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
104 } \
105 while (0)
106
107 #ifdef SCM_STRING_LENGTH_HISTOGRAM
108 static size_t lenhist[1001];
109 #endif
110
111 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
112 characters. */
113 static SCM
114 make_stringbuf (size_t len)
115 {
116 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
117 scm_i_symbol_chars, all stringbufs are null-terminated. Once
118 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
119 has been changed for scm_i_symbol_chars, this null-termination
120 can be dropped.
121 */
122
123 SCM buf;
124
125 #ifdef SCM_STRING_LENGTH_HISTOGRAM
126 if (len < 1000)
127 lenhist[len]++;
128 else
129 lenhist[1000]++;
130 #endif
131
132 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
133 "string"));
134
135 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
136 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
137
138 STRINGBUF_CHARS (buf)[len] = 0;
139
140 return buf;
141 }
142
143 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
144 characters. */
145 static SCM
146 make_wide_stringbuf (size_t len)
147 {
148 SCM buf;
149 size_t raw_len;
150
151 #ifdef SCM_STRING_LENGTH_HISTOGRAM
152 if (len < 1000)
153 lenhist[len]++;
154 else
155 lenhist[1000]++;
156 #endif
157
158 raw_len = (len + 1) * sizeof (scm_t_wchar);
159 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
160 "string"));
161
162 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
163 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
164
165 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
166
167 return buf;
168 }
169
170 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
171 characters from BUF. */
172 static SCM
173 wide_stringbuf (SCM buf)
174 {
175 SCM new_buf;
176
177 if (STRINGBUF_WIDE (buf))
178 new_buf = buf;
179 else
180 {
181 size_t i, len;
182 scm_t_wchar *mem;
183
184 len = STRINGBUF_LENGTH (buf);
185
186 new_buf = make_wide_stringbuf (len);
187
188 mem = STRINGBUF_WIDE_CHARS (new_buf);
189 for (i = 0; i < len; i++)
190 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
191 mem[len] = 0;
192 }
193
194 return new_buf;
195 }
196
197 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
198 characters from BUF, if possible. */
199 static SCM
200 narrow_stringbuf (SCM buf)
201 {
202 SCM new_buf;
203
204 if (!STRINGBUF_WIDE (buf))
205 new_buf = buf;
206 else
207 {
208 size_t i, len;
209 scm_t_wchar *wmem;
210 unsigned char *mem;
211
212 len = STRINGBUF_LENGTH (buf);
213 wmem = STRINGBUF_WIDE_CHARS (buf);
214
215 for (i = 0; i < len; i++)
216 if (wmem[i] > 0xFF)
217 /* BUF cannot be narrowed. */
218 return buf;
219
220 new_buf = make_stringbuf (len);
221
222 mem = STRINGBUF_CHARS (new_buf);
223 for (i = 0; i < len; i++)
224 mem[i] = (unsigned char) wmem[i];
225 mem[len] = 0;
226 }
227
228 return new_buf;
229 }
230
231 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
232
233 \f
234 /* Copy-on-write strings.
235 */
236
237 #define STRING_TAG scm_tc7_string
238
239 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
240 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
241 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
242
243 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
244 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
245
246 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
247
248 /* Read-only strings.
249 */
250
251 #define RO_STRING_TAG scm_tc7_ro_string
252 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
253
254 /* Mutation-sharing substrings
255 */
256
257 #define SH_STRING_TAG (scm_tc7_string + 0x100)
258
259 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
260 /* START and LENGTH as for STRINGs. */
261
262 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
263
264 SCM scm_nullstr;
265
266 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
267 characters. CHARSP, if not NULL, will be set to location of the
268 char array. If READ_ONLY_P, the returned string is read-only;
269 otherwise it is writable. */
270 SCM
271 scm_i_make_string (size_t len, char **charsp, int read_only_p)
272 {
273 static SCM null_stringbuf = SCM_BOOL_F;
274 SCM buf;
275 SCM res;
276
277 if (len == 0)
278 {
279 if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
280 {
281 null_stringbuf = make_stringbuf (0);
282 SET_STRINGBUF_SHARED (null_stringbuf);
283 }
284 buf = null_stringbuf;
285 }
286 else
287 buf = make_stringbuf (len);
288
289 if (charsp)
290 *charsp = (char *) STRINGBUF_CHARS (buf);
291 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
292 SCM_UNPACK (buf),
293 (scm_t_bits) 0, (scm_t_bits) len);
294 return res;
295 }
296
297 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
298 characters. CHARSP, if not NULL, will be set to location of the
299 character array. If READ_ONLY_P, the returned string is read-only;
300 otherwise it is writable. */
301 SCM
302 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
303 {
304 SCM buf = make_wide_stringbuf (len);
305 SCM res;
306 if (charsp)
307 *charsp = STRINGBUF_WIDE_CHARS (buf);
308 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
309 SCM_UNPACK (buf),
310 (scm_t_bits) 0, (scm_t_bits) len);
311 return res;
312 }
313
314 static void
315 validate_substring_args (SCM str, size_t start, size_t end)
316 {
317 if (!IS_STRING (str))
318 scm_wrong_type_arg_msg (NULL, 0, str, "string");
319 if (start > STRING_LENGTH (str))
320 scm_out_of_range (NULL, scm_from_size_t (start));
321 if (end > STRING_LENGTH (str) || end < start)
322 scm_out_of_range (NULL, scm_from_size_t (end));
323 }
324
325 static inline void
326 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
327 {
328 *start = STRING_START (*str);
329 if (IS_SH_STRING (*str))
330 {
331 *str = SH_STRING_STRING (*str);
332 *start += STRING_START (*str);
333 }
334 *buf = STRING_STRINGBUF (*str);
335 }
336
337 SCM
338 scm_i_substring (SCM str, size_t start, size_t end)
339 {
340 if (start == end)
341 return scm_i_make_string (0, NULL, 0);
342 else
343 {
344 SCM buf;
345 size_t str_start;
346 get_str_buf_start (&str, &buf, &str_start);
347 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
348 SET_STRINGBUF_SHARED (buf);
349 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
350 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
351 (scm_t_bits)str_start + start,
352 (scm_t_bits) end - start);
353 }
354 }
355
356 SCM
357 scm_i_substring_read_only (SCM str, size_t start, size_t end)
358 {
359 if (start == end)
360 return scm_i_make_string (0, NULL, 1);
361 else
362 {
363 SCM buf;
364 size_t str_start;
365 get_str_buf_start (&str, &buf, &str_start);
366 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
367 SET_STRINGBUF_SHARED (buf);
368 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
369 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
370 (scm_t_bits)str_start + start,
371 (scm_t_bits) end - start);
372 }
373 }
374
375 SCM
376 scm_i_substring_copy (SCM str, size_t start, size_t end)
377 {
378 if (start == end)
379 return scm_i_make_string (0, NULL, 0);
380 else
381 {
382 size_t len = end - start;
383 SCM buf, my_buf, substr;
384 size_t str_start;
385 int wide = 0;
386 get_str_buf_start (&str, &buf, &str_start);
387 if (scm_i_is_narrow_string (str))
388 {
389 my_buf = make_stringbuf (len);
390 memcpy (STRINGBUF_CHARS (my_buf),
391 STRINGBUF_CHARS (buf) + str_start + start, len);
392 }
393 else
394 {
395 my_buf = make_wide_stringbuf (len);
396 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
397 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
398 + start), len);
399 wide = 1;
400 }
401 scm_remember_upto_here_1 (buf);
402 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
403 (scm_t_bits) 0, (scm_t_bits) len);
404 if (wide)
405 scm_i_try_narrow_string (substr);
406 return substr;
407 }
408 }
409
410 SCM
411 scm_i_substring_shared (SCM str, size_t start, size_t end)
412 {
413 if (start == 0 && end == STRING_LENGTH (str))
414 return str;
415 else if (start == end)
416 return scm_i_make_string (0, NULL, 0);
417 else
418 {
419 size_t len = end - start;
420 if (IS_SH_STRING (str))
421 {
422 start += STRING_START (str);
423 str = SH_STRING_STRING (str);
424 }
425 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
426 (scm_t_bits)start, (scm_t_bits) len);
427 }
428 }
429
430 SCM
431 scm_c_substring (SCM str, size_t start, size_t end)
432 {
433 validate_substring_args (str, start, end);
434 return scm_i_substring (str, start, end);
435 }
436
437 SCM
438 scm_c_substring_read_only (SCM str, size_t start, size_t end)
439 {
440 validate_substring_args (str, start, end);
441 return scm_i_substring_read_only (str, start, end);
442 }
443
444 SCM
445 scm_c_substring_copy (SCM str, size_t start, size_t end)
446 {
447 validate_substring_args (str, start, end);
448 return scm_i_substring_copy (str, start, end);
449 }
450
451 SCM
452 scm_c_substring_shared (SCM str, size_t start, size_t end)
453 {
454 validate_substring_args (str, start, end);
455 return scm_i_substring_shared (str, start, end);
456 }
457
458 \f
459 /* Internal accessors
460 */
461
462 /* Returns the number of characters in STR. This may be different
463 than the memory size of the string storage. */
464 size_t
465 scm_i_string_length (SCM str)
466 {
467 return STRING_LENGTH (str);
468 }
469
470 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
471 encoding. False if it is 'wide', having a 32-bit UCS-4
472 encoding. */
473 int
474 scm_i_is_narrow_string (SCM str)
475 {
476 if (IS_SH_STRING (str))
477 str = SH_STRING_STRING (str);
478
479 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
480 }
481
482 /* Try to coerce a string to be narrow. It if is narrow already, do
483 nothing. If it is wide, shrink it to narrow if none of its
484 characters are above 0xFF. Return true if the string is narrow or
485 was made to be narrow. */
486 int
487 scm_i_try_narrow_string (SCM str)
488 {
489 if (IS_SH_STRING (str))
490 str = SH_STRING_STRING (str);
491
492 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
493
494 return scm_i_is_narrow_string (str);
495 }
496
497 /* Return a pointer to the raw data of the string, which can be either Latin-1
498 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
499 const void *
500 scm_i_string_data (SCM str)
501 {
502 SCM buf;
503 size_t start;
504 const char *data;
505
506 get_str_buf_start (&str, &buf, &start);
507
508 data = STRINGBUF_CONTENTS (buf);
509 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
510
511 return data;
512 }
513
514 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
515 STR. */
516 const char *
517 scm_i_string_chars (SCM str)
518 {
519 SCM buf;
520 size_t start;
521 get_str_buf_start (&str, &buf, &start);
522 if (scm_i_is_narrow_string (str))
523 return (const char *) STRINGBUF_CHARS (buf) + start;
524 else
525 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
526 scm_list_1 (str));
527 return NULL;
528 }
529
530 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
531 STR. */
532 const scm_t_wchar *
533 scm_i_string_wide_chars (SCM str)
534 {
535 SCM buf;
536 size_t start;
537
538 get_str_buf_start (&str, &buf, &start);
539 if (!scm_i_is_narrow_string (str))
540 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
541 else
542 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
543 scm_list_1 (str));
544 }
545
546 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
547 a new string buffer, so that it can be modified without modifying
548 other strings. Also, lock the string mutex. Later, one must call
549 scm_i_string_stop_writing to unlock the mutex. */
550 SCM
551 scm_i_string_start_writing (SCM orig_str)
552 {
553 SCM buf, str = orig_str;
554 size_t start;
555
556 get_str_buf_start (&str, &buf, &start);
557 if (IS_RO_STRING (str))
558 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
559
560 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
561 if (STRINGBUF_SHARED (buf))
562 {
563 /* Clone the stringbuf. */
564 size_t len = STRING_LENGTH (str);
565 SCM new_buf;
566
567 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
568
569 if (scm_i_is_narrow_string (str))
570 {
571 new_buf = make_stringbuf (len);
572 memcpy (STRINGBUF_CHARS (new_buf),
573 STRINGBUF_CHARS (buf) + STRING_START (str), len);
574
575 }
576 else
577 {
578 new_buf = make_wide_stringbuf (len);
579 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
580 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
581 + STRING_START (str)), len);
582 }
583
584 SET_STRING_STRINGBUF (str, new_buf);
585 start -= STRING_START (str);
586
587 /* FIXME: The following operations are not atomic, so other threads
588 looking at STR may see an inconsistent state. Nevertheless it can't
589 hurt much since (i) accessing STR while it is being mutated can't
590 yield a crash, and (ii) concurrent accesses to STR should be
591 protected by a mutex at the application level. The latter may not
592 apply when STR != ORIG_STR, though. */
593 SET_STRING_START (str, 0);
594 SET_STRING_STRINGBUF (str, new_buf);
595
596 buf = new_buf;
597
598 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
599 }
600 return orig_str;
601 }
602
603 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
604 char *
605 scm_i_string_writable_chars (SCM str)
606 {
607 SCM buf;
608 size_t start;
609
610 get_str_buf_start (&str, &buf, &start);
611 if (scm_i_is_narrow_string (str))
612 return (char *) STRINGBUF_CHARS (buf) + start;
613 else
614 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
615 scm_list_1 (str));
616 return NULL;
617 }
618
619 /* Return a pointer to the UCS-4 codepoints of a string. */
620 static scm_t_wchar *
621 scm_i_string_writable_wide_chars (SCM str)
622 {
623 SCM buf;
624 size_t start;
625
626 get_str_buf_start (&str, &buf, &start);
627 if (!scm_i_is_narrow_string (str))
628 return STRINGBUF_WIDE_CHARS (buf) + start;
629 else
630 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
631 scm_list_1 (str));
632 }
633
634 /* Unlock the string mutex that was locked when
635 scm_i_string_start_writing was called. */
636 void
637 scm_i_string_stop_writing (void)
638 {
639 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
640 }
641
642 /* Return the Xth character of STR as a UCS-4 codepoint. */
643 scm_t_wchar
644 scm_i_string_ref (SCM str, size_t x)
645 {
646 if (scm_i_is_narrow_string (str))
647 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
648 else
649 return scm_i_string_wide_chars (str)[x];
650 }
651
652 /* Returns index+1 of the first char in STR that matches C, or
653 0 if the char is not found. */
654 int
655 scm_i_string_contains_char (SCM str, char ch)
656 {
657 size_t i;
658 size_t len = scm_i_string_length (str);
659
660 i = 0;
661 if (scm_i_is_narrow_string (str))
662 {
663 while (i < len)
664 {
665 if (scm_i_string_chars (str)[i] == ch)
666 return i+1;
667 i++;
668 }
669 }
670 else
671 {
672 while (i < len)
673 {
674 if (scm_i_string_wide_chars (str)[i]
675 == (unsigned char) ch)
676 return i+1;
677 i++;
678 }
679 }
680 return 0;
681 }
682
683 int
684 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
685 {
686 if (scm_i_is_narrow_string (sstr))
687 {
688 const char *a = scm_i_string_chars (sstr) + start_x;
689 const char *b = cstr;
690 return strncmp (a, b, strlen(b));
691 }
692 else
693 {
694 size_t i;
695 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
696 const char *b = cstr;
697 for (i = 0; i < strlen (b); i++)
698 {
699 if (a[i] != (unsigned char) b[i])
700 return 1;
701 }
702 }
703 return 0;
704 }
705
706 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
707 void
708 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
709 {
710 if (IS_SH_STRING (str))
711 {
712 p += STRING_START (str);
713 str = SH_STRING_STRING (str);
714 }
715
716 if (chr > 0xFF && scm_i_is_narrow_string (str))
717 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
718
719 if (scm_i_is_narrow_string (str))
720 {
721 char *dst = scm_i_string_writable_chars (str);
722 dst[p] = chr;
723 }
724 else
725 {
726 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
727 dst[p] = chr;
728 }
729 }
730
731 \f
732 /* Symbols.
733
734 Basic symbol creation and accessing is done here, the rest is in
735 symbols.[hc]. This has been done to keep stringbufs and the
736 internals of strings and string-like objects confined to this file.
737 */
738
739 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
740
741 SCM
742 scm_i_make_symbol (SCM name, scm_t_bits flags,
743 unsigned long hash, SCM props)
744 {
745 SCM buf;
746 size_t start = STRING_START (name);
747 size_t length = STRING_LENGTH (name);
748
749 if (IS_SH_STRING (name))
750 {
751 name = SH_STRING_STRING (name);
752 start += STRING_START (name);
753 }
754 buf = STRING_STRINGBUF (name);
755
756 if (start == 0 && length == STRINGBUF_LENGTH (buf))
757 {
758 /* reuse buf. */
759 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
760 SET_STRINGBUF_SHARED (buf);
761 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
762 }
763 else
764 {
765 /* make new buf. */
766 if (scm_i_is_narrow_string (name))
767 {
768 SCM new_buf = make_stringbuf (length);
769 memcpy (STRINGBUF_CHARS (new_buf),
770 STRINGBUF_CHARS (buf) + start, length);
771 buf = new_buf;
772 }
773 else
774 {
775 SCM new_buf = make_wide_stringbuf (length);
776 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
777 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
778 length);
779 buf = new_buf;
780 }
781 }
782 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
783 (scm_t_bits) hash, SCM_UNPACK (props));
784 }
785
786 SCM
787 scm_i_c_make_symbol (const char *name, size_t len,
788 scm_t_bits flags, unsigned long hash, SCM props)
789 {
790 SCM buf = make_stringbuf (len);
791 memcpy (STRINGBUF_CHARS (buf), name, len);
792
793 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
794 (scm_t_bits) hash, SCM_UNPACK (props));
795 }
796
797 /* Returns the number of characters in SYM. This may be different
798 from the memory size of SYM. */
799 size_t
800 scm_i_symbol_length (SCM sym)
801 {
802 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
803 }
804
805 size_t
806 scm_c_symbol_length (SCM sym)
807 #define FUNC_NAME "scm_c_symbol_length"
808 {
809 SCM_VALIDATE_SYMBOL (1, sym);
810
811 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
812 }
813 #undef FUNC_NAME
814
815 /* True if the name of SYM is stored as a Latin-1 encoded string.
816 False if it is stored as a 32-bit UCS-4-encoded string. */
817 int
818 scm_i_is_narrow_symbol (SCM sym)
819 {
820 SCM buf;
821
822 buf = SYMBOL_STRINGBUF (sym);
823 return !STRINGBUF_WIDE (buf);
824 }
825
826 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
827 contains the name of SYM. */
828 const char *
829 scm_i_symbol_chars (SCM sym)
830 {
831 SCM buf;
832
833 buf = SYMBOL_STRINGBUF (sym);
834 if (!STRINGBUF_WIDE (buf))
835 return (const char *) STRINGBUF_CHARS (buf);
836 else
837 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
838 scm_list_1 (sym));
839 }
840
841 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
842 symbol's name. */
843 const scm_t_wchar *
844 scm_i_symbol_wide_chars (SCM sym)
845 {
846 SCM buf;
847
848 buf = SYMBOL_STRINGBUF (sym);
849 if (STRINGBUF_WIDE (buf))
850 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
851 else
852 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
853 scm_list_1 (sym));
854 }
855
856 SCM
857 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
858 {
859 SCM buf = SYMBOL_STRINGBUF (sym);
860 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
861 SET_STRINGBUF_SHARED (buf);
862 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
863 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
864 (scm_t_bits)start, (scm_t_bits) end - start);
865 }
866
867 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
868 scm_t_wchar
869 scm_i_symbol_ref (SCM sym, size_t x)
870 {
871 if (scm_i_is_narrow_symbol (sym))
872 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
873 else
874 return scm_i_symbol_wide_chars (sym)[x];
875 }
876
877 /* Debugging
878 */
879
880 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
881 "Returns an association list containing debugging information\n"
882 "for @var{str}. The association list has the following entries."
883 "@table @code\n"
884 "@item string\n"
885 "The string itself.\n"
886 "@item start\n"
887 "The start index of the string into its stringbuf\n"
888 "@item length\n"
889 "The length of the string\n"
890 "@item shared\n"
891 "If this string is a substring, it returns its parent string.\n"
892 "Otherwise, it returns @code{#f}\n"
893 "@item read-only\n"
894 "@code{#t} if the string is read-only\n"
895 "@item stringbuf-chars\n"
896 "A new string containing this string's stringbuf's characters\n"
897 "@item stringbuf-length\n"
898 "The number of characters in this stringbuf\n"
899 "@item stringbuf-shared\n"
900 "@code{#t} if this stringbuf is shared\n"
901 "@item stringbuf-wide\n"
902 "@code{#t} if this stringbuf's characters are stored in a\n"
903 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
904 "buffer\n"
905 "@end table")
906 #define FUNC_NAME s_scm_sys_string_dump
907 {
908 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
909 SCM buf;
910 SCM_VALIDATE_STRING (1, str);
911
912 /* String info */
913 e1 = scm_cons (scm_from_latin1_symbol ("string"),
914 str);
915 e2 = scm_cons (scm_from_latin1_symbol ("start"),
916 scm_from_size_t (STRING_START (str)));
917 e3 = scm_cons (scm_from_latin1_symbol ("length"),
918 scm_from_size_t (STRING_LENGTH (str)));
919
920 if (IS_SH_STRING (str))
921 {
922 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
923 SH_STRING_STRING (str));
924 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
925 }
926 else
927 {
928 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
929 SCM_BOOL_F);
930 buf = STRING_STRINGBUF (str);
931 }
932
933 if (IS_RO_STRING (str))
934 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
935 SCM_BOOL_T);
936 else
937 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
938 SCM_BOOL_F);
939
940 /* Stringbuf info */
941 if (!STRINGBUF_WIDE (buf))
942 {
943 size_t len = STRINGBUF_LENGTH (buf);
944 char *cbuf;
945 SCM sbc = scm_i_make_string (len, &cbuf, 0);
946 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
947 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
948 sbc);
949 }
950 else
951 {
952 size_t len = STRINGBUF_LENGTH (buf);
953 scm_t_wchar *cbuf;
954 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
955 u32_cpy ((scm_t_uint32 *) cbuf,
956 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
957 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
958 sbc);
959 }
960 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
961 scm_from_size_t (STRINGBUF_LENGTH (buf)));
962 if (STRINGBUF_SHARED (buf))
963 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
964 SCM_BOOL_T);
965 else
966 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
967 SCM_BOOL_F);
968 if (STRINGBUF_WIDE (buf))
969 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
970 SCM_BOOL_T);
971 else
972 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
973 SCM_BOOL_F);
974
975 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
976 }
977 #undef FUNC_NAME
978
979 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
980 "Returns an association list containing debugging information\n"
981 "for @var{sym}. The association list has the following entries."
982 "@table @code\n"
983 "@item symbol\n"
984 "The symbol itself\n"
985 "@item hash\n"
986 "Its hash value\n"
987 "@item interned\n"
988 "@code{#t} if it is an interned symbol\n"
989 "@item stringbuf-chars\n"
990 "A new string containing this symbols's stringbuf's characters\n"
991 "@item stringbuf-length\n"
992 "The number of characters in this stringbuf\n"
993 "@item stringbuf-shared\n"
994 "@code{#t} if this stringbuf is shared\n"
995 "@item stringbuf-wide\n"
996 "@code{#t} if this stringbuf's characters are stored in a\n"
997 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
998 "buffer\n"
999 "@end table")
1000 #define FUNC_NAME s_scm_sys_symbol_dump
1001 {
1002 SCM e1, e2, e3, e4, e5, e6, e7;
1003 SCM buf;
1004 SCM_VALIDATE_SYMBOL (1, sym);
1005 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1006 sym);
1007 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1008 scm_from_ulong (scm_i_symbol_hash (sym)));
1009 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1010 scm_symbol_interned_p (sym));
1011 buf = SYMBOL_STRINGBUF (sym);
1012
1013 /* Stringbuf info */
1014 if (!STRINGBUF_WIDE (buf))
1015 {
1016 size_t len = STRINGBUF_LENGTH (buf);
1017 char *cbuf;
1018 SCM sbc = scm_i_make_string (len, &cbuf, 0);
1019 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1020 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1021 sbc);
1022 }
1023 else
1024 {
1025 size_t len = STRINGBUF_LENGTH (buf);
1026 scm_t_wchar *cbuf;
1027 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1028 u32_cpy ((scm_t_uint32 *) cbuf,
1029 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1030 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1031 sbc);
1032 }
1033 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1034 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1035 if (STRINGBUF_SHARED (buf))
1036 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1037 SCM_BOOL_T);
1038 else
1039 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1040 SCM_BOOL_F);
1041 if (STRINGBUF_WIDE (buf))
1042 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1043 SCM_BOOL_T);
1044 else
1045 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1046 SCM_BOOL_F);
1047 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1048
1049 }
1050 #undef FUNC_NAME
1051
1052 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1053
1054 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1055 #define FUNC_NAME s_scm_sys_stringbuf_hist
1056 {
1057 int i;
1058 for (i = 0; i < 1000; i++)
1059 if (lenhist[i])
1060 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1061 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1062 return SCM_UNSPECIFIED;
1063 }
1064 #undef FUNC_NAME
1065
1066 #endif
1067
1068 \f
1069
1070 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1071 (SCM obj),
1072 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1073 #define FUNC_NAME s_scm_string_p
1074 {
1075 return scm_from_bool (IS_STRING (obj));
1076 }
1077 #undef FUNC_NAME
1078
1079
1080 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1081
1082 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1083 (SCM chrs),
1084 "@deffnx {Scheme Procedure} list->string chrs\n"
1085 "Return a newly allocated string composed of the arguments,\n"
1086 "@var{chrs}.")
1087 #define FUNC_NAME s_scm_string
1088 {
1089 SCM result = SCM_BOOL_F;
1090 SCM rest;
1091 size_t len;
1092 size_t p = 0;
1093 long i;
1094 int wide = 0;
1095
1096 /* Verify that this is a list of chars. */
1097 i = scm_ilength (chrs);
1098 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1099
1100 len = (size_t) i;
1101 rest = chrs;
1102
1103 while (len > 0 && scm_is_pair (rest))
1104 {
1105 SCM elt = SCM_CAR (rest);
1106 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1107 if (SCM_CHAR (elt) > 0xFF)
1108 wide = 1;
1109 rest = SCM_CDR (rest);
1110 len--;
1111 scm_remember_upto_here_1 (elt);
1112 }
1113
1114 /* Construct a string containing this list of chars. */
1115 len = (size_t) i;
1116 rest = chrs;
1117
1118 if (wide == 0)
1119 {
1120 char *buf;
1121
1122 result = scm_i_make_string (len, NULL, 0);
1123 result = scm_i_string_start_writing (result);
1124 buf = scm_i_string_writable_chars (result);
1125 while (len > 0 && scm_is_pair (rest))
1126 {
1127 SCM elt = SCM_CAR (rest);
1128 buf[p] = (unsigned char) SCM_CHAR (elt);
1129 p++;
1130 rest = SCM_CDR (rest);
1131 len--;
1132 scm_remember_upto_here_1 (elt);
1133 }
1134 }
1135 else
1136 {
1137 scm_t_wchar *buf;
1138
1139 result = scm_i_make_wide_string (len, NULL, 0);
1140 result = scm_i_string_start_writing (result);
1141 buf = scm_i_string_writable_wide_chars (result);
1142 while (len > 0 && scm_is_pair (rest))
1143 {
1144 SCM elt = SCM_CAR (rest);
1145 buf[p] = SCM_CHAR (elt);
1146 p++;
1147 rest = SCM_CDR (rest);
1148 len--;
1149 scm_remember_upto_here_1 (elt);
1150 }
1151 }
1152 scm_i_string_stop_writing ();
1153
1154 if (len > 0)
1155 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1156 if (!scm_is_null (rest))
1157 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1158
1159 return result;
1160 }
1161 #undef FUNC_NAME
1162
1163 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1164 (SCM k, SCM chr),
1165 "Return a newly allocated string of\n"
1166 "length @var{k}. If @var{chr} is given, then all elements of\n"
1167 "the string are initialized to @var{chr}, otherwise the contents\n"
1168 "of the string are all set to @code{#\nul}.")
1169 #define FUNC_NAME s_scm_make_string
1170 {
1171 return scm_c_make_string (scm_to_size_t (k), chr);
1172 }
1173 #undef FUNC_NAME
1174
1175 SCM
1176 scm_c_make_string (size_t len, SCM chr)
1177 #define FUNC_NAME NULL
1178 {
1179 size_t p;
1180 char *contents = NULL;
1181 SCM res = scm_i_make_string (len, &contents, 0);
1182
1183 /* If no char is given, initialize string contents to NULL. */
1184 if (SCM_UNBNDP (chr))
1185 memset (contents, 0, len);
1186 else
1187 {
1188 SCM_VALIDATE_CHAR (0, chr);
1189 res = scm_i_string_start_writing (res);
1190 for (p = 0; p < len; p++)
1191 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1192 scm_i_string_stop_writing ();
1193 }
1194
1195 return res;
1196 }
1197 #undef FUNC_NAME
1198
1199 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1200 (SCM string),
1201 "Return the number of characters in @var{string}.")
1202 #define FUNC_NAME s_scm_string_length
1203 {
1204 SCM_VALIDATE_STRING (1, string);
1205 return scm_from_size_t (STRING_LENGTH (string));
1206 }
1207 #undef FUNC_NAME
1208
1209 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1210 (SCM string),
1211 "Return the bytes used to represent a character in @var{string}."
1212 "This will return 1 or 4.")
1213 #define FUNC_NAME s_scm_string_bytes_per_char
1214 {
1215 SCM_VALIDATE_STRING (1, string);
1216 if (!scm_i_is_narrow_string (string))
1217 return scm_from_int (4);
1218
1219 return scm_from_int (1);
1220 }
1221 #undef FUNC_NAME
1222
1223 size_t
1224 scm_c_string_length (SCM string)
1225 {
1226 if (!IS_STRING (string))
1227 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1228 return STRING_LENGTH (string);
1229 }
1230
1231 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1232 (SCM str, SCM k),
1233 "Return character @var{k} of @var{str} using zero-origin\n"
1234 "indexing. @var{k} must be a valid index of @var{str}.")
1235 #define FUNC_NAME s_scm_string_ref
1236 {
1237 size_t len;
1238 unsigned long idx;
1239
1240 SCM_VALIDATE_STRING (1, str);
1241
1242 len = scm_i_string_length (str);
1243 if (SCM_LIKELY (len > 0))
1244 idx = scm_to_unsigned_integer (k, 0, len - 1);
1245 else
1246 scm_out_of_range (NULL, k);
1247
1248 if (scm_i_is_narrow_string (str))
1249 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1250 else
1251 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1252 }
1253 #undef FUNC_NAME
1254
1255 SCM
1256 scm_c_string_ref (SCM str, size_t p)
1257 {
1258 if (p >= scm_i_string_length (str))
1259 scm_out_of_range (NULL, scm_from_size_t (p));
1260 if (scm_i_is_narrow_string (str))
1261 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1262 else
1263 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1264
1265 }
1266
1267 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1268 (SCM str, SCM k, SCM chr),
1269 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1270 "an unspecified value. @var{k} must be a valid index of\n"
1271 "@var{str}.")
1272 #define FUNC_NAME s_scm_string_set_x
1273 {
1274 size_t len;
1275 unsigned long idx;
1276
1277 SCM_VALIDATE_STRING (1, str);
1278
1279 len = scm_i_string_length (str);
1280 if (SCM_LIKELY (len > 0))
1281 idx = scm_to_unsigned_integer (k, 0, len - 1);
1282 else
1283 scm_out_of_range (NULL, k);
1284
1285 SCM_VALIDATE_CHAR (3, chr);
1286 str = scm_i_string_start_writing (str);
1287 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1288 scm_i_string_stop_writing ();
1289
1290 return SCM_UNSPECIFIED;
1291 }
1292 #undef FUNC_NAME
1293
1294 void
1295 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1296 {
1297 if (p >= scm_i_string_length (str))
1298 scm_out_of_range (NULL, scm_from_size_t (p));
1299 str = scm_i_string_start_writing (str);
1300 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1301 scm_i_string_stop_writing ();
1302 }
1303
1304 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1305 (SCM str, SCM start, SCM end),
1306 "Return a newly allocated string formed from the characters\n"
1307 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1308 "ending with index @var{end} (exclusive).\n"
1309 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1310 "exact integers satisfying:\n\n"
1311 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1312 #define FUNC_NAME s_scm_substring
1313 {
1314 size_t len, from, to;
1315
1316 SCM_VALIDATE_STRING (1, str);
1317 len = scm_i_string_length (str);
1318 from = scm_to_unsigned_integer (start, 0, len);
1319 if (SCM_UNBNDP (end))
1320 to = len;
1321 else
1322 to = scm_to_unsigned_integer (end, from, len);
1323 return scm_i_substring (str, from, to);
1324 }
1325 #undef FUNC_NAME
1326
1327 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1328 (SCM str, SCM start, SCM end),
1329 "Return a newly allocated string formed from the characters\n"
1330 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1331 "ending with index @var{end} (exclusive).\n"
1332 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1333 "exact integers satisfying:\n"
1334 "\n"
1335 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1336 "\n"
1337 "The returned string is read-only.\n")
1338 #define FUNC_NAME s_scm_substring_read_only
1339 {
1340 size_t len, from, to;
1341
1342 SCM_VALIDATE_STRING (1, str);
1343 len = scm_i_string_length (str);
1344 from = scm_to_unsigned_integer (start, 0, len);
1345 if (SCM_UNBNDP (end))
1346 to = len;
1347 else
1348 to = scm_to_unsigned_integer (end, from, len);
1349 return scm_i_substring_read_only (str, from, to);
1350 }
1351 #undef FUNC_NAME
1352
1353 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1354 (SCM str, SCM start, SCM end),
1355 "Return a newly allocated string formed from the characters\n"
1356 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1357 "ending with index @var{end} (exclusive).\n"
1358 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1359 "exact integers satisfying:\n\n"
1360 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1361 #define FUNC_NAME s_scm_substring_copy
1362 {
1363 /* For the Scheme version, START is mandatory, but for the C
1364 version, it is optional. See scm_string_copy in srfi-13.c for a
1365 rationale.
1366 */
1367
1368 size_t from, to;
1369
1370 SCM_VALIDATE_STRING (1, str);
1371 scm_i_get_substring_spec (scm_i_string_length (str),
1372 start, &from, end, &to);
1373 return scm_i_substring_copy (str, from, to);
1374 }
1375 #undef FUNC_NAME
1376
1377 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1378 (SCM str, SCM start, SCM end),
1379 "Return string that indirectly refers to the characters\n"
1380 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1381 "ending with index @var{end} (exclusive).\n"
1382 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1383 "exact integers satisfying:\n\n"
1384 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1385 #define FUNC_NAME s_scm_substring_shared
1386 {
1387 size_t len, from, to;
1388
1389 SCM_VALIDATE_STRING (1, str);
1390 len = scm_i_string_length (str);
1391 from = scm_to_unsigned_integer (start, 0, len);
1392 if (SCM_UNBNDP (end))
1393 to = len;
1394 else
1395 to = scm_to_unsigned_integer (end, from, len);
1396 return scm_i_substring_shared (str, from, to);
1397 }
1398 #undef FUNC_NAME
1399
1400 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1401 (SCM args),
1402 "Return a newly allocated string whose characters form the\n"
1403 "concatenation of the given strings, @var{args}.")
1404 #define FUNC_NAME s_scm_string_append
1405 {
1406 SCM res;
1407 size_t total = 0;
1408 size_t len;
1409 int wide = 0;
1410 SCM l, s;
1411 size_t i;
1412 union
1413 {
1414 char *narrow;
1415 scm_t_wchar *wide;
1416 } data;
1417
1418 SCM_VALIDATE_REST_ARGUMENT (args);
1419 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1420 {
1421 s = SCM_CAR (l);
1422 SCM_VALIDATE_STRING (SCM_ARGn, s);
1423 len = scm_i_string_length (s);
1424 if (((size_t) -1) - total < len)
1425 scm_num_overflow (s_scm_string_append);
1426 total += len;
1427 if (!scm_i_is_narrow_string (s))
1428 wide = 1;
1429 }
1430 data.narrow = NULL;
1431 if (!wide)
1432 res = scm_i_make_string (total, &data.narrow, 0);
1433 else
1434 res = scm_i_make_wide_string (total, &data.wide, 0);
1435
1436 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1437 {
1438 size_t len;
1439 s = SCM_CAR (l);
1440 SCM_VALIDATE_STRING (SCM_ARGn, s);
1441 len = scm_i_string_length (s);
1442 if (len > total)
1443 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1444 if (!wide)
1445 {
1446 memcpy (data.narrow, scm_i_string_chars (s), len);
1447 data.narrow += len;
1448 }
1449 else
1450 {
1451 if (scm_i_is_narrow_string (s))
1452 {
1453 const char *src = scm_i_string_chars (s);
1454 for (i = 0; i < len; i++)
1455 data.wide[i] = (unsigned char) src[i];
1456 }
1457 else
1458 u32_cpy ((scm_t_uint32 *) data.wide,
1459 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1460 data.wide += len;
1461 }
1462 total -= len;
1463 scm_remember_upto_here_1 (s);
1464 }
1465 if (total != 0)
1466 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1467 return res;
1468 }
1469 #undef FUNC_NAME
1470
1471
1472 \f
1473 /* Charset conversion error handling. */
1474
1475 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1476 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1477
1478 /* Raise an exception informing that character CHR could not be written
1479 to PORT in its current encoding. */
1480 void
1481 scm_encoding_error (const char *subr, int err, const char *message,
1482 SCM port, SCM chr)
1483 {
1484 scm_throw (scm_encoding_error_key,
1485 scm_list_n (scm_from_latin1_string (subr),
1486 scm_from_latin1_string (message),
1487 scm_from_int (err),
1488 port, chr,
1489 SCM_UNDEFINED));
1490 }
1491
1492 /* Raise an exception informing of an encoding error on PORT. This
1493 means that a character could not be written in PORT's encoding. */
1494 void
1495 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1496 {
1497 scm_throw (scm_decoding_error_key,
1498 scm_list_n (scm_from_latin1_string (subr),
1499 scm_from_latin1_string (message),
1500 scm_from_int (err),
1501 port,
1502 SCM_UNDEFINED));
1503 }
1504
1505 \f
1506 /* String conversion to/from C. */
1507
1508 static void
1509 decoding_error (const char *func_name, int errno_save,
1510 const char *str, size_t len)
1511 {
1512 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1513 handler. */
1514 SCM bv;
1515 signed char *buf;
1516
1517 buf = scm_gc_malloc_pointerless (len, "bytevector");
1518 memcpy (buf, str, len);
1519 bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1520
1521 scm_decoding_error (func_name, errno_save,
1522 "input locale conversion error", bv);
1523 }
1524
1525 SCM
1526 scm_from_stringn (const char *str, size_t len, const char *encoding,
1527 scm_t_string_failed_conversion_handler handler)
1528 {
1529 size_t u32len, i;
1530 scm_t_wchar *u32;
1531 int wide = 0;
1532 SCM res;
1533
1534 /* The order of these checks is important. */
1535 if (!str && len != 0)
1536 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1537 if (len == (size_t) -1)
1538 len = strlen (str);
1539
1540 if (c_strcasecmp (encoding, "ISO-8859-1") == 0 || len == 0)
1541 return scm_from_latin1_stringn (str, len);
1542 else if (c_strcasecmp (encoding, "UTF-8") == 0
1543 && handler == SCM_FAILED_CONVERSION_ERROR)
1544 return scm_from_utf8_stringn (str, len);
1545
1546 u32len = 0;
1547 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1548 (enum iconv_ilseq_handler)
1549 handler,
1550 str, len,
1551 NULL,
1552 NULL, &u32len);
1553
1554 if (SCM_UNLIKELY (u32 == NULL))
1555 decoding_error (__func__, errno, str, len);
1556
1557 i = 0;
1558 while (i < u32len)
1559 if (u32[i++] > 0xFF)
1560 {
1561 wide = 1;
1562 break;
1563 }
1564
1565 if (!wide)
1566 {
1567 char *dst;
1568 res = scm_i_make_string (u32len, &dst, 0);
1569 for (i = 0; i < u32len; i ++)
1570 dst[i] = (unsigned char) u32[i];
1571 dst[u32len] = '\0';
1572 }
1573 else
1574 {
1575 scm_t_wchar *wdst;
1576 res = scm_i_make_wide_string (u32len, &wdst, 0);
1577 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1578 wdst[u32len] = 0;
1579 }
1580
1581 free (u32);
1582 return res;
1583 }
1584
1585 SCM
1586 scm_from_locale_string (const char *str)
1587 {
1588 return scm_from_locale_stringn (str, -1);
1589 }
1590
1591 SCM
1592 scm_from_locale_stringn (const char *str, size_t len)
1593 {
1594 return scm_from_stringn (str, len, locale_charset (),
1595 scm_i_default_port_conversion_handler ());
1596 }
1597
1598 SCM
1599 scm_from_latin1_string (const char *str)
1600 {
1601 return scm_from_latin1_stringn (str, -1);
1602 }
1603
1604 SCM
1605 scm_from_latin1_stringn (const char *str, size_t len)
1606 {
1607 char *buf;
1608 SCM result;
1609
1610 if (len == (size_t) -1)
1611 len = strlen (str);
1612
1613 /* Make a narrow string and copy STR as is. */
1614 result = scm_i_make_string (len, &buf, 0);
1615 memcpy (buf, str, len);
1616
1617 return result;
1618 }
1619
1620 SCM
1621 scm_from_utf8_string (const char *str)
1622 {
1623 return scm_from_utf8_stringn (str, -1);
1624 }
1625
1626 SCM
1627 scm_from_utf8_stringn (const char *str, size_t len)
1628 {
1629 size_t i, char_len;
1630 const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1631 int ascii = 1, narrow = 1;
1632 SCM res;
1633
1634 if (len == (size_t) -1)
1635 len = strlen (str);
1636
1637 i = 0;
1638 char_len = 0;
1639
1640 while (i < len)
1641 {
1642 if (ustr[i] <= 127)
1643 {
1644 char_len++;
1645 i++;
1646 }
1647 else
1648 {
1649 ucs4_t c;
1650 int nbytes;
1651
1652 ascii = 0;
1653
1654 nbytes = u8_mbtouc (&c, ustr + i, len - i);
1655
1656 if (c == 0xfffd)
1657 /* Bad UTF-8. */
1658 decoding_error (__func__, errno, str, len);
1659
1660 if (c > 255)
1661 narrow = 0;
1662
1663 char_len++;
1664 i += nbytes;
1665 }
1666 }
1667
1668 if (ascii)
1669 {
1670 char *dst;
1671 res = scm_i_make_string (char_len, &dst, 0);
1672 memcpy (dst, str, len);
1673 }
1674 else if (narrow)
1675 {
1676 char *dst;
1677 size_t j;
1678 ucs4_t c;
1679
1680 res = scm_i_make_string (char_len, &dst, 0);
1681
1682 for (i = 0, j = 0; i < len; j++)
1683 {
1684 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1685 dst[j] = (signed char) c;
1686 }
1687 }
1688 else
1689 {
1690 scm_t_wchar *dst;
1691 size_t j;
1692 ucs4_t c;
1693
1694 res = scm_i_make_wide_string (char_len, &dst, 0);
1695
1696 for (i = 0, j = 0; i < len; j++)
1697 {
1698 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1699 dst[j] = c;
1700 }
1701 }
1702
1703 return res;
1704 }
1705
1706 SCM
1707 scm_from_utf32_string (const scm_t_wchar *str)
1708 {
1709 return scm_from_utf32_stringn (str, -1);
1710 }
1711
1712 SCM
1713 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1714 {
1715 SCM result;
1716 scm_t_wchar *buf;
1717
1718 if (len == (size_t) -1)
1719 len = u32_strlen ((uint32_t *) str);
1720
1721 result = scm_i_make_wide_string (len, &buf, 0);
1722 memcpy (buf, str, len * sizeof (scm_t_wchar));
1723 scm_i_try_narrow_string (result);
1724
1725 return result;
1726 }
1727
1728 SCM
1729 scm_from_port_string (const char *str, SCM port)
1730 {
1731 return scm_from_port_stringn (str, -1, port);
1732 }
1733
1734 SCM
1735 scm_from_port_stringn (const char *str, size_t len, SCM port)
1736 {
1737 scm_t_port *pt = SCM_PTAB_ENTRY (port);
1738 scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
1739
1740 if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
1741 return scm_from_latin1_stringn (str, len);
1742 else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
1743 && (pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR
1744 || (u8_check ((uint8_t *) str, len) == NULL)))
1745 return scm_from_utf8_stringn (str, len);
1746 else
1747 return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
1748 }
1749
1750 /* Create a new scheme string from the C string STR. The memory of
1751 STR may be used directly as storage for the new string. */
1752 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1753 would be to register a finalizer to eventually free(3) STR, which isn't
1754 worth it. Should we just deprecate the `scm_take_' functions? */
1755 SCM
1756 scm_take_locale_stringn (char *str, size_t len)
1757 {
1758 SCM res;
1759
1760 res = scm_from_locale_stringn (str, len);
1761 free (str);
1762
1763 return res;
1764 }
1765
1766 SCM
1767 scm_take_locale_string (char *str)
1768 {
1769 return scm_take_locale_stringn (str, -1);
1770 }
1771
1772 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1773 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1774 Set *LENP to the size of the resulting string.
1775
1776 FIXME: This is a hack we should get rid of. See
1777 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1778 for details. */
1779 static void
1780 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1781 {
1782 char *before, *after;
1783 size_t i, j;
1784
1785 before = buf;
1786 after = buf;
1787 i = 0;
1788 j = 0;
1789 while (i < *lenp)
1790 {
1791 if ((i <= *lenp - 6)
1792 && before[i] == '\\'
1793 && before[i + 1] == 'u'
1794 && before[i + 2] == '0' && before[i + 3] == '0')
1795 {
1796 /* Convert \u00NN to \xNN */
1797 after[j] = '\\';
1798 after[j + 1] = 'x';
1799 after[j + 2] = tolower ((int) before[i + 4]);
1800 after[j + 3] = tolower ((int) before[i + 5]);
1801 i += 6;
1802 j += 4;
1803 }
1804 else if ((i <= *lenp - 10)
1805 && before[i] == '\\'
1806 && before[i + 1] == 'U'
1807 && before[i + 2] == '0' && before[i + 3] == '0')
1808 {
1809 /* Convert \U00NNNNNN to \UNNNNNN */
1810 after[j] = '\\';
1811 after[j + 1] = 'U';
1812 after[j + 2] = tolower ((int) before[i + 4]);
1813 after[j + 3] = tolower ((int) before[i + 5]);
1814 after[j + 4] = tolower ((int) before[i + 6]);
1815 after[j + 5] = tolower ((int) before[i + 7]);
1816 after[j + 6] = tolower ((int) before[i + 8]);
1817 after[j + 7] = tolower ((int) before[i + 9]);
1818 i += 10;
1819 j += 8;
1820 }
1821 else
1822 {
1823 after[j] = before[i];
1824 i++;
1825 j++;
1826 }
1827 }
1828 *lenp = j;
1829 }
1830
1831 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1832 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1833 of the resulting string. BUF must be large enough to handle the
1834 worst case when `\uXXXX' escapes (6 characters) are replaced by
1835 `\xXXXX;' (7 characters). */
1836 static void
1837 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1838 {
1839 char *before, *after;
1840 size_t i, j;
1841 /* The worst case is if the input string contains all 4-digit hex escapes.
1842 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1843 size_t max_out_len = (*lenp * 7) / 6 + 1;
1844 size_t nzeros, ndigits;
1845
1846 before = buf;
1847 after = alloca (max_out_len);
1848 i = 0;
1849 j = 0;
1850 while (i < *lenp)
1851 {
1852 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1853 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1854 {
1855 if (before[i + 1] == 'u')
1856 ndigits = 4;
1857 else if (before[i + 1] == 'U')
1858 ndigits = 8;
1859 else
1860 abort ();
1861
1862 /* Add the R6RS hex escape initial sequence. */
1863 after[j] = '\\';
1864 after[j + 1] = 'x';
1865
1866 /* Move string positions to the start of the hex numbers. */
1867 i += 2;
1868 j += 2;
1869
1870 /* Find the number of initial zeros in this hex number. */
1871 nzeros = 0;
1872 while (before[i + nzeros] == '0' && nzeros < ndigits)
1873 nzeros++;
1874
1875 /* Copy the number, skipping initial zeros, and then move the string
1876 positions. */
1877 if (nzeros == ndigits)
1878 {
1879 after[j] = '0';
1880 i += ndigits;
1881 j += 1;
1882 }
1883 else
1884 {
1885 int pos;
1886 for (pos = 0; pos < ndigits - nzeros; pos++)
1887 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1888 i += ndigits;
1889 j += (ndigits - nzeros);
1890 }
1891
1892 /* Add terminating semicolon. */
1893 after[j] = ';';
1894 j++;
1895 }
1896 else
1897 {
1898 after[j] = before[i];
1899 i++;
1900 j++;
1901 }
1902 }
1903 *lenp = j;
1904 memcpy (before, after, j);
1905 }
1906
1907 char *
1908 scm_to_locale_string (SCM str)
1909 {
1910 return scm_to_locale_stringn (str, NULL);
1911 }
1912
1913 char *
1914 scm_to_locale_stringn (SCM str, size_t *lenp)
1915 {
1916 return scm_to_stringn (str, lenp,
1917 locale_charset (),
1918 scm_i_default_port_conversion_handler ());
1919 }
1920
1921 char *
1922 scm_to_latin1_string (SCM str)
1923 {
1924 return scm_to_latin1_stringn (str, NULL);
1925 }
1926
1927 char *
1928 scm_to_latin1_stringn (SCM str, size_t *lenp)
1929 #define FUNC_NAME "scm_to_latin1_stringn"
1930 {
1931 char *result;
1932
1933 SCM_VALIDATE_STRING (1, str);
1934
1935 if (scm_i_is_narrow_string (str))
1936 {
1937 size_t len = scm_i_string_length (str);
1938
1939 if (lenp)
1940 *lenp = len;
1941
1942 result = scm_strndup (scm_i_string_data (str), len);
1943 }
1944 else
1945 result = scm_to_stringn (str, lenp, NULL,
1946 SCM_FAILED_CONVERSION_ERROR);
1947
1948 return result;
1949 }
1950 #undef FUNC_NAME
1951
1952 char *
1953 scm_to_utf8_string (SCM str)
1954 {
1955 return scm_to_utf8_stringn (str, NULL);
1956 }
1957
1958 static size_t
1959 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1960 {
1961 size_t ret, i;
1962 for (i = 0, ret = 0; i < len; i++)
1963 ret += (str[i] < 128) ? 1 : 2;
1964 return ret;
1965 }
1966
1967 static scm_t_uint8*
1968 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1969 scm_t_uint8 *u8_result, size_t *u8_lenp)
1970 {
1971 size_t i, n;
1972 size_t u8_len = latin1_u8_strlen (str, latin_len);
1973
1974 if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1975 u8_result = scm_malloc (u8_len + 1);
1976 if (u8_lenp)
1977 *u8_lenp = u8_len;
1978
1979 for (i = 0, n = 0; i < latin_len; i++)
1980 n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1981 if (n != u8_len)
1982 abort ();
1983 u8_result[n] = 0;
1984
1985 return u8_result;
1986 }
1987
1988 /* UTF-8 code table
1989
1990 (Note that this includes code points that are not allowed by Unicode,
1991 but since this function has no way to report an error, and its
1992 purpose is to determine the size of destination buffers for
1993 libunicode conversion functions, we err on the safe side and handle
1994 everything that libunicode might conceivably handle, now or in the
1995 future.)
1996
1997 Char. number range | UTF-8 octet sequence
1998 (hexadecimal) | (binary)
1999 --------------------+------------------------------------------------------
2000 0000 0000-0000 007F | 0xxxxxxx
2001 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2002 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
2003 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2004 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2005 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2006 */
2007
2008 static size_t
2009 u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
2010 {
2011 size_t ret, i;
2012
2013 for (i = 0, ret = 0; i < len; i++)
2014 {
2015 scm_t_uint32 c = str[i];
2016
2017 if (c <= 0x7f)
2018 ret += 1;
2019 else if (c <= 0x7ff)
2020 ret += 2;
2021 else if (c <= 0xffff)
2022 ret += 3;
2023 else if (c <= 0x1fffff)
2024 ret += 4;
2025 else if (c <= 0x3ffffff)
2026 ret += 5;
2027 else
2028 ret += 6;
2029 }
2030
2031 return ret;
2032 }
2033
2034 char *
2035 scm_to_utf8_stringn (SCM str, size_t *lenp)
2036 #define FUNC_NAME "scm_to_utf8_stringn"
2037 {
2038 SCM_VALIDATE_STRING (1, str);
2039
2040 if (scm_i_is_narrow_string (str))
2041 return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
2042 scm_i_string_length (str),
2043 NULL, lenp);
2044 else
2045 {
2046 scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2047 scm_t_uint8 *buf, *ret;
2048 size_t num_chars = scm_i_string_length (str);
2049 size_t num_bytes_predicted, num_bytes_actual;
2050
2051 num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2052
2053 if (lenp)
2054 {
2055 *lenp = num_bytes_predicted;
2056 buf = scm_malloc (num_bytes_predicted);
2057 }
2058 else
2059 {
2060 buf = scm_malloc (num_bytes_predicted + 1);
2061 buf[num_bytes_predicted] = 0;
2062 }
2063
2064 num_bytes_actual = num_bytes_predicted;
2065 ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
2066
2067 if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
2068 return (char *) ret;
2069
2070 /* An error: a bad codepoint. */
2071 {
2072 int saved_errno = errno;
2073
2074 free (buf);
2075 if (!saved_errno)
2076 abort ();
2077
2078 scm_decoding_error ("scm_to_utf8_stringn", errno,
2079 "invalid codepoint in string", str);
2080
2081 /* Not reached. */
2082 return NULL;
2083 }
2084 }
2085 }
2086 #undef FUNC_NAME
2087
2088 scm_t_wchar *
2089 scm_to_utf32_string (SCM str)
2090 {
2091 return scm_to_utf32_stringn (str, NULL);
2092 }
2093
2094 scm_t_wchar *
2095 scm_to_utf32_stringn (SCM str, size_t *lenp)
2096 #define FUNC_NAME "scm_to_utf32_stringn"
2097 {
2098 scm_t_wchar *result;
2099
2100 SCM_VALIDATE_STRING (1, str);
2101
2102 if (scm_i_is_narrow_string (str))
2103 {
2104 scm_t_uint8 *codepoints;
2105 size_t i, len;
2106
2107 codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2108 len = scm_i_string_length (str);
2109 if (lenp)
2110 *lenp = len;
2111
2112 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2113 for (i = 0; i < len; i++)
2114 result[i] = codepoints[i];
2115 result[len] = 0;
2116 }
2117 else
2118 {
2119 size_t len;
2120
2121 len = scm_i_string_length (str);
2122 if (lenp)
2123 *lenp = len;
2124
2125 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2126 memcpy (result, scm_i_string_wide_chars (str),
2127 len * sizeof (scm_t_wchar));
2128 result[len] = 0;
2129 }
2130
2131 return result;
2132 }
2133 #undef FUNC_NAME
2134
2135 char *
2136 scm_to_port_string (SCM str, SCM port)
2137 {
2138 return scm_to_port_stringn (str, NULL, port);
2139 }
2140
2141 char *
2142 scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
2143 {
2144 scm_t_port *pt = SCM_PTAB_ENTRY (port);
2145 scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
2146
2147 if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
2148 && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
2149 return scm_to_latin1_stringn (str, lenp);
2150 else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
2151 return scm_to_utf8_stringn (str, lenp);
2152 else
2153 return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
2154 }
2155
2156 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2157 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2158 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2159 defined by HANDLER. */
2160 char *
2161 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2162 scm_t_string_failed_conversion_handler handler)
2163 {
2164 char *buf;
2165 size_t ilen, len, i;
2166 int ret;
2167 const char *enc;
2168
2169 if (!scm_is_string (str))
2170 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2171 ilen = scm_i_string_length (str);
2172
2173 if (ilen == 0)
2174 {
2175 buf = scm_malloc (1);
2176 buf[0] = '\0';
2177 if (lenp)
2178 *lenp = 0;
2179 return buf;
2180 }
2181
2182 if (lenp == NULL)
2183 for (i = 0; i < ilen; i++)
2184 if (scm_i_string_ref (str, i) == '\0')
2185 scm_misc_error (NULL,
2186 "string contains #\\nul character: ~S",
2187 scm_list_1 (str));
2188
2189 if (scm_i_is_narrow_string (str)
2190 && c_strcasecmp (encoding, "ISO-8859-1") == 0)
2191 {
2192 /* If using native Latin-1 encoding, just copy the string
2193 contents. */
2194 if (lenp)
2195 {
2196 buf = scm_malloc (ilen);
2197 memcpy (buf, scm_i_string_chars (str), ilen);
2198 *lenp = ilen;
2199 return buf;
2200 }
2201 else
2202 {
2203 buf = scm_malloc (ilen + 1);
2204 memcpy (buf, scm_i_string_chars (str), ilen);
2205 buf[ilen] = '\0';
2206 return buf;
2207 }
2208 }
2209
2210
2211 buf = NULL;
2212 len = 0;
2213 enc = encoding;
2214 if (enc == NULL)
2215 enc = "ISO-8859-1";
2216 if (scm_i_is_narrow_string (str))
2217 {
2218 ret = mem_iconveh (scm_i_string_chars (str), ilen,
2219 "ISO-8859-1", enc,
2220 (enum iconv_ilseq_handler) handler, NULL,
2221 &buf, &len);
2222
2223 if (ret != 0)
2224 scm_encoding_error (__func__, errno,
2225 "cannot convert narrow string to output locale",
2226 SCM_BOOL_F,
2227 /* FIXME: Faulty character unknown. */
2228 SCM_BOOL_F);
2229 }
2230 else
2231 {
2232 buf = u32_conv_to_encoding (enc,
2233 (enum iconv_ilseq_handler) handler,
2234 (scm_t_uint32 *) scm_i_string_wide_chars (str),
2235 ilen,
2236 NULL,
2237 NULL, &len);
2238 if (buf == NULL)
2239 scm_encoding_error (__func__, errno,
2240 "cannot convert wide string to output locale",
2241 SCM_BOOL_F,
2242 /* FIXME: Faulty character unknown. */
2243 SCM_BOOL_F);
2244 }
2245 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2246 {
2247 if (SCM_R6RS_ESCAPES_P)
2248 {
2249 /* The worst case is if the input string contains all 4-digit
2250 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2251 (seven characters). Make BUF large enough to hold
2252 that. */
2253 buf = scm_realloc (buf, (len * 7) / 6 + 1);
2254 unistring_escapes_to_r6rs_escapes (buf, &len);
2255 }
2256 else
2257 unistring_escapes_to_guile_escapes (buf, &len);
2258
2259 buf = scm_realloc (buf, len);
2260 }
2261 if (lenp)
2262 *lenp = len;
2263 else
2264 {
2265 buf = scm_realloc (buf, len + 1);
2266 buf[len] = '\0';
2267 }
2268
2269 scm_remember_upto_here_1 (str);
2270 return buf;
2271 }
2272
2273 size_t
2274 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2275 {
2276 size_t len;
2277 char *result = NULL;
2278 if (!scm_is_string (str))
2279 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2280 result = scm_to_locale_stringn (str, &len);
2281
2282 memcpy (buf, result, (len > max_len) ? max_len : len);
2283 free (result);
2284
2285 scm_remember_upto_here_1 (str);
2286 return len;
2287 }
2288
2289 \f
2290 /* Unicode string normalization. */
2291
2292 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2293 libguile/i18n.c. It would be useful to have this factored out into a more
2294 convenient location, but its use of alloca makes that tricky to do. */
2295
2296 static SCM
2297 normalize_str (SCM string, uninorm_t form)
2298 {
2299 SCM ret;
2300 scm_t_uint32 *w_str;
2301 scm_t_wchar *cbuf;
2302 size_t rlen, len = scm_i_string_length (string);
2303
2304 if (scm_i_is_narrow_string (string))
2305 {
2306 size_t i;
2307 const char *buf = scm_i_string_chars (string);
2308
2309 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2310
2311 for (i = 0; i < len; i ++)
2312 w_str[i] = (unsigned char) buf[i];
2313 w_str[len] = 0;
2314 }
2315 else
2316 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2317
2318 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2319
2320 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2321 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2322 free (w_str);
2323
2324 scm_i_try_narrow_string (ret);
2325
2326 return ret;
2327 }
2328
2329 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2330 (SCM string),
2331 "Returns the NFC normalized form of @var{string}.")
2332 #define FUNC_NAME s_scm_string_normalize_nfc
2333 {
2334 SCM_VALIDATE_STRING (1, string);
2335 return normalize_str (string, UNINORM_NFC);
2336 }
2337 #undef FUNC_NAME
2338
2339 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2340 (SCM string),
2341 "Returns the NFD normalized form of @var{string}.")
2342 #define FUNC_NAME s_scm_string_normalize_nfd
2343 {
2344 SCM_VALIDATE_STRING (1, string);
2345 return normalize_str (string, UNINORM_NFD);
2346 }
2347 #undef FUNC_NAME
2348
2349 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2350 (SCM string),
2351 "Returns the NFKC normalized form of @var{string}.")
2352 #define FUNC_NAME s_scm_string_normalize_nfkc
2353 {
2354 SCM_VALIDATE_STRING (1, string);
2355 return normalize_str (string, UNINORM_NFKC);
2356 }
2357 #undef FUNC_NAME
2358
2359 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2360 (SCM string),
2361 "Returns the NFKD normalized form of @var{string}.")
2362 #define FUNC_NAME s_scm_string_normalize_nfkd
2363 {
2364 SCM_VALIDATE_STRING (1, string);
2365 return normalize_str (string, UNINORM_NFKD);
2366 }
2367 #undef FUNC_NAME
2368
2369 /* converts C scm_array of strings to SCM scm_list of strings.
2370 If argc < 0, a null terminated scm_array is assumed.
2371 The current locale encoding is assumed */
2372 SCM
2373 scm_makfromstrs (int argc, char **argv)
2374 {
2375 int i = argc;
2376 SCM lst = SCM_EOL;
2377 if (0 > i)
2378 for (i = 0; argv[i]; i++);
2379 while (i--)
2380 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2381 return lst;
2382 }
2383
2384 /* Return a newly allocated array of char pointers to each of the strings
2385 in args, with a terminating NULL pointer. The strings are encoded using
2386 the current locale. */
2387
2388 char **
2389 scm_i_allocate_string_pointers (SCM list)
2390 #define FUNC_NAME "scm_i_allocate_string_pointers"
2391 {
2392 char **result;
2393 int list_len = scm_ilength (list);
2394 int i;
2395
2396 if (list_len < 0)
2397 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2398
2399 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2400 "string pointers");
2401 result[list_len] = NULL;
2402
2403 /* The list might have been modified in another thread, so
2404 we check LIST before each access.
2405 */
2406 for (i = 0; i < list_len && scm_is_pair (list); i++)
2407 {
2408 SCM str = SCM_CAR (list);
2409 size_t len; /* String length in bytes */
2410 char *c_str = scm_to_locale_stringn (str, &len);
2411
2412 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2413 scm_malloc to allocate the returned string, which must be
2414 explicitly deallocated. This forces us to copy the string a
2415 second time into a new buffer. Ideally there would be variants
2416 of scm_to_*_stringn that can return garbage-collected buffers. */
2417
2418 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2419 memcpy (result[i], c_str, len);
2420 result[i][len] = '\0';
2421 free (c_str);
2422
2423 list = SCM_CDR (list);
2424 }
2425
2426 return result;
2427 }
2428 #undef FUNC_NAME
2429
2430 void
2431 scm_i_get_substring_spec (size_t len,
2432 SCM start, size_t *cstart,
2433 SCM end, size_t *cend)
2434 {
2435 if (SCM_UNBNDP (start))
2436 *cstart = 0;
2437 else
2438 *cstart = scm_to_unsigned_integer (start, 0, len);
2439
2440 if (SCM_UNBNDP (end))
2441 *cend = len;
2442 else
2443 *cend = scm_to_unsigned_integer (end, *cstart, len);
2444 }
2445
2446 static SCM
2447 string_handle_ref (scm_t_array_handle *h, size_t index)
2448 {
2449 return scm_c_string_ref (h->array, index);
2450 }
2451
2452 static void
2453 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2454 {
2455 scm_c_string_set_x (h->array, index, val);
2456 }
2457
2458 static void
2459 string_get_handle (SCM v, scm_t_array_handle *h)
2460 {
2461 h->array = v;
2462 h->ndims = 1;
2463 h->dims = &h->dim0;
2464 h->dim0.lbnd = 0;
2465 h->dim0.ubnd = scm_c_string_length (v) - 1;
2466 h->dim0.inc = 1;
2467 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2468 h->elements = h->writable_elements = NULL;
2469 }
2470
2471 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2472 string_handle_ref, string_handle_set,
2473 string_get_handle)
2474 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2475
2476 void
2477 scm_init_strings ()
2478 {
2479 scm_nullstr = scm_i_make_string (0, NULL, 0);
2480
2481 #include "libguile/strings.x"
2482 }
2483
2484
2485 /*
2486 Local Variables:
2487 c-file-style: "gnu"
2488 End:
2489 */