Merge remote-tracking branch 'origin/stable-2.0'
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <alloca.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <uninorm.h>
30 #include <unistr.h>
31 #include <uniconv.h>
32 #include <c-strcase.h>
33
34 #include "striconveh.h"
35
36 #include "libguile/_scm.h"
37 #include "libguile/chars.h"
38 #include "libguile/root.h"
39 #include "libguile/strings.h"
40 #include "libguile/ports.h"
41 #include "libguile/ports-internal.h"
42 #include "libguile/error.h"
43 #include "libguile/generalized-vectors.h"
44 #include "libguile/deprecation.h"
45 #include "libguile/validate.h"
46 #include "libguile/private-options.h"
47
48 \f
49
50 /* {Strings}
51 */
52
53
54 /* Stringbufs
55 *
56 * XXX - keeping an accurate refcount during GC seems to be quite
57 * tricky, so we just keep score of whether a stringbuf might be
58 * shared, not whether it definitely is.
59 *
60 * The scheme I (mvo) tried to keep an accurate reference count would
61 * recount all strings that point to a stringbuf during the mark-phase
62 * of the GC. This was done since one cannot access the stringbuf of
63 * a string when that string is freed (in order to decrease the
64 * reference count). The memory of the stringbuf might have been
65 * reused already for something completely different.
66 *
67 * This recounted worked for a small number of threads beating on
68 * cow-strings, but it failed randomly with more than 10 threads, say.
69 * I couldn't figure out what went wrong, so I used the conservative
70 * approach implemented below.
71 *
72 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
73 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
74 * strings.
75 */
76
77 /* The size in words of the stringbuf header (type tag + size). */
78 #define STRINGBUF_HEADER_SIZE 2U
79
80 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
81
82 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
83 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
84
85 #define STRINGBUF_TAG scm_tc7_stringbuf
86 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
87 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
88
89 #define STRINGBUF_CONTENTS(buf) ((void *) \
90 SCM_CELL_OBJECT_LOC (buf, \
91 STRINGBUF_HEADER_SIZE))
92 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
93 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
94
95 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
96
97 #define SET_STRINGBUF_SHARED(buf) \
98 do \
99 { \
100 /* Don't modify BUF if it's already marked as shared since it might be \
101 a read-only, statically allocated stringbuf. */ \
102 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
103 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
104 } \
105 while (0)
106
107 #ifdef SCM_STRING_LENGTH_HISTOGRAM
108 static size_t lenhist[1001];
109 #endif
110
111 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
112 characters. */
113 static SCM
114 make_stringbuf (size_t len)
115 {
116 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
117 scm_i_symbol_chars, all stringbufs are null-terminated. Once
118 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
119 has been changed for scm_i_symbol_chars, this null-termination
120 can be dropped.
121 */
122
123 SCM buf;
124
125 #ifdef SCM_STRING_LENGTH_HISTOGRAM
126 if (len < 1000)
127 lenhist[len]++;
128 else
129 lenhist[1000]++;
130 #endif
131
132 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
133 "string"));
134
135 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
136 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
137
138 STRINGBUF_CHARS (buf)[len] = 0;
139
140 return buf;
141 }
142
143 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
144 characters. */
145 static SCM
146 make_wide_stringbuf (size_t len)
147 {
148 SCM buf;
149 size_t raw_len;
150
151 #ifdef SCM_STRING_LENGTH_HISTOGRAM
152 if (len < 1000)
153 lenhist[len]++;
154 else
155 lenhist[1000]++;
156 #endif
157
158 raw_len = (len + 1) * sizeof (scm_t_wchar);
159 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
160 "string"));
161
162 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
163 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
164
165 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
166
167 return buf;
168 }
169
170 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
171 characters from BUF. */
172 static SCM
173 wide_stringbuf (SCM buf)
174 {
175 SCM new_buf;
176
177 if (STRINGBUF_WIDE (buf))
178 new_buf = buf;
179 else
180 {
181 size_t i, len;
182 scm_t_wchar *mem;
183
184 len = STRINGBUF_LENGTH (buf);
185
186 new_buf = make_wide_stringbuf (len);
187
188 mem = STRINGBUF_WIDE_CHARS (new_buf);
189 for (i = 0; i < len; i++)
190 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
191 mem[len] = 0;
192 }
193
194 return new_buf;
195 }
196
197 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
198 characters from BUF, if possible. */
199 static SCM
200 narrow_stringbuf (SCM buf)
201 {
202 SCM new_buf;
203
204 if (!STRINGBUF_WIDE (buf))
205 new_buf = buf;
206 else
207 {
208 size_t i, len;
209 scm_t_wchar *wmem;
210 unsigned char *mem;
211
212 len = STRINGBUF_LENGTH (buf);
213 wmem = STRINGBUF_WIDE_CHARS (buf);
214
215 for (i = 0; i < len; i++)
216 if (wmem[i] > 0xFF)
217 /* BUF cannot be narrowed. */
218 return buf;
219
220 new_buf = make_stringbuf (len);
221
222 mem = STRINGBUF_CHARS (new_buf);
223 for (i = 0; i < len; i++)
224 mem[i] = (unsigned char) wmem[i];
225 mem[len] = 0;
226 }
227
228 return new_buf;
229 }
230
231 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
232
233 \f
234 /* Copy-on-write strings.
235 */
236
237 #define STRING_TAG scm_tc7_string
238
239 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
240 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
241 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
242
243 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
244 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
245
246 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
247
248 /* Read-only strings.
249 */
250
251 #define RO_STRING_TAG scm_tc7_ro_string
252 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
253
254 /* Mutation-sharing substrings
255 */
256
257 #define SH_STRING_TAG (scm_tc7_string + 0x100)
258
259 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
260 /* START and LENGTH as for STRINGs. */
261
262 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
263
264 SCM scm_nullstr;
265
266 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
267 characters. CHARSP, if not NULL, will be set to location of the
268 char array. If READ_ONLY_P, the returned string is read-only;
269 otherwise it is writable. */
270 SCM
271 scm_i_make_string (size_t len, char **charsp, int read_only_p)
272 {
273 static SCM null_stringbuf = SCM_BOOL_F;
274 SCM buf;
275 SCM res;
276
277 if (len == 0)
278 {
279 if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
280 {
281 null_stringbuf = make_stringbuf (0);
282 SET_STRINGBUF_SHARED (null_stringbuf);
283 }
284 buf = null_stringbuf;
285 }
286 else
287 buf = make_stringbuf (len);
288
289 if (charsp)
290 *charsp = (char *) STRINGBUF_CHARS (buf);
291 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
292 SCM_UNPACK (buf),
293 (scm_t_bits) 0, (scm_t_bits) len);
294 return res;
295 }
296
297 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
298 characters. CHARSP, if not NULL, will be set to location of the
299 character array. If READ_ONLY_P, the returned string is read-only;
300 otherwise it is writable. */
301 SCM
302 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
303 {
304 SCM buf = make_wide_stringbuf (len);
305 SCM res;
306 if (charsp)
307 *charsp = STRINGBUF_WIDE_CHARS (buf);
308 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
309 SCM_UNPACK (buf),
310 (scm_t_bits) 0, (scm_t_bits) len);
311 return res;
312 }
313
314 static void
315 validate_substring_args (SCM str, size_t start, size_t end)
316 {
317 if (!IS_STRING (str))
318 scm_wrong_type_arg_msg (NULL, 0, str, "string");
319 if (start > STRING_LENGTH (str))
320 scm_out_of_range (NULL, scm_from_size_t (start));
321 if (end > STRING_LENGTH (str) || end < start)
322 scm_out_of_range (NULL, scm_from_size_t (end));
323 }
324
325 static inline void
326 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
327 {
328 *start = STRING_START (*str);
329 if (IS_SH_STRING (*str))
330 {
331 *str = SH_STRING_STRING (*str);
332 *start += STRING_START (*str);
333 }
334 *buf = STRING_STRINGBUF (*str);
335 }
336
337 SCM
338 scm_i_substring (SCM str, size_t start, size_t end)
339 {
340 if (start == end)
341 return scm_i_make_string (0, NULL, 0);
342 else
343 {
344 SCM buf;
345 size_t str_start;
346 get_str_buf_start (&str, &buf, &str_start);
347 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
348 SET_STRINGBUF_SHARED (buf);
349 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
350 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
351 (scm_t_bits)str_start + start,
352 (scm_t_bits) end - start);
353 }
354 }
355
356 SCM
357 scm_i_substring_read_only (SCM str, size_t start, size_t end)
358 {
359 if (start == end)
360 return scm_i_make_string (0, NULL, 1);
361 else
362 {
363 SCM buf;
364 size_t str_start;
365 get_str_buf_start (&str, &buf, &str_start);
366 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
367 SET_STRINGBUF_SHARED (buf);
368 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
369 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
370 (scm_t_bits)str_start + start,
371 (scm_t_bits) end - start);
372 }
373 }
374
375 SCM
376 scm_i_substring_copy (SCM str, size_t start, size_t end)
377 {
378 if (start == end)
379 return scm_i_make_string (0, NULL, 0);
380 else
381 {
382 size_t len = end - start;
383 SCM buf, my_buf, substr;
384 size_t str_start;
385 int wide = 0;
386 get_str_buf_start (&str, &buf, &str_start);
387 if (scm_i_is_narrow_string (str))
388 {
389 my_buf = make_stringbuf (len);
390 memcpy (STRINGBUF_CHARS (my_buf),
391 STRINGBUF_CHARS (buf) + str_start + start, len);
392 }
393 else
394 {
395 my_buf = make_wide_stringbuf (len);
396 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
397 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
398 + start), len);
399 wide = 1;
400 }
401 scm_remember_upto_here_1 (buf);
402 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
403 (scm_t_bits) 0, (scm_t_bits) len);
404 if (wide)
405 scm_i_try_narrow_string (substr);
406 return substr;
407 }
408 }
409
410 SCM
411 scm_i_substring_shared (SCM str, size_t start, size_t end)
412 {
413 if (start == 0 && end == STRING_LENGTH (str))
414 return str;
415 else if (start == end)
416 return scm_i_make_string (0, NULL, 0);
417 else
418 {
419 size_t len = end - start;
420 if (IS_SH_STRING (str))
421 {
422 start += STRING_START (str);
423 str = SH_STRING_STRING (str);
424 }
425 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
426 (scm_t_bits)start, (scm_t_bits) len);
427 }
428 }
429
430 SCM
431 scm_c_substring (SCM str, size_t start, size_t end)
432 {
433 validate_substring_args (str, start, end);
434 return scm_i_substring (str, start, end);
435 }
436
437 SCM
438 scm_c_substring_read_only (SCM str, size_t start, size_t end)
439 {
440 validate_substring_args (str, start, end);
441 return scm_i_substring_read_only (str, start, end);
442 }
443
444 SCM
445 scm_c_substring_copy (SCM str, size_t start, size_t end)
446 {
447 validate_substring_args (str, start, end);
448 return scm_i_substring_copy (str, start, end);
449 }
450
451 SCM
452 scm_c_substring_shared (SCM str, size_t start, size_t end)
453 {
454 validate_substring_args (str, start, end);
455 return scm_i_substring_shared (str, start, end);
456 }
457
458 \f
459 /* Internal accessors
460 */
461
462 /* Returns the number of characters in STR. This may be different
463 than the memory size of the string storage. */
464 size_t
465 scm_i_string_length (SCM str)
466 {
467 return STRING_LENGTH (str);
468 }
469
470 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
471 encoding. False if it is 'wide', having a 32-bit UCS-4
472 encoding. */
473 int
474 scm_i_is_narrow_string (SCM str)
475 {
476 if (IS_SH_STRING (str))
477 str = SH_STRING_STRING (str);
478
479 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
480 }
481
482 /* Try to coerce a string to be narrow. It if is narrow already, do
483 nothing. If it is wide, shrink it to narrow if none of its
484 characters are above 0xFF. Return true if the string is narrow or
485 was made to be narrow. */
486 int
487 scm_i_try_narrow_string (SCM str)
488 {
489 if (IS_SH_STRING (str))
490 str = SH_STRING_STRING (str);
491
492 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
493
494 return scm_i_is_narrow_string (str);
495 }
496
497 /* Return a pointer to the raw data of the string, which can be either Latin-1
498 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
499 const void *
500 scm_i_string_data (SCM str)
501 {
502 SCM buf;
503 size_t start;
504 const char *data;
505
506 get_str_buf_start (&str, &buf, &start);
507
508 data = STRINGBUF_CONTENTS (buf);
509 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
510
511 return data;
512 }
513
514 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
515 STR. */
516 const char *
517 scm_i_string_chars (SCM str)
518 {
519 SCM buf;
520 size_t start;
521 get_str_buf_start (&str, &buf, &start);
522 if (scm_i_is_narrow_string (str))
523 return (const char *) STRINGBUF_CHARS (buf) + start;
524 else
525 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
526 scm_list_1 (str));
527 return NULL;
528 }
529
530 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
531 STR. */
532 const scm_t_wchar *
533 scm_i_string_wide_chars (SCM str)
534 {
535 SCM buf;
536 size_t start;
537
538 get_str_buf_start (&str, &buf, &start);
539 if (!scm_i_is_narrow_string (str))
540 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
541 else
542 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
543 scm_list_1 (str));
544 }
545
546 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
547 a new string buffer, so that it can be modified without modifying
548 other strings. Also, lock the string mutex. Later, one must call
549 scm_i_string_stop_writing to unlock the mutex. */
550 SCM
551 scm_i_string_start_writing (SCM orig_str)
552 {
553 SCM buf, str = orig_str;
554 size_t start;
555
556 get_str_buf_start (&str, &buf, &start);
557 if (IS_RO_STRING (str))
558 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
559
560 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
561 if (STRINGBUF_SHARED (buf))
562 {
563 /* Clone the stringbuf. */
564 size_t len = STRING_LENGTH (str);
565 SCM new_buf;
566
567 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
568
569 if (scm_i_is_narrow_string (str))
570 {
571 new_buf = make_stringbuf (len);
572 memcpy (STRINGBUF_CHARS (new_buf),
573 STRINGBUF_CHARS (buf) + STRING_START (str), len);
574
575 }
576 else
577 {
578 new_buf = make_wide_stringbuf (len);
579 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
580 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
581 + STRING_START (str)), len);
582 }
583
584 SET_STRING_STRINGBUF (str, new_buf);
585 start -= STRING_START (str);
586
587 /* FIXME: The following operations are not atomic, so other threads
588 looking at STR may see an inconsistent state. Nevertheless it can't
589 hurt much since (i) accessing STR while it is being mutated can't
590 yield a crash, and (ii) concurrent accesses to STR should be
591 protected by a mutex at the application level. The latter may not
592 apply when STR != ORIG_STR, though. */
593 SET_STRING_START (str, 0);
594 SET_STRING_STRINGBUF (str, new_buf);
595
596 buf = new_buf;
597
598 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
599 }
600 return orig_str;
601 }
602
603 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
604 char *
605 scm_i_string_writable_chars (SCM str)
606 {
607 SCM buf;
608 size_t start;
609
610 get_str_buf_start (&str, &buf, &start);
611 if (scm_i_is_narrow_string (str))
612 return (char *) STRINGBUF_CHARS (buf) + start;
613 else
614 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
615 scm_list_1 (str));
616 return NULL;
617 }
618
619 /* Return a pointer to the UCS-4 codepoints of a string. */
620 static scm_t_wchar *
621 scm_i_string_writable_wide_chars (SCM str)
622 {
623 SCM buf;
624 size_t start;
625
626 get_str_buf_start (&str, &buf, &start);
627 if (!scm_i_is_narrow_string (str))
628 return STRINGBUF_WIDE_CHARS (buf) + start;
629 else
630 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
631 scm_list_1 (str));
632 }
633
634 /* Unlock the string mutex that was locked when
635 scm_i_string_start_writing was called. */
636 void
637 scm_i_string_stop_writing (void)
638 {
639 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
640 }
641
642 /* Return the Xth character of STR as a UCS-4 codepoint. */
643 scm_t_wchar
644 scm_i_string_ref (SCM str, size_t x)
645 {
646 if (scm_i_is_narrow_string (str))
647 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
648 else
649 return scm_i_string_wide_chars (str)[x];
650 }
651
652 /* Returns index+1 of the first char in STR that matches C, or
653 0 if the char is not found. */
654 int
655 scm_i_string_contains_char (SCM str, char ch)
656 {
657 size_t i;
658 size_t len = scm_i_string_length (str);
659
660 i = 0;
661 if (scm_i_is_narrow_string (str))
662 {
663 while (i < len)
664 {
665 if (scm_i_string_chars (str)[i] == ch)
666 return i+1;
667 i++;
668 }
669 }
670 else
671 {
672 while (i < len)
673 {
674 if (scm_i_string_wide_chars (str)[i]
675 == (unsigned char) ch)
676 return i+1;
677 i++;
678 }
679 }
680 return 0;
681 }
682
683 int
684 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
685 {
686 if (scm_i_is_narrow_string (sstr))
687 {
688 const char *a = scm_i_string_chars (sstr) + start_x;
689 const char *b = cstr;
690 return strncmp (a, b, strlen(b));
691 }
692 else
693 {
694 size_t i;
695 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
696 const char *b = cstr;
697 for (i = 0; i < strlen (b); i++)
698 {
699 if (a[i] != (unsigned char) b[i])
700 return 1;
701 }
702 }
703 return 0;
704 }
705
706 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
707 void
708 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
709 {
710 if (IS_SH_STRING (str))
711 {
712 p += STRING_START (str);
713 str = SH_STRING_STRING (str);
714 }
715
716 if (chr > 0xFF && scm_i_is_narrow_string (str))
717 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
718
719 if (scm_i_is_narrow_string (str))
720 {
721 char *dst = scm_i_string_writable_chars (str);
722 dst[p] = chr;
723 }
724 else
725 {
726 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
727 dst[p] = chr;
728 }
729 }
730
731 \f
732 /* Symbols.
733
734 Basic symbol creation and accessing is done here, the rest is in
735 symbols.[hc]. This has been done to keep stringbufs and the
736 internals of strings and string-like objects confined to this file.
737 */
738
739 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
740
741 SCM
742 scm_i_make_symbol (SCM name, scm_t_bits flags,
743 unsigned long hash, SCM props)
744 {
745 SCM buf;
746 size_t start = STRING_START (name);
747 size_t length = STRING_LENGTH (name);
748
749 if (IS_SH_STRING (name))
750 {
751 name = SH_STRING_STRING (name);
752 start += STRING_START (name);
753 }
754 buf = STRING_STRINGBUF (name);
755
756 if (start == 0 && length == STRINGBUF_LENGTH (buf))
757 {
758 /* reuse buf. */
759 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
760 SET_STRINGBUF_SHARED (buf);
761 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
762 }
763 else
764 {
765 /* make new buf. */
766 if (scm_i_is_narrow_string (name))
767 {
768 SCM new_buf = make_stringbuf (length);
769 memcpy (STRINGBUF_CHARS (new_buf),
770 STRINGBUF_CHARS (buf) + start, length);
771 buf = new_buf;
772 }
773 else
774 {
775 SCM new_buf = make_wide_stringbuf (length);
776 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
777 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
778 length);
779 buf = new_buf;
780 }
781 }
782 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
783 (scm_t_bits) hash, SCM_UNPACK (props));
784 }
785
786 SCM
787 scm_i_c_make_symbol (const char *name, size_t len,
788 scm_t_bits flags, unsigned long hash, SCM props)
789 {
790 SCM buf = make_stringbuf (len);
791 memcpy (STRINGBUF_CHARS (buf), name, len);
792
793 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
794 (scm_t_bits) hash, SCM_UNPACK (props));
795 }
796
797 /* Returns the number of characters in SYM. This may be different
798 from the memory size of SYM. */
799 size_t
800 scm_i_symbol_length (SCM sym)
801 {
802 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
803 }
804
805 size_t
806 scm_c_symbol_length (SCM sym)
807 #define FUNC_NAME "scm_c_symbol_length"
808 {
809 SCM_VALIDATE_SYMBOL (1, sym);
810
811 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
812 }
813 #undef FUNC_NAME
814
815 /* True if the name of SYM is stored as a Latin-1 encoded string.
816 False if it is stored as a 32-bit UCS-4-encoded string. */
817 int
818 scm_i_is_narrow_symbol (SCM sym)
819 {
820 SCM buf;
821
822 buf = SYMBOL_STRINGBUF (sym);
823 return !STRINGBUF_WIDE (buf);
824 }
825
826 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
827 contains the name of SYM. */
828 const char *
829 scm_i_symbol_chars (SCM sym)
830 {
831 SCM buf;
832
833 buf = SYMBOL_STRINGBUF (sym);
834 if (!STRINGBUF_WIDE (buf))
835 return (const char *) STRINGBUF_CHARS (buf);
836 else
837 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
838 scm_list_1 (sym));
839 }
840
841 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
842 symbol's name. */
843 const scm_t_wchar *
844 scm_i_symbol_wide_chars (SCM sym)
845 {
846 SCM buf;
847
848 buf = SYMBOL_STRINGBUF (sym);
849 if (STRINGBUF_WIDE (buf))
850 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
851 else
852 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
853 scm_list_1 (sym));
854 }
855
856 SCM
857 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
858 {
859 SCM buf = SYMBOL_STRINGBUF (sym);
860 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
861 SET_STRINGBUF_SHARED (buf);
862 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
863 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
864 (scm_t_bits)start, (scm_t_bits) end - start);
865 }
866
867 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
868 scm_t_wchar
869 scm_i_symbol_ref (SCM sym, size_t x)
870 {
871 if (scm_i_is_narrow_symbol (sym))
872 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
873 else
874 return scm_i_symbol_wide_chars (sym)[x];
875 }
876
877 /* Debugging
878 */
879
880 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
881 "Returns an association list containing debugging information\n"
882 "for @var{str}. The association list has the following entries."
883 "@table @code\n"
884 "@item string\n"
885 "The string itself.\n"
886 "@item start\n"
887 "The start index of the string into its stringbuf\n"
888 "@item length\n"
889 "The length of the string\n"
890 "@item shared\n"
891 "If this string is a substring, it returns its parent string.\n"
892 "Otherwise, it returns @code{#f}\n"
893 "@item read-only\n"
894 "@code{#t} if the string is read-only\n"
895 "@item stringbuf-chars\n"
896 "A new string containing this string's stringbuf's characters\n"
897 "@item stringbuf-length\n"
898 "The number of characters in this stringbuf\n"
899 "@item stringbuf-shared\n"
900 "@code{#t} if this stringbuf is shared\n"
901 "@item stringbuf-wide\n"
902 "@code{#t} if this stringbuf's characters are stored in a\n"
903 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
904 "buffer\n"
905 "@end table")
906 #define FUNC_NAME s_scm_sys_string_dump
907 {
908 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
909 SCM buf;
910 SCM_VALIDATE_STRING (1, str);
911
912 /* String info */
913 e1 = scm_cons (scm_from_latin1_symbol ("string"),
914 str);
915 e2 = scm_cons (scm_from_latin1_symbol ("start"),
916 scm_from_size_t (STRING_START (str)));
917 e3 = scm_cons (scm_from_latin1_symbol ("length"),
918 scm_from_size_t (STRING_LENGTH (str)));
919
920 if (IS_SH_STRING (str))
921 {
922 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
923 SH_STRING_STRING (str));
924 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
925 }
926 else
927 {
928 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
929 SCM_BOOL_F);
930 buf = STRING_STRINGBUF (str);
931 }
932
933 if (IS_RO_STRING (str))
934 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
935 SCM_BOOL_T);
936 else
937 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
938 SCM_BOOL_F);
939
940 /* Stringbuf info */
941 if (!STRINGBUF_WIDE (buf))
942 {
943 size_t len = STRINGBUF_LENGTH (buf);
944 char *cbuf;
945 SCM sbc = scm_i_make_string (len, &cbuf, 0);
946 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
947 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
948 sbc);
949 }
950 else
951 {
952 size_t len = STRINGBUF_LENGTH (buf);
953 scm_t_wchar *cbuf;
954 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
955 u32_cpy ((scm_t_uint32 *) cbuf,
956 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
957 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
958 sbc);
959 }
960 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
961 scm_from_size_t (STRINGBUF_LENGTH (buf)));
962 if (STRINGBUF_SHARED (buf))
963 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
964 SCM_BOOL_T);
965 else
966 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
967 SCM_BOOL_F);
968 if (STRINGBUF_WIDE (buf))
969 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
970 SCM_BOOL_T);
971 else
972 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
973 SCM_BOOL_F);
974
975 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
976 }
977 #undef FUNC_NAME
978
979 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
980 "Returns an association list containing debugging information\n"
981 "for @var{sym}. The association list has the following entries."
982 "@table @code\n"
983 "@item symbol\n"
984 "The symbol itself\n"
985 "@item hash\n"
986 "Its hash value\n"
987 "@item interned\n"
988 "@code{#t} if it is an interned symbol\n"
989 "@item stringbuf-chars\n"
990 "A new string containing this symbols's stringbuf's characters\n"
991 "@item stringbuf-length\n"
992 "The number of characters in this stringbuf\n"
993 "@item stringbuf-shared\n"
994 "@code{#t} if this stringbuf is shared\n"
995 "@item stringbuf-wide\n"
996 "@code{#t} if this stringbuf's characters are stored in a\n"
997 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
998 "buffer\n"
999 "@end table")
1000 #define FUNC_NAME s_scm_sys_symbol_dump
1001 {
1002 SCM e1, e2, e3, e4, e5, e6, e7;
1003 SCM buf;
1004 SCM_VALIDATE_SYMBOL (1, sym);
1005 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1006 sym);
1007 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1008 scm_from_ulong (scm_i_symbol_hash (sym)));
1009 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1010 scm_symbol_interned_p (sym));
1011 buf = SYMBOL_STRINGBUF (sym);
1012
1013 /* Stringbuf info */
1014 if (!STRINGBUF_WIDE (buf))
1015 {
1016 size_t len = STRINGBUF_LENGTH (buf);
1017 char *cbuf;
1018 SCM sbc = scm_i_make_string (len, &cbuf, 0);
1019 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1020 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1021 sbc);
1022 }
1023 else
1024 {
1025 size_t len = STRINGBUF_LENGTH (buf);
1026 scm_t_wchar *cbuf;
1027 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1028 u32_cpy ((scm_t_uint32 *) cbuf,
1029 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1030 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1031 sbc);
1032 }
1033 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1034 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1035 if (STRINGBUF_SHARED (buf))
1036 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1037 SCM_BOOL_T);
1038 else
1039 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1040 SCM_BOOL_F);
1041 if (STRINGBUF_WIDE (buf))
1042 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1043 SCM_BOOL_T);
1044 else
1045 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1046 SCM_BOOL_F);
1047 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1048
1049 }
1050 #undef FUNC_NAME
1051
1052 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1053
1054 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1055 #define FUNC_NAME s_scm_sys_stringbuf_hist
1056 {
1057 int i;
1058 for (i = 0; i < 1000; i++)
1059 if (lenhist[i])
1060 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1061 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1062 return SCM_UNSPECIFIED;
1063 }
1064 #undef FUNC_NAME
1065
1066 #endif
1067
1068 \f
1069
1070 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1071 (SCM obj),
1072 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1073 #define FUNC_NAME s_scm_string_p
1074 {
1075 return scm_from_bool (IS_STRING (obj));
1076 }
1077 #undef FUNC_NAME
1078
1079
1080 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1081
1082 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1083 (SCM chrs),
1084 "@deffnx {Scheme Procedure} list->string chrs\n"
1085 "Return a newly allocated string composed of the arguments,\n"
1086 "@var{chrs}.")
1087 #define FUNC_NAME s_scm_string
1088 {
1089 SCM result = SCM_BOOL_F;
1090 SCM rest;
1091 size_t len;
1092 size_t p = 0;
1093 long i;
1094 int wide = 0;
1095
1096 /* Verify that this is a list of chars. */
1097 i = scm_ilength (chrs);
1098 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1099
1100 len = (size_t) i;
1101 rest = chrs;
1102
1103 while (len > 0 && scm_is_pair (rest))
1104 {
1105 SCM elt = SCM_CAR (rest);
1106 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1107 if (SCM_CHAR (elt) > 0xFF)
1108 wide = 1;
1109 rest = SCM_CDR (rest);
1110 len--;
1111 scm_remember_upto_here_1 (elt);
1112 }
1113
1114 /* Construct a string containing this list of chars. */
1115 len = (size_t) i;
1116 rest = chrs;
1117
1118 if (wide == 0)
1119 {
1120 char *buf;
1121
1122 result = scm_i_make_string (len, NULL, 0);
1123 result = scm_i_string_start_writing (result);
1124 buf = scm_i_string_writable_chars (result);
1125 while (len > 0 && scm_is_pair (rest))
1126 {
1127 SCM elt = SCM_CAR (rest);
1128 buf[p] = (unsigned char) SCM_CHAR (elt);
1129 p++;
1130 rest = SCM_CDR (rest);
1131 len--;
1132 scm_remember_upto_here_1 (elt);
1133 }
1134 }
1135 else
1136 {
1137 scm_t_wchar *buf;
1138
1139 result = scm_i_make_wide_string (len, NULL, 0);
1140 result = scm_i_string_start_writing (result);
1141 buf = scm_i_string_writable_wide_chars (result);
1142 while (len > 0 && scm_is_pair (rest))
1143 {
1144 SCM elt = SCM_CAR (rest);
1145 buf[p] = SCM_CHAR (elt);
1146 p++;
1147 rest = SCM_CDR (rest);
1148 len--;
1149 scm_remember_upto_here_1 (elt);
1150 }
1151 }
1152 scm_i_string_stop_writing ();
1153
1154 if (len > 0)
1155 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1156 if (!scm_is_null (rest))
1157 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1158
1159 return result;
1160 }
1161 #undef FUNC_NAME
1162
1163 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1164 (SCM k, SCM chr),
1165 "Return a newly allocated string of\n"
1166 "length @var{k}. If @var{chr} is given, then all elements of\n"
1167 "the string are initialized to @var{chr}, otherwise the contents\n"
1168 "of the string are all set to @code{#\nul}.")
1169 #define FUNC_NAME s_scm_make_string
1170 {
1171 return scm_c_make_string (scm_to_size_t (k), chr);
1172 }
1173 #undef FUNC_NAME
1174
1175 SCM
1176 scm_c_make_string (size_t len, SCM chr)
1177 #define FUNC_NAME NULL
1178 {
1179 size_t p;
1180 char *contents = NULL;
1181 SCM res = scm_i_make_string (len, &contents, 0);
1182
1183 /* If no char is given, initialize string contents to NULL. */
1184 if (SCM_UNBNDP (chr))
1185 memset (contents, 0, len);
1186 else
1187 {
1188 SCM_VALIDATE_CHAR (0, chr);
1189 res = scm_i_string_start_writing (res);
1190 for (p = 0; p < len; p++)
1191 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1192 scm_i_string_stop_writing ();
1193 }
1194
1195 return res;
1196 }
1197 #undef FUNC_NAME
1198
1199 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1200 (SCM string),
1201 "Return the number of characters in @var{string}.")
1202 #define FUNC_NAME s_scm_string_length
1203 {
1204 SCM_VALIDATE_STRING (1, string);
1205 return scm_from_size_t (STRING_LENGTH (string));
1206 }
1207 #undef FUNC_NAME
1208
1209 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1210 (SCM string),
1211 "Return the bytes used to represent a character in @var{string}."
1212 "This will return 1 or 4.")
1213 #define FUNC_NAME s_scm_string_bytes_per_char
1214 {
1215 SCM_VALIDATE_STRING (1, string);
1216 if (!scm_i_is_narrow_string (string))
1217 return scm_from_int (4);
1218
1219 return scm_from_int (1);
1220 }
1221 #undef FUNC_NAME
1222
1223 size_t
1224 scm_c_string_length (SCM string)
1225 {
1226 if (!IS_STRING (string))
1227 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1228 return STRING_LENGTH (string);
1229 }
1230
1231 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1232 (SCM str, SCM k),
1233 "Return character @var{k} of @var{str} using zero-origin\n"
1234 "indexing. @var{k} must be a valid index of @var{str}.")
1235 #define FUNC_NAME s_scm_string_ref
1236 {
1237 size_t len;
1238 unsigned long idx;
1239
1240 SCM_VALIDATE_STRING (1, str);
1241
1242 len = scm_i_string_length (str);
1243 if (SCM_LIKELY (len > 0))
1244 idx = scm_to_unsigned_integer (k, 0, len - 1);
1245 else
1246 scm_out_of_range (NULL, k);
1247
1248 if (scm_i_is_narrow_string (str))
1249 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1250 else
1251 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1252 }
1253 #undef FUNC_NAME
1254
1255 SCM
1256 scm_c_string_ref (SCM str, size_t p)
1257 {
1258 if (p >= scm_i_string_length (str))
1259 scm_out_of_range (NULL, scm_from_size_t (p));
1260 if (scm_i_is_narrow_string (str))
1261 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1262 else
1263 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1264
1265 }
1266
1267 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1268 (SCM str, SCM k, SCM chr),
1269 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1270 "an unspecified value. @var{k} must be a valid index of\n"
1271 "@var{str}.")
1272 #define FUNC_NAME s_scm_string_set_x
1273 {
1274 size_t len;
1275 unsigned long idx;
1276
1277 SCM_VALIDATE_STRING (1, str);
1278
1279 len = scm_i_string_length (str);
1280 if (SCM_LIKELY (len > 0))
1281 idx = scm_to_unsigned_integer (k, 0, len - 1);
1282 else
1283 scm_out_of_range (NULL, k);
1284
1285 SCM_VALIDATE_CHAR (3, chr);
1286 str = scm_i_string_start_writing (str);
1287 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1288 scm_i_string_stop_writing ();
1289
1290 return SCM_UNSPECIFIED;
1291 }
1292 #undef FUNC_NAME
1293
1294 void
1295 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1296 {
1297 if (p >= scm_i_string_length (str))
1298 scm_out_of_range (NULL, scm_from_size_t (p));
1299 str = scm_i_string_start_writing (str);
1300 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1301 scm_i_string_stop_writing ();
1302 }
1303
1304 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1305 (SCM str, SCM start, SCM end),
1306 "Return a newly allocated string formed from the characters\n"
1307 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1308 "ending with index @var{end} (exclusive).\n"
1309 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1310 "exact integers satisfying:\n\n"
1311 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1312 #define FUNC_NAME s_scm_substring
1313 {
1314 size_t len, from, to;
1315
1316 SCM_VALIDATE_STRING (1, str);
1317 len = scm_i_string_length (str);
1318 from = scm_to_unsigned_integer (start, 0, len);
1319 if (SCM_UNBNDP (end))
1320 to = len;
1321 else
1322 to = scm_to_unsigned_integer (end, from, len);
1323 return scm_i_substring (str, from, to);
1324 }
1325 #undef FUNC_NAME
1326
1327 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1328 (SCM str, SCM start, SCM end),
1329 "Return a newly allocated string formed from the characters\n"
1330 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1331 "ending with index @var{end} (exclusive).\n"
1332 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1333 "exact integers satisfying:\n"
1334 "\n"
1335 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1336 "\n"
1337 "The returned string is read-only.\n")
1338 #define FUNC_NAME s_scm_substring_read_only
1339 {
1340 size_t len, from, to;
1341
1342 SCM_VALIDATE_STRING (1, str);
1343 len = scm_i_string_length (str);
1344 from = scm_to_unsigned_integer (start, 0, len);
1345 if (SCM_UNBNDP (end))
1346 to = len;
1347 else
1348 to = scm_to_unsigned_integer (end, from, len);
1349 return scm_i_substring_read_only (str, from, to);
1350 }
1351 #undef FUNC_NAME
1352
1353 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1354 (SCM str, SCM start, SCM end),
1355 "Return a newly allocated string formed from the characters\n"
1356 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1357 "ending with index @var{end} (exclusive).\n"
1358 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1359 "exact integers satisfying:\n\n"
1360 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1361 #define FUNC_NAME s_scm_substring_copy
1362 {
1363 /* For the Scheme version, START is mandatory, but for the C
1364 version, it is optional. See scm_string_copy in srfi-13.c for a
1365 rationale.
1366 */
1367
1368 size_t from, to;
1369
1370 SCM_VALIDATE_STRING (1, str);
1371 scm_i_get_substring_spec (scm_i_string_length (str),
1372 start, &from, end, &to);
1373 return scm_i_substring_copy (str, from, to);
1374 }
1375 #undef FUNC_NAME
1376
1377 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1378 (SCM str, SCM start, SCM end),
1379 "Return string that indirectly refers to the characters\n"
1380 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1381 "ending with index @var{end} (exclusive).\n"
1382 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1383 "exact integers satisfying:\n\n"
1384 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1385 #define FUNC_NAME s_scm_substring_shared
1386 {
1387 size_t len, from, to;
1388
1389 SCM_VALIDATE_STRING (1, str);
1390 len = scm_i_string_length (str);
1391 from = scm_to_unsigned_integer (start, 0, len);
1392 if (SCM_UNBNDP (end))
1393 to = len;
1394 else
1395 to = scm_to_unsigned_integer (end, from, len);
1396 return scm_i_substring_shared (str, from, to);
1397 }
1398 #undef FUNC_NAME
1399
1400 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1401 (SCM args),
1402 "Return a newly allocated string whose characters form the\n"
1403 "concatenation of the given strings, @var{args}.")
1404 #define FUNC_NAME s_scm_string_append
1405 {
1406 SCM res;
1407 size_t total = 0;
1408 size_t len;
1409 int wide = 0;
1410 SCM l, s;
1411 size_t i;
1412 union
1413 {
1414 char *narrow;
1415 scm_t_wchar *wide;
1416 } data;
1417
1418 SCM_VALIDATE_REST_ARGUMENT (args);
1419 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1420 {
1421 s = SCM_CAR (l);
1422 SCM_VALIDATE_STRING (SCM_ARGn, s);
1423 len = scm_i_string_length (s);
1424 if (((size_t) -1) - total < len)
1425 scm_num_overflow (s_scm_string_append);
1426 total += len;
1427 if (!scm_i_is_narrow_string (s))
1428 wide = 1;
1429 }
1430 data.narrow = NULL;
1431 if (!wide)
1432 res = scm_i_make_string (total, &data.narrow, 0);
1433 else
1434 res = scm_i_make_wide_string (total, &data.wide, 0);
1435
1436 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1437 {
1438 size_t len;
1439 s = SCM_CAR (l);
1440 SCM_VALIDATE_STRING (SCM_ARGn, s);
1441 len = scm_i_string_length (s);
1442 if (len > total)
1443 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1444 if (!wide)
1445 {
1446 memcpy (data.narrow, scm_i_string_chars (s), len);
1447 data.narrow += len;
1448 }
1449 else
1450 {
1451 if (scm_i_is_narrow_string (s))
1452 {
1453 const char *src = scm_i_string_chars (s);
1454 for (i = 0; i < len; i++)
1455 data.wide[i] = (unsigned char) src[i];
1456 }
1457 else
1458 u32_cpy ((scm_t_uint32 *) data.wide,
1459 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1460 data.wide += len;
1461 }
1462 total -= len;
1463 scm_remember_upto_here_1 (s);
1464 }
1465 if (total != 0)
1466 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1467 return res;
1468 }
1469 #undef FUNC_NAME
1470
1471
1472 \f
1473 /* Charset conversion error handling. */
1474
1475 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1476 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1477
1478 /* Raise an exception informing that character CHR could not be written
1479 to PORT in its current encoding. */
1480 void
1481 scm_encoding_error (const char *subr, int err, const char *message,
1482 SCM port, SCM chr)
1483 {
1484 scm_throw (scm_encoding_error_key,
1485 scm_list_n (scm_from_latin1_string (subr),
1486 scm_from_latin1_string (message),
1487 scm_from_int (err),
1488 port, chr,
1489 SCM_UNDEFINED));
1490 }
1491
1492 /* Raise an exception informing of an encoding error on PORT. This
1493 means that a character could not be written in PORT's encoding. */
1494 void
1495 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1496 {
1497 scm_throw (scm_decoding_error_key,
1498 scm_list_n (scm_from_latin1_string (subr),
1499 scm_from_latin1_string (message),
1500 scm_from_int (err),
1501 port,
1502 SCM_UNDEFINED));
1503 }
1504
1505 \f
1506 /* String conversion to/from C. */
1507
1508 static void
1509 decoding_error (const char *func_name, int errno_save,
1510 const char *str, size_t len)
1511 {
1512 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1513 handler. */
1514 SCM bv;
1515 signed char *buf;
1516
1517 buf = scm_gc_malloc_pointerless (len, "bytevector");
1518 memcpy (buf, str, len);
1519 bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1520
1521 scm_decoding_error (func_name, errno_save,
1522 "input locale conversion error", bv);
1523 }
1524
1525 SCM
1526 scm_from_stringn (const char *str, size_t len, const char *encoding,
1527 scm_t_string_failed_conversion_handler handler)
1528 {
1529 size_t u32len, i;
1530 scm_t_wchar *u32;
1531 int wide = 0;
1532 SCM res;
1533
1534 /* The order of these checks is important. */
1535 if (!str && len != 0)
1536 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1537 if (len == (size_t) -1)
1538 len = strlen (str);
1539
1540 if (c_strcasecmp (encoding, "ISO-8859-1") == 0 || len == 0)
1541 return scm_from_latin1_stringn (str, len);
1542 else if (c_strcasecmp (encoding, "UTF-8") == 0
1543 && handler == SCM_FAILED_CONVERSION_ERROR)
1544 return scm_from_utf8_stringn (str, len);
1545
1546 u32len = 0;
1547 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1548 (enum iconv_ilseq_handler)
1549 handler,
1550 str, len,
1551 NULL,
1552 NULL, &u32len);
1553
1554 if (SCM_UNLIKELY (u32 == NULL))
1555 decoding_error (__func__, errno, str, len);
1556
1557 i = 0;
1558 while (i < u32len)
1559 if (u32[i++] > 0xFF)
1560 {
1561 wide = 1;
1562 break;
1563 }
1564
1565 if (!wide)
1566 {
1567 char *dst;
1568 res = scm_i_make_string (u32len, &dst, 0);
1569 for (i = 0; i < u32len; i ++)
1570 dst[i] = (unsigned char) u32[i];
1571 dst[u32len] = '\0';
1572 }
1573 else
1574 {
1575 scm_t_wchar *wdst;
1576 res = scm_i_make_wide_string (u32len, &wdst, 0);
1577 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1578 wdst[u32len] = 0;
1579 }
1580
1581 free (u32);
1582 return res;
1583 }
1584
1585 SCM
1586 scm_from_locale_string (const char *str)
1587 {
1588 return scm_from_locale_stringn (str, -1);
1589 }
1590
1591 SCM
1592 scm_from_locale_stringn (const char *str, size_t len)
1593 {
1594 return scm_from_stringn (str, len, locale_charset (),
1595 scm_i_default_port_conversion_handler ());
1596 }
1597
1598 SCM
1599 scm_from_latin1_string (const char *str)
1600 {
1601 return scm_from_latin1_stringn (str, -1);
1602 }
1603
1604 SCM
1605 scm_from_latin1_stringn (const char *str, size_t len)
1606 {
1607 char *buf;
1608 SCM result;
1609
1610 if (len == (size_t) -1)
1611 len = strlen (str);
1612
1613 /* Make a narrow string and copy STR as is. */
1614 result = scm_i_make_string (len, &buf, 0);
1615 memcpy (buf, str, len);
1616
1617 return result;
1618 }
1619
1620 SCM
1621 scm_from_utf8_string (const char *str)
1622 {
1623 return scm_from_utf8_stringn (str, -1);
1624 }
1625
1626 SCM
1627 scm_from_utf8_stringn (const char *str, size_t len)
1628 {
1629 size_t i, char_len;
1630 const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1631 int ascii = 1, narrow = 1;
1632 SCM res;
1633
1634 if (len == (size_t) -1)
1635 len = strlen (str);
1636
1637 i = 0;
1638 char_len = 0;
1639
1640 while (i < len)
1641 {
1642 if (ustr[i] <= 127)
1643 {
1644 char_len++;
1645 i++;
1646 }
1647 else
1648 {
1649 ucs4_t c;
1650 int nbytes;
1651
1652 ascii = 0;
1653
1654 nbytes = u8_mbtouc (&c, ustr + i, len - i);
1655
1656 if (c == 0xfffd)
1657 /* Bad UTF-8. */
1658 decoding_error (__func__, errno, str, len);
1659
1660 if (c > 255)
1661 narrow = 0;
1662
1663 char_len++;
1664 i += nbytes;
1665 }
1666 }
1667
1668 if (ascii)
1669 {
1670 char *dst;
1671 res = scm_i_make_string (char_len, &dst, 0);
1672 memcpy (dst, str, len);
1673 }
1674 else if (narrow)
1675 {
1676 char *dst;
1677 size_t j;
1678 ucs4_t c;
1679
1680 res = scm_i_make_string (char_len, &dst, 0);
1681
1682 for (i = 0, j = 0; i < len; j++)
1683 {
1684 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1685 dst[j] = (signed char) c;
1686 }
1687 }
1688 else
1689 {
1690 scm_t_wchar *dst;
1691 size_t j;
1692 ucs4_t c;
1693
1694 res = scm_i_make_wide_string (char_len, &dst, 0);
1695
1696 for (i = 0, j = 0; i < len; j++)
1697 {
1698 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1699 dst[j] = c;
1700 }
1701 }
1702
1703 return res;
1704 }
1705
1706 SCM
1707 scm_from_utf32_string (const scm_t_wchar *str)
1708 {
1709 return scm_from_utf32_stringn (str, -1);
1710 }
1711
1712 SCM
1713 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1714 {
1715 SCM result;
1716 scm_t_wchar *buf;
1717
1718 if (len == (size_t) -1)
1719 len = u32_strlen ((uint32_t *) str);
1720
1721 result = scm_i_make_wide_string (len, &buf, 0);
1722 memcpy (buf, str, len * sizeof (scm_t_wchar));
1723 scm_i_try_narrow_string (result);
1724
1725 return result;
1726 }
1727
1728 SCM
1729 scm_from_port_string (const char *str, SCM port)
1730 {
1731 return scm_from_port_stringn (str, -1, port);
1732 }
1733
1734 SCM
1735 scm_from_port_stringn (const char *str, size_t len, SCM port)
1736 {
1737 scm_t_port *pt = SCM_PTAB_ENTRY (port);
1738 scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
1739
1740 if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
1741 return scm_from_latin1_stringn (str, len);
1742 else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
1743 && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
1744 return scm_from_utf8_stringn (str, len);
1745 else
1746 return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
1747 }
1748
1749 /* Create a new scheme string from the C string STR. The memory of
1750 STR may be used directly as storage for the new string. */
1751 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1752 would be to register a finalizer to eventually free(3) STR, which isn't
1753 worth it. Should we just deprecate the `scm_take_' functions? */
1754 SCM
1755 scm_take_locale_stringn (char *str, size_t len)
1756 {
1757 SCM res;
1758
1759 res = scm_from_locale_stringn (str, len);
1760 free (str);
1761
1762 return res;
1763 }
1764
1765 SCM
1766 scm_take_locale_string (char *str)
1767 {
1768 return scm_take_locale_stringn (str, -1);
1769 }
1770
1771 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1772 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1773 Set *LENP to the size of the resulting string.
1774
1775 FIXME: This is a hack we should get rid of. See
1776 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1777 for details. */
1778 static void
1779 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1780 {
1781 char *before, *after;
1782 size_t i, j;
1783
1784 before = buf;
1785 after = buf;
1786 i = 0;
1787 j = 0;
1788 while (i < *lenp)
1789 {
1790 if ((i <= *lenp - 6)
1791 && before[i] == '\\'
1792 && before[i + 1] == 'u'
1793 && before[i + 2] == '0' && before[i + 3] == '0')
1794 {
1795 /* Convert \u00NN to \xNN */
1796 after[j] = '\\';
1797 after[j + 1] = 'x';
1798 after[j + 2] = tolower ((int) before[i + 4]);
1799 after[j + 3] = tolower ((int) before[i + 5]);
1800 i += 6;
1801 j += 4;
1802 }
1803 else if ((i <= *lenp - 10)
1804 && before[i] == '\\'
1805 && before[i + 1] == 'U'
1806 && before[i + 2] == '0' && before[i + 3] == '0')
1807 {
1808 /* Convert \U00NNNNNN to \UNNNNNN */
1809 after[j] = '\\';
1810 after[j + 1] = 'U';
1811 after[j + 2] = tolower ((int) before[i + 4]);
1812 after[j + 3] = tolower ((int) before[i + 5]);
1813 after[j + 4] = tolower ((int) before[i + 6]);
1814 after[j + 5] = tolower ((int) before[i + 7]);
1815 after[j + 6] = tolower ((int) before[i + 8]);
1816 after[j + 7] = tolower ((int) before[i + 9]);
1817 i += 10;
1818 j += 8;
1819 }
1820 else
1821 {
1822 after[j] = before[i];
1823 i++;
1824 j++;
1825 }
1826 }
1827 *lenp = j;
1828 }
1829
1830 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1831 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1832 of the resulting string. BUF must be large enough to handle the
1833 worst case when `\uXXXX' escapes (6 characters) are replaced by
1834 `\xXXXX;' (7 characters). */
1835 static void
1836 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1837 {
1838 char *before, *after;
1839 size_t i, j;
1840 /* The worst case is if the input string contains all 4-digit hex escapes.
1841 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1842 size_t max_out_len = (*lenp * 7) / 6 + 1;
1843 size_t nzeros, ndigits;
1844
1845 before = buf;
1846 after = alloca (max_out_len);
1847 i = 0;
1848 j = 0;
1849 while (i < *lenp)
1850 {
1851 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1852 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1853 {
1854 if (before[i + 1] == 'u')
1855 ndigits = 4;
1856 else if (before[i + 1] == 'U')
1857 ndigits = 8;
1858 else
1859 abort ();
1860
1861 /* Add the R6RS hex escape initial sequence. */
1862 after[j] = '\\';
1863 after[j + 1] = 'x';
1864
1865 /* Move string positions to the start of the hex numbers. */
1866 i += 2;
1867 j += 2;
1868
1869 /* Find the number of initial zeros in this hex number. */
1870 nzeros = 0;
1871 while (before[i + nzeros] == '0' && nzeros < ndigits)
1872 nzeros++;
1873
1874 /* Copy the number, skipping initial zeros, and then move the string
1875 positions. */
1876 if (nzeros == ndigits)
1877 {
1878 after[j] = '0';
1879 i += ndigits;
1880 j += 1;
1881 }
1882 else
1883 {
1884 int pos;
1885 for (pos = 0; pos < ndigits - nzeros; pos++)
1886 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1887 i += ndigits;
1888 j += (ndigits - nzeros);
1889 }
1890
1891 /* Add terminating semicolon. */
1892 after[j] = ';';
1893 j++;
1894 }
1895 else
1896 {
1897 after[j] = before[i];
1898 i++;
1899 j++;
1900 }
1901 }
1902 *lenp = j;
1903 memcpy (before, after, j);
1904 }
1905
1906 char *
1907 scm_to_locale_string (SCM str)
1908 {
1909 return scm_to_locale_stringn (str, NULL);
1910 }
1911
1912 char *
1913 scm_to_locale_stringn (SCM str, size_t *lenp)
1914 {
1915 return scm_to_stringn (str, lenp,
1916 locale_charset (),
1917 scm_i_default_port_conversion_handler ());
1918 }
1919
1920 char *
1921 scm_to_latin1_string (SCM str)
1922 {
1923 return scm_to_latin1_stringn (str, NULL);
1924 }
1925
1926 char *
1927 scm_to_latin1_stringn (SCM str, size_t *lenp)
1928 #define FUNC_NAME "scm_to_latin1_stringn"
1929 {
1930 char *result;
1931
1932 SCM_VALIDATE_STRING (1, str);
1933
1934 if (scm_i_is_narrow_string (str))
1935 {
1936 size_t len = scm_i_string_length (str);
1937
1938 if (lenp)
1939 *lenp = len;
1940
1941 result = scm_strndup (scm_i_string_data (str), len);
1942 }
1943 else
1944 result = scm_to_stringn (str, lenp, NULL,
1945 SCM_FAILED_CONVERSION_ERROR);
1946
1947 return result;
1948 }
1949 #undef FUNC_NAME
1950
1951 char *
1952 scm_to_utf8_string (SCM str)
1953 {
1954 return scm_to_utf8_stringn (str, NULL);
1955 }
1956
1957 static size_t
1958 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1959 {
1960 size_t ret, i;
1961 for (i = 0, ret = 0; i < len; i++)
1962 ret += (str[i] < 128) ? 1 : 2;
1963 return ret;
1964 }
1965
1966 static scm_t_uint8*
1967 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1968 scm_t_uint8 *u8_result, size_t *u8_lenp)
1969 {
1970 size_t i, n;
1971 size_t u8_len = latin1_u8_strlen (str, latin_len);
1972
1973 if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1974 u8_result = scm_malloc (u8_len + 1);
1975 if (u8_lenp)
1976 *u8_lenp = u8_len;
1977
1978 for (i = 0, n = 0; i < latin_len; i++)
1979 n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1980 if (n != u8_len)
1981 abort ();
1982 u8_result[n] = 0;
1983
1984 return u8_result;
1985 }
1986
1987 /* UTF-8 code table
1988
1989 (Note that this includes code points that are not allowed by Unicode,
1990 but since this function has no way to report an error, and its
1991 purpose is to determine the size of destination buffers for
1992 libunicode conversion functions, we err on the safe side and handle
1993 everything that libunicode might conceivably handle, now or in the
1994 future.)
1995
1996 Char. number range | UTF-8 octet sequence
1997 (hexadecimal) | (binary)
1998 --------------------+------------------------------------------------------
1999 0000 0000-0000 007F | 0xxxxxxx
2000 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2001 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
2002 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2003 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2004 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2005 */
2006
2007 static size_t
2008 u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
2009 {
2010 size_t ret, i;
2011
2012 for (i = 0, ret = 0; i < len; i++)
2013 {
2014 scm_t_uint32 c = str[i];
2015
2016 if (c <= 0x7f)
2017 ret += 1;
2018 else if (c <= 0x7ff)
2019 ret += 2;
2020 else if (c <= 0xffff)
2021 ret += 3;
2022 else if (c <= 0x1fffff)
2023 ret += 4;
2024 else if (c <= 0x3ffffff)
2025 ret += 5;
2026 else
2027 ret += 6;
2028 }
2029
2030 return ret;
2031 }
2032
2033 char *
2034 scm_to_utf8_stringn (SCM str, size_t *lenp)
2035 #define FUNC_NAME "scm_to_utf8_stringn"
2036 {
2037 SCM_VALIDATE_STRING (1, str);
2038
2039 if (scm_i_is_narrow_string (str))
2040 return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
2041 scm_i_string_length (str),
2042 NULL, lenp);
2043 else
2044 {
2045 scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2046 scm_t_uint8 *buf, *ret;
2047 size_t num_chars = scm_i_string_length (str);
2048 size_t num_bytes_predicted, num_bytes_actual;
2049
2050 num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2051
2052 if (lenp)
2053 {
2054 *lenp = num_bytes_predicted;
2055 buf = scm_malloc (num_bytes_predicted);
2056 }
2057 else
2058 {
2059 buf = scm_malloc (num_bytes_predicted + 1);
2060 buf[num_bytes_predicted] = 0;
2061 }
2062
2063 num_bytes_actual = num_bytes_predicted;
2064 ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
2065
2066 if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
2067 return (char *) ret;
2068
2069 /* An error: a bad codepoint. */
2070 {
2071 int saved_errno = errno;
2072
2073 free (buf);
2074 if (!saved_errno)
2075 abort ();
2076
2077 scm_decoding_error ("scm_to_utf8_stringn", errno,
2078 "invalid codepoint in string", str);
2079
2080 /* Not reached. */
2081 return NULL;
2082 }
2083 }
2084 }
2085 #undef FUNC_NAME
2086
2087 scm_t_wchar *
2088 scm_to_utf32_string (SCM str)
2089 {
2090 return scm_to_utf32_stringn (str, NULL);
2091 }
2092
2093 scm_t_wchar *
2094 scm_to_utf32_stringn (SCM str, size_t *lenp)
2095 #define FUNC_NAME "scm_to_utf32_stringn"
2096 {
2097 scm_t_wchar *result;
2098
2099 SCM_VALIDATE_STRING (1, str);
2100
2101 if (scm_i_is_narrow_string (str))
2102 {
2103 scm_t_uint8 *codepoints;
2104 size_t i, len;
2105
2106 codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2107 len = scm_i_string_length (str);
2108 if (lenp)
2109 *lenp = len;
2110
2111 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2112 for (i = 0; i < len; i++)
2113 result[i] = codepoints[i];
2114 result[len] = 0;
2115 }
2116 else
2117 {
2118 size_t len;
2119
2120 len = scm_i_string_length (str);
2121 if (lenp)
2122 *lenp = len;
2123
2124 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2125 memcpy (result, scm_i_string_wide_chars (str),
2126 len * sizeof (scm_t_wchar));
2127 result[len] = 0;
2128 }
2129
2130 return result;
2131 }
2132 #undef FUNC_NAME
2133
2134 char *
2135 scm_to_port_string (SCM str, SCM port)
2136 {
2137 return scm_to_port_stringn (str, NULL, port);
2138 }
2139
2140 char *
2141 scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
2142 {
2143 scm_t_port *pt = SCM_PTAB_ENTRY (port);
2144 scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
2145
2146 if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
2147 && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
2148 return scm_to_latin1_stringn (str, lenp);
2149 else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
2150 return scm_to_utf8_stringn (str, lenp);
2151 else
2152 return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
2153 }
2154
2155 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2156 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2157 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2158 defined by HANDLER. */
2159 char *
2160 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2161 scm_t_string_failed_conversion_handler handler)
2162 {
2163 char *buf;
2164 size_t ilen, len, i;
2165 int ret;
2166 const char *enc;
2167
2168 if (!scm_is_string (str))
2169 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2170 ilen = scm_i_string_length (str);
2171
2172 if (ilen == 0)
2173 {
2174 buf = scm_malloc (1);
2175 buf[0] = '\0';
2176 if (lenp)
2177 *lenp = 0;
2178 return buf;
2179 }
2180
2181 if (lenp == NULL)
2182 for (i = 0; i < ilen; i++)
2183 if (scm_i_string_ref (str, i) == '\0')
2184 scm_misc_error (NULL,
2185 "string contains #\\nul character: ~S",
2186 scm_list_1 (str));
2187
2188 if (scm_i_is_narrow_string (str)
2189 && c_strcasecmp (encoding, "ISO-8859-1") == 0)
2190 {
2191 /* If using native Latin-1 encoding, just copy the string
2192 contents. */
2193 if (lenp)
2194 {
2195 buf = scm_malloc (ilen);
2196 memcpy (buf, scm_i_string_chars (str), ilen);
2197 *lenp = ilen;
2198 return buf;
2199 }
2200 else
2201 {
2202 buf = scm_malloc (ilen + 1);
2203 memcpy (buf, scm_i_string_chars (str), ilen);
2204 buf[ilen] = '\0';
2205 return buf;
2206 }
2207 }
2208
2209
2210 buf = NULL;
2211 len = 0;
2212 enc = encoding;
2213 if (enc == NULL)
2214 enc = "ISO-8859-1";
2215 if (scm_i_is_narrow_string (str))
2216 {
2217 ret = mem_iconveh (scm_i_string_chars (str), ilen,
2218 "ISO-8859-1", enc,
2219 (enum iconv_ilseq_handler) handler, NULL,
2220 &buf, &len);
2221
2222 if (ret != 0)
2223 scm_encoding_error (__func__, errno,
2224 "cannot convert narrow string to output locale",
2225 SCM_BOOL_F,
2226 /* FIXME: Faulty character unknown. */
2227 SCM_BOOL_F);
2228 }
2229 else
2230 {
2231 buf = u32_conv_to_encoding (enc,
2232 (enum iconv_ilseq_handler) handler,
2233 (scm_t_uint32 *) scm_i_string_wide_chars (str),
2234 ilen,
2235 NULL,
2236 NULL, &len);
2237 if (buf == NULL)
2238 scm_encoding_error (__func__, errno,
2239 "cannot convert wide string to output locale",
2240 SCM_BOOL_F,
2241 /* FIXME: Faulty character unknown. */
2242 SCM_BOOL_F);
2243 }
2244 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2245 {
2246 if (SCM_R6RS_ESCAPES_P)
2247 {
2248 /* The worst case is if the input string contains all 4-digit
2249 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2250 (seven characters). Make BUF large enough to hold
2251 that. */
2252 buf = scm_realloc (buf, (len * 7) / 6 + 1);
2253 unistring_escapes_to_r6rs_escapes (buf, &len);
2254 }
2255 else
2256 unistring_escapes_to_guile_escapes (buf, &len);
2257
2258 buf = scm_realloc (buf, len);
2259 }
2260 if (lenp)
2261 *lenp = len;
2262 else
2263 {
2264 buf = scm_realloc (buf, len + 1);
2265 buf[len] = '\0';
2266 }
2267
2268 scm_remember_upto_here_1 (str);
2269 return buf;
2270 }
2271
2272 size_t
2273 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2274 {
2275 size_t len;
2276 char *result = NULL;
2277 if (!scm_is_string (str))
2278 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2279 result = scm_to_locale_stringn (str, &len);
2280
2281 memcpy (buf, result, (len > max_len) ? max_len : len);
2282 free (result);
2283
2284 scm_remember_upto_here_1 (str);
2285 return len;
2286 }
2287
2288 \f
2289 /* Unicode string normalization. */
2290
2291 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2292 libguile/i18n.c. It would be useful to have this factored out into a more
2293 convenient location, but its use of alloca makes that tricky to do. */
2294
2295 static SCM
2296 normalize_str (SCM string, uninorm_t form)
2297 {
2298 SCM ret;
2299 scm_t_uint32 *w_str;
2300 scm_t_wchar *cbuf;
2301 size_t rlen, len = scm_i_string_length (string);
2302
2303 if (scm_i_is_narrow_string (string))
2304 {
2305 size_t i;
2306 const char *buf = scm_i_string_chars (string);
2307
2308 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2309
2310 for (i = 0; i < len; i ++)
2311 w_str[i] = (unsigned char) buf[i];
2312 w_str[len] = 0;
2313 }
2314 else
2315 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2316
2317 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2318
2319 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2320 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2321 free (w_str);
2322
2323 scm_i_try_narrow_string (ret);
2324
2325 return ret;
2326 }
2327
2328 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2329 (SCM string),
2330 "Returns the NFC normalized form of @var{string}.")
2331 #define FUNC_NAME s_scm_string_normalize_nfc
2332 {
2333 SCM_VALIDATE_STRING (1, string);
2334 return normalize_str (string, UNINORM_NFC);
2335 }
2336 #undef FUNC_NAME
2337
2338 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2339 (SCM string),
2340 "Returns the NFD normalized form of @var{string}.")
2341 #define FUNC_NAME s_scm_string_normalize_nfd
2342 {
2343 SCM_VALIDATE_STRING (1, string);
2344 return normalize_str (string, UNINORM_NFD);
2345 }
2346 #undef FUNC_NAME
2347
2348 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2349 (SCM string),
2350 "Returns the NFKC normalized form of @var{string}.")
2351 #define FUNC_NAME s_scm_string_normalize_nfkc
2352 {
2353 SCM_VALIDATE_STRING (1, string);
2354 return normalize_str (string, UNINORM_NFKC);
2355 }
2356 #undef FUNC_NAME
2357
2358 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2359 (SCM string),
2360 "Returns the NFKD normalized form of @var{string}.")
2361 #define FUNC_NAME s_scm_string_normalize_nfkd
2362 {
2363 SCM_VALIDATE_STRING (1, string);
2364 return normalize_str (string, UNINORM_NFKD);
2365 }
2366 #undef FUNC_NAME
2367
2368 /* converts C scm_array of strings to SCM scm_list of strings.
2369 If argc < 0, a null terminated scm_array is assumed.
2370 The current locale encoding is assumed */
2371 SCM
2372 scm_makfromstrs (int argc, char **argv)
2373 {
2374 int i = argc;
2375 SCM lst = SCM_EOL;
2376 if (0 > i)
2377 for (i = 0; argv[i]; i++);
2378 while (i--)
2379 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2380 return lst;
2381 }
2382
2383 /* Return a newly allocated array of char pointers to each of the strings
2384 in args, with a terminating NULL pointer. The strings are encoded using
2385 the current locale. */
2386
2387 char **
2388 scm_i_allocate_string_pointers (SCM list)
2389 #define FUNC_NAME "scm_i_allocate_string_pointers"
2390 {
2391 char **result;
2392 int list_len = scm_ilength (list);
2393 int i;
2394
2395 if (list_len < 0)
2396 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2397
2398 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2399 "string pointers");
2400 result[list_len] = NULL;
2401
2402 /* The list might have been modified in another thread, so
2403 we check LIST before each access.
2404 */
2405 for (i = 0; i < list_len && scm_is_pair (list); i++)
2406 {
2407 SCM str = SCM_CAR (list);
2408 size_t len; /* String length in bytes */
2409 char *c_str = scm_to_locale_stringn (str, &len);
2410
2411 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2412 scm_malloc to allocate the returned string, which must be
2413 explicitly deallocated. This forces us to copy the string a
2414 second time into a new buffer. Ideally there would be variants
2415 of scm_to_*_stringn that can return garbage-collected buffers. */
2416
2417 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2418 memcpy (result[i], c_str, len);
2419 result[i][len] = '\0';
2420 free (c_str);
2421
2422 list = SCM_CDR (list);
2423 }
2424
2425 return result;
2426 }
2427 #undef FUNC_NAME
2428
2429 void
2430 scm_i_get_substring_spec (size_t len,
2431 SCM start, size_t *cstart,
2432 SCM end, size_t *cend)
2433 {
2434 if (SCM_UNBNDP (start))
2435 *cstart = 0;
2436 else
2437 *cstart = scm_to_unsigned_integer (start, 0, len);
2438
2439 if (SCM_UNBNDP (end))
2440 *cend = len;
2441 else
2442 *cend = scm_to_unsigned_integer (end, *cstart, len);
2443 }
2444
2445 static SCM
2446 string_handle_ref (scm_t_array_handle *h, size_t index)
2447 {
2448 return scm_c_string_ref (h->array, index);
2449 }
2450
2451 static void
2452 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2453 {
2454 scm_c_string_set_x (h->array, index, val);
2455 }
2456
2457 static void
2458 string_get_handle (SCM v, scm_t_array_handle *h)
2459 {
2460 h->array = v;
2461 h->ndims = 1;
2462 h->dims = &h->dim0;
2463 h->dim0.lbnd = 0;
2464 h->dim0.ubnd = scm_c_string_length (v) - 1;
2465 h->dim0.inc = 1;
2466 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2467 h->elements = h->writable_elements = NULL;
2468 }
2469
2470 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2471 string_handle_ref, string_handle_set,
2472 string_get_handle)
2473 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2474
2475 void
2476 scm_init_strings ()
2477 {
2478 scm_nullstr = scm_i_make_string (0, NULL, 0);
2479
2480 #include "libguile/strings.x"
2481 }
2482
2483
2484 /*
2485 Local Variables:
2486 c-file-style: "gnu"
2487 End:
2488 */