compile-elisp fn
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <alloca.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <uninorm.h>
30 #include <unistr.h>
31 #include <uniconv.h>
32 #include <c-strcase.h>
33
34 #include "striconveh.h"
35
36 #include "libguile/_scm.h"
37 #include "libguile/chars.h"
38 #include "libguile/root.h"
39 #include "libguile/strings.h"
40 #include "libguile/ports.h"
41 #include "libguile/ports-internal.h"
42 #include "libguile/error.h"
43 #include "libguile/generalized-vectors.h"
44 #include "libguile/deprecation.h"
45 #include "libguile/validate.h"
46 #include "libguile/private-options.h"
47
48 \f
49
50 /* {Strings}
51 */
52
53
54 /* Stringbufs
55 *
56 * XXX - keeping an accurate refcount during GC seems to be quite
57 * tricky, so we just keep score of whether a stringbuf might be
58 * shared, not whether it definitely is.
59 *
60 * The scheme I (mvo) tried to keep an accurate reference count would
61 * recount all strings that point to a stringbuf during the mark-phase
62 * of the GC. This was done since one cannot access the stringbuf of
63 * a string when that string is freed (in order to decrease the
64 * reference count). The memory of the stringbuf might have been
65 * reused already for something completely different.
66 *
67 * This recounted worked for a small number of threads beating on
68 * cow-strings, but it failed randomly with more than 10 threads, say.
69 * I couldn't figure out what went wrong, so I used the conservative
70 * approach implemented below.
71 *
72 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
73 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
74 * strings.
75 */
76
77 /* The size in words of the stringbuf header (type tag + size). */
78 #define STRINGBUF_HEADER_SIZE 2U
79
80 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
81
82 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
83 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
84
85 #define STRINGBUF_TAG scm_tc7_stringbuf
86 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
87 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
88
89 #define STRINGBUF_CONTENTS(buf) ((void *) \
90 SCM_CELL_OBJECT_LOC (buf, \
91 STRINGBUF_HEADER_SIZE))
92 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
93 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
94
95 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
96
97 #define SET_STRINGBUF_SHARED(buf) \
98 do \
99 { \
100 /* Don't modify BUF if it's already marked as shared since it might be \
101 a read-only, statically allocated stringbuf. */ \
102 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
103 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
104 } \
105 while (0)
106
107 #ifdef SCM_STRING_LENGTH_HISTOGRAM
108 static size_t lenhist[1001];
109 #endif
110
111 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
112 characters. */
113 static SCM
114 make_stringbuf (size_t len)
115 {
116 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
117 scm_i_symbol_chars, all stringbufs are null-terminated. Once
118 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
119 has been changed for scm_i_symbol_chars, this null-termination
120 can be dropped.
121 */
122
123 SCM buf;
124
125 #ifdef SCM_STRING_LENGTH_HISTOGRAM
126 if (len < 1000)
127 lenhist[len]++;
128 else
129 lenhist[1000]++;
130 #endif
131
132 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
133 "string"));
134
135 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
136 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
137
138 STRINGBUF_CHARS (buf)[len] = 0;
139
140 return buf;
141 }
142
143 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
144 characters. */
145 static SCM
146 make_wide_stringbuf (size_t len)
147 {
148 SCM buf;
149 size_t raw_len;
150
151 #ifdef SCM_STRING_LENGTH_HISTOGRAM
152 if (len < 1000)
153 lenhist[len]++;
154 else
155 lenhist[1000]++;
156 #endif
157
158 raw_len = (len + 1) * sizeof (scm_t_wchar);
159 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
160 "string"));
161
162 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
163 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
164
165 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
166
167 return buf;
168 }
169
170 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
171 characters from BUF. */
172 static SCM
173 wide_stringbuf (SCM buf)
174 {
175 SCM new_buf;
176
177 if (STRINGBUF_WIDE (buf))
178 new_buf = buf;
179 else
180 {
181 size_t i, len;
182 scm_t_wchar *mem;
183
184 len = STRINGBUF_LENGTH (buf);
185
186 new_buf = make_wide_stringbuf (len);
187
188 mem = STRINGBUF_WIDE_CHARS (new_buf);
189 for (i = 0; i < len; i++)
190 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
191 mem[len] = 0;
192 }
193
194 return new_buf;
195 }
196
197 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
198 characters from BUF, if possible. */
199 static SCM
200 narrow_stringbuf (SCM buf)
201 {
202 SCM new_buf;
203
204 if (!STRINGBUF_WIDE (buf))
205 new_buf = buf;
206 else
207 {
208 size_t i, len;
209 scm_t_wchar *wmem;
210 unsigned char *mem;
211
212 len = STRINGBUF_LENGTH (buf);
213 wmem = STRINGBUF_WIDE_CHARS (buf);
214
215 for (i = 0; i < len; i++)
216 if (wmem[i] > 0xFF)
217 /* BUF cannot be narrowed. */
218 return buf;
219
220 new_buf = make_stringbuf (len);
221
222 mem = STRINGBUF_CHARS (new_buf);
223 for (i = 0; i < len; i++)
224 mem[i] = (unsigned char) wmem[i];
225 mem[len] = 0;
226 }
227
228 return new_buf;
229 }
230
231 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
232
233 \f
234 /* Copy-on-write strings.
235 */
236
237 #define STRING_TAG scm_tc7_string
238
239 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
240 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
241 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
242
243 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
244 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
245
246 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
247
248 /* Read-only strings.
249 */
250
251 #define RO_STRING_TAG scm_tc7_ro_string
252 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
253
254 /* Mutation-sharing substrings
255 */
256
257 #define SH_STRING_TAG (scm_tc7_string + 0x100)
258
259 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
260 /* START and LENGTH as for STRINGs. */
261
262 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
263
264 void
265 scm_i_print_stringbuf (SCM exp, SCM port, scm_print_state *pstate)
266 {
267 SCM str;
268
269 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
270 SET_STRINGBUF_SHARED (exp);
271 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
272
273 str = scm_double_cell (RO_STRING_TAG, SCM_UNPACK(exp),
274 0, STRINGBUF_LENGTH (exp));
275
276 scm_puts ("#<stringbuf ", port);
277 scm_iprin1 (str, port, pstate);
278 scm_puts (">", port);
279 }
280
281 SCM scm_nullstr;
282
283 static SCM null_stringbuf;
284
285 static void
286 init_null_stringbuf (void)
287 {
288 null_stringbuf = make_stringbuf (0);
289 SET_STRINGBUF_SHARED (null_stringbuf);
290 }
291
292 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
293 characters. CHARSP, if not NULL, will be set to location of the
294 char array. If READ_ONLY_P, the returned string is read-only;
295 otherwise it is writable. */
296 SCM
297 scm_i_make_string (size_t len, char **charsp, int read_only_p)
298 {
299 SCM buf;
300 SCM res;
301
302 if (len == 0)
303 {
304 static scm_i_pthread_once_t once = SCM_I_PTHREAD_ONCE_INIT;
305 scm_i_pthread_once (&once, init_null_stringbuf);
306 buf = null_stringbuf;
307 }
308 else
309 buf = make_stringbuf (len);
310
311 if (charsp)
312 *charsp = (char *) STRINGBUF_CHARS (buf);
313 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
314 SCM_UNPACK (buf),
315 (scm_t_bits) 0, (scm_t_bits) len);
316 return res;
317 }
318
319 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
320 characters. CHARSP, if not NULL, will be set to location of the
321 character array. If READ_ONLY_P, the returned string is read-only;
322 otherwise it is writable. */
323 SCM
324 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
325 {
326 SCM buf = make_wide_stringbuf (len);
327 SCM res;
328 if (charsp)
329 *charsp = STRINGBUF_WIDE_CHARS (buf);
330 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
331 SCM_UNPACK (buf),
332 (scm_t_bits) 0, (scm_t_bits) len);
333 return res;
334 }
335
336 static void
337 validate_substring_args (SCM str, size_t start, size_t end)
338 {
339 if (!IS_STRING (str))
340 scm_wrong_type_arg_msg (NULL, 0, str, "string");
341 if (start > STRING_LENGTH (str))
342 scm_out_of_range (NULL, scm_from_size_t (start));
343 if (end > STRING_LENGTH (str) || end < start)
344 scm_out_of_range (NULL, scm_from_size_t (end));
345 }
346
347 static inline void
348 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
349 {
350 *start = STRING_START (*str);
351 if (IS_SH_STRING (*str))
352 {
353 *str = SH_STRING_STRING (*str);
354 *start += STRING_START (*str);
355 }
356 *buf = STRING_STRINGBUF (*str);
357 }
358
359 SCM
360 scm_i_substring (SCM str, size_t start, size_t end)
361 {
362 if (start == end)
363 return scm_i_make_string (0, NULL, 0);
364 else
365 {
366 SCM buf;
367 size_t str_start;
368 get_str_buf_start (&str, &buf, &str_start);
369 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
370 SET_STRINGBUF_SHARED (buf);
371 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
372 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
373 (scm_t_bits)str_start + start,
374 (scm_t_bits) end - start);
375 }
376 }
377
378 SCM
379 scm_i_substring_read_only (SCM str, size_t start, size_t end)
380 {
381 if (start == end)
382 return scm_i_make_string (0, NULL, 1);
383 else
384 {
385 SCM buf;
386 size_t str_start;
387 get_str_buf_start (&str, &buf, &str_start);
388 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
389 SET_STRINGBUF_SHARED (buf);
390 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
391 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
392 (scm_t_bits)str_start + start,
393 (scm_t_bits) end - start);
394 }
395 }
396
397 SCM
398 scm_i_substring_copy (SCM str, size_t start, size_t end)
399 {
400 if (start == end)
401 return scm_i_make_string (0, NULL, 0);
402 else
403 {
404 size_t len = end - start;
405 SCM buf, my_buf, substr;
406 size_t str_start;
407 int wide = 0;
408 get_str_buf_start (&str, &buf, &str_start);
409 if (scm_i_is_narrow_string (str))
410 {
411 my_buf = make_stringbuf (len);
412 memcpy (STRINGBUF_CHARS (my_buf),
413 STRINGBUF_CHARS (buf) + str_start + start, len);
414 }
415 else
416 {
417 my_buf = make_wide_stringbuf (len);
418 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
419 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
420 + start), len);
421 wide = 1;
422 }
423 scm_remember_upto_here_1 (buf);
424 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
425 (scm_t_bits) 0, (scm_t_bits) len);
426 if (wide)
427 scm_i_try_narrow_string (substr);
428 return substr;
429 }
430 }
431
432 SCM
433 scm_i_substring_shared (SCM str, size_t start, size_t end)
434 {
435 if (start == 0 && end == STRING_LENGTH (str))
436 return str;
437 else if (start == end)
438 return scm_i_make_string (0, NULL, 0);
439 else
440 {
441 size_t len = end - start;
442 if (IS_SH_STRING (str))
443 {
444 start += STRING_START (str);
445 str = SH_STRING_STRING (str);
446 }
447 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
448 (scm_t_bits)start, (scm_t_bits) len);
449 }
450 }
451
452 SCM
453 scm_c_substring (SCM str, size_t start, size_t end)
454 {
455 validate_substring_args (str, start, end);
456 return scm_i_substring (str, start, end);
457 }
458
459 SCM
460 scm_c_substring_read_only (SCM str, size_t start, size_t end)
461 {
462 validate_substring_args (str, start, end);
463 return scm_i_substring_read_only (str, start, end);
464 }
465
466 SCM
467 scm_c_substring_copy (SCM str, size_t start, size_t end)
468 {
469 validate_substring_args (str, start, end);
470 return scm_i_substring_copy (str, start, end);
471 }
472
473 SCM
474 scm_c_substring_shared (SCM str, size_t start, size_t end)
475 {
476 validate_substring_args (str, start, end);
477 return scm_i_substring_shared (str, start, end);
478 }
479
480 \f
481 /* Internal accessors
482 */
483
484 /* Returns the number of characters in STR. This may be different
485 than the memory size of the string storage. */
486 size_t
487 scm_i_string_length (SCM str)
488 {
489 return STRING_LENGTH (str);
490 }
491
492 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
493 encoding. False if it is 'wide', having a 32-bit UCS-4
494 encoding. */
495 int
496 scm_i_is_narrow_string (SCM str)
497 {
498 if (IS_SH_STRING (str))
499 str = SH_STRING_STRING (str);
500
501 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
502 }
503
504 /* Try to coerce a string to be narrow. It if is narrow already, do
505 nothing. If it is wide, shrink it to narrow if none of its
506 characters are above 0xFF. Return true if the string is narrow or
507 was made to be narrow. */
508 int
509 scm_i_try_narrow_string (SCM str)
510 {
511 if (IS_SH_STRING (str))
512 str = SH_STRING_STRING (str);
513
514 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
515
516 return scm_i_is_narrow_string (str);
517 }
518
519 /* Return a pointer to the raw data of the string, which can be either Latin-1
520 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
521 const void *
522 scm_i_string_data (SCM str)
523 {
524 SCM buf;
525 size_t start;
526 const char *data;
527
528 get_str_buf_start (&str, &buf, &start);
529
530 data = STRINGBUF_CONTENTS (buf);
531 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
532
533 return data;
534 }
535
536 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
537 STR. */
538 const char *
539 scm_i_string_chars (SCM str)
540 {
541 SCM buf;
542 size_t start;
543 get_str_buf_start (&str, &buf, &start);
544 if (scm_i_is_narrow_string (str))
545 return (const char *) STRINGBUF_CHARS (buf) + start;
546 else
547 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
548 scm_list_1 (str));
549 return NULL;
550 }
551
552 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
553 STR. */
554 const scm_t_wchar *
555 scm_i_string_wide_chars (SCM str)
556 {
557 SCM buf;
558 size_t start;
559
560 get_str_buf_start (&str, &buf, &start);
561 if (!scm_i_is_narrow_string (str))
562 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
563 else
564 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
565 scm_list_1 (str));
566 }
567
568 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
569 a new string buffer, so that it can be modified without modifying
570 other strings. Also, lock the string mutex. Later, one must call
571 scm_i_string_stop_writing to unlock the mutex. */
572 SCM
573 scm_i_string_start_writing (SCM orig_str)
574 {
575 SCM buf, str = orig_str;
576 size_t start;
577
578 get_str_buf_start (&str, &buf, &start);
579 if (IS_RO_STRING (str))
580 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
581
582 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
583 if (STRINGBUF_SHARED (buf))
584 {
585 /* Clone the stringbuf. */
586 size_t len = STRING_LENGTH (str);
587 SCM new_buf;
588
589 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
590
591 if (scm_i_is_narrow_string (str))
592 {
593 new_buf = make_stringbuf (len);
594 memcpy (STRINGBUF_CHARS (new_buf),
595 STRINGBUF_CHARS (buf) + STRING_START (str), len);
596
597 }
598 else
599 {
600 new_buf = make_wide_stringbuf (len);
601 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
602 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
603 + STRING_START (str)), len);
604 }
605
606 SET_STRING_STRINGBUF (str, new_buf);
607 start -= STRING_START (str);
608
609 /* FIXME: The following operations are not atomic, so other threads
610 looking at STR may see an inconsistent state. Nevertheless it can't
611 hurt much since (i) accessing STR while it is being mutated can't
612 yield a crash, and (ii) concurrent accesses to STR should be
613 protected by a mutex at the application level. The latter may not
614 apply when STR != ORIG_STR, though. */
615 SET_STRING_START (str, 0);
616 SET_STRING_STRINGBUF (str, new_buf);
617
618 buf = new_buf;
619
620 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
621 }
622 return orig_str;
623 }
624
625 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
626 char *
627 scm_i_string_writable_chars (SCM str)
628 {
629 SCM buf;
630 size_t start;
631
632 get_str_buf_start (&str, &buf, &start);
633 if (scm_i_is_narrow_string (str))
634 return (char *) STRINGBUF_CHARS (buf) + start;
635 else
636 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
637 scm_list_1 (str));
638 return NULL;
639 }
640
641 /* Return a pointer to the UCS-4 codepoints of a string. */
642 static scm_t_wchar *
643 scm_i_string_writable_wide_chars (SCM str)
644 {
645 SCM buf;
646 size_t start;
647
648 get_str_buf_start (&str, &buf, &start);
649 if (!scm_i_is_narrow_string (str))
650 return STRINGBUF_WIDE_CHARS (buf) + start;
651 else
652 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
653 scm_list_1 (str));
654 }
655
656 /* Unlock the string mutex that was locked when
657 scm_i_string_start_writing was called. */
658 void
659 scm_i_string_stop_writing (void)
660 {
661 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
662 }
663
664 /* Return the Xth character of STR as a UCS-4 codepoint. */
665 scm_t_wchar
666 scm_i_string_ref (SCM str, size_t x)
667 {
668 if (scm_i_is_narrow_string (str))
669 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
670 else
671 return scm_i_string_wide_chars (str)[x];
672 }
673
674 /* Returns index+1 of the first char in STR that matches C, or
675 0 if the char is not found. */
676 int
677 scm_i_string_contains_char (SCM str, char ch)
678 {
679 size_t i;
680 size_t len = scm_i_string_length (str);
681
682 i = 0;
683 if (scm_i_is_narrow_string (str))
684 {
685 while (i < len)
686 {
687 if (scm_i_string_chars (str)[i] == ch)
688 return i+1;
689 i++;
690 }
691 }
692 else
693 {
694 while (i < len)
695 {
696 if (scm_i_string_wide_chars (str)[i]
697 == (unsigned char) ch)
698 return i+1;
699 i++;
700 }
701 }
702 return 0;
703 }
704
705 int
706 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
707 {
708 if (scm_i_is_narrow_string (sstr))
709 {
710 const char *a = scm_i_string_chars (sstr) + start_x;
711 const char *b = cstr;
712 return strncmp (a, b, strlen(b));
713 }
714 else
715 {
716 size_t i;
717 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
718 const char *b = cstr;
719 for (i = 0; i < strlen (b); i++)
720 {
721 if (a[i] != (unsigned char) b[i])
722 return 1;
723 }
724 }
725 return 0;
726 }
727
728 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
729 void
730 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
731 {
732 if (IS_SH_STRING (str))
733 {
734 p += STRING_START (str);
735 str = SH_STRING_STRING (str);
736 }
737
738 if (chr > 0xFF && scm_i_is_narrow_string (str))
739 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
740
741 if (scm_i_is_narrow_string (str))
742 {
743 char *dst = scm_i_string_writable_chars (str);
744 dst[p] = chr;
745 }
746 else
747 {
748 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
749 dst[p] = chr;
750 }
751 }
752
753 \f
754 /* Symbols.
755
756 Basic symbol creation and accessing is done here, the rest is in
757 symbols.[hc]. This has been done to keep stringbufs and the
758 internals of strings and string-like objects confined to this file.
759 */
760
761 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
762
763 SCM
764 scm_i_make_symbol (SCM name, scm_t_bits flags,
765 unsigned long hash, SCM props)
766 {
767 SCM buf;
768 size_t start = STRING_START (name);
769 size_t length = STRING_LENGTH (name);
770
771 if (IS_SH_STRING (name))
772 {
773 name = SH_STRING_STRING (name);
774 start += STRING_START (name);
775 }
776 buf = STRING_STRINGBUF (name);
777
778 if (start == 0 && length == STRINGBUF_LENGTH (buf))
779 {
780 /* reuse buf. */
781 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
782 SET_STRINGBUF_SHARED (buf);
783 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
784 }
785 else
786 {
787 /* make new buf. */
788 if (scm_i_is_narrow_string (name))
789 {
790 SCM new_buf = make_stringbuf (length);
791 memcpy (STRINGBUF_CHARS (new_buf),
792 STRINGBUF_CHARS (buf) + start, length);
793 buf = new_buf;
794 }
795 else
796 {
797 SCM new_buf = make_wide_stringbuf (length);
798 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
799 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
800 length);
801 buf = new_buf;
802 }
803 }
804 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
805 (scm_t_bits) hash, SCM_UNPACK (props));
806 }
807
808 SCM
809 scm_i_c_make_symbol (const char *name, size_t len,
810 scm_t_bits flags, unsigned long hash, SCM props)
811 {
812 SCM buf = make_stringbuf (len);
813 memcpy (STRINGBUF_CHARS (buf), name, len);
814
815 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
816 (scm_t_bits) hash, SCM_UNPACK (props));
817 }
818
819 /* Returns the number of characters in SYM. This may be different
820 from the memory size of SYM. */
821 size_t
822 scm_i_symbol_length (SCM sym)
823 {
824 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
825 }
826
827 size_t
828 scm_c_symbol_length (SCM sym)
829 #define FUNC_NAME "scm_c_symbol_length"
830 {
831 SCM_VALIDATE_SYMBOL (1, sym);
832
833 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
834 }
835 #undef FUNC_NAME
836
837 /* True if the name of SYM is stored as a Latin-1 encoded string.
838 False if it is stored as a 32-bit UCS-4-encoded string. */
839 int
840 scm_i_is_narrow_symbol (SCM sym)
841 {
842 SCM buf;
843
844 buf = SYMBOL_STRINGBUF (sym);
845 return !STRINGBUF_WIDE (buf);
846 }
847
848 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
849 contains the name of SYM. */
850 const char *
851 scm_i_symbol_chars (SCM sym)
852 {
853 SCM buf;
854
855 buf = SYMBOL_STRINGBUF (sym);
856 if (!STRINGBUF_WIDE (buf))
857 return (const char *) STRINGBUF_CHARS (buf);
858 else
859 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
860 scm_list_1 (sym));
861 }
862
863 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
864 symbol's name. */
865 const scm_t_wchar *
866 scm_i_symbol_wide_chars (SCM sym)
867 {
868 SCM buf;
869
870 buf = SYMBOL_STRINGBUF (sym);
871 if (STRINGBUF_WIDE (buf))
872 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
873 else
874 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
875 scm_list_1 (sym));
876 }
877
878 SCM
879 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
880 {
881 SCM buf = SYMBOL_STRINGBUF (sym);
882 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
883 SET_STRINGBUF_SHARED (buf);
884 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
885 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
886 (scm_t_bits)start, (scm_t_bits) end - start);
887 }
888
889 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
890 scm_t_wchar
891 scm_i_symbol_ref (SCM sym, size_t x)
892 {
893 if (scm_i_is_narrow_symbol (sym))
894 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
895 else
896 return scm_i_symbol_wide_chars (sym)[x];
897 }
898
899 /* Debugging
900 */
901
902 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
903 "Returns an association list containing debugging information\n"
904 "for @var{str}. The association list has the following entries."
905 "@table @code\n"
906 "@item string\n"
907 "The string itself.\n"
908 "@item start\n"
909 "The start index of the string into its stringbuf\n"
910 "@item length\n"
911 "The length of the string\n"
912 "@item shared\n"
913 "If this string is a substring, it returns its parent string.\n"
914 "Otherwise, it returns @code{#f}\n"
915 "@item read-only\n"
916 "@code{#t} if the string is read-only\n"
917 "@item stringbuf-chars\n"
918 "A new string containing this string's stringbuf's characters\n"
919 "@item stringbuf-length\n"
920 "The number of characters in this stringbuf\n"
921 "@item stringbuf-shared\n"
922 "@code{#t} if this stringbuf is shared\n"
923 "@item stringbuf-wide\n"
924 "@code{#t} if this stringbuf's characters are stored in a\n"
925 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
926 "buffer\n"
927 "@end table")
928 #define FUNC_NAME s_scm_sys_string_dump
929 {
930 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
931 SCM buf;
932 SCM_VALIDATE_STRING (1, str);
933
934 /* String info */
935 e1 = scm_cons (scm_from_latin1_symbol ("string"),
936 str);
937 e2 = scm_cons (scm_from_latin1_symbol ("start"),
938 scm_from_size_t (STRING_START (str)));
939 e3 = scm_cons (scm_from_latin1_symbol ("length"),
940 scm_from_size_t (STRING_LENGTH (str)));
941
942 if (IS_SH_STRING (str))
943 {
944 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
945 SH_STRING_STRING (str));
946 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
947 }
948 else
949 {
950 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
951 SCM_BOOL_F);
952 buf = STRING_STRINGBUF (str);
953 }
954
955 if (IS_RO_STRING (str))
956 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
957 SCM_BOOL_T);
958 else
959 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
960 SCM_BOOL_F);
961
962 /* Stringbuf info */
963 if (!STRINGBUF_WIDE (buf))
964 {
965 size_t len = STRINGBUF_LENGTH (buf);
966 char *cbuf;
967 SCM sbc = scm_i_make_string (len, &cbuf, 0);
968 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
969 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
970 sbc);
971 }
972 else
973 {
974 size_t len = STRINGBUF_LENGTH (buf);
975 scm_t_wchar *cbuf;
976 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
977 u32_cpy ((scm_t_uint32 *) cbuf,
978 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
979 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
980 sbc);
981 }
982 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
983 scm_from_size_t (STRINGBUF_LENGTH (buf)));
984 if (STRINGBUF_SHARED (buf))
985 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
986 SCM_BOOL_T);
987 else
988 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
989 SCM_BOOL_F);
990 if (STRINGBUF_WIDE (buf))
991 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
992 SCM_BOOL_T);
993 else
994 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
995 SCM_BOOL_F);
996
997 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
998 }
999 #undef FUNC_NAME
1000
1001 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
1002 "Returns an association list containing debugging information\n"
1003 "for @var{sym}. The association list has the following entries."
1004 "@table @code\n"
1005 "@item symbol\n"
1006 "The symbol itself\n"
1007 "@item hash\n"
1008 "Its hash value\n"
1009 "@item interned\n"
1010 "@code{#t} if it is an interned symbol\n"
1011 "@item stringbuf-chars\n"
1012 "A new string containing this symbols's stringbuf's characters\n"
1013 "@item stringbuf-length\n"
1014 "The number of characters in this stringbuf\n"
1015 "@item stringbuf-shared\n"
1016 "@code{#t} if this stringbuf is shared\n"
1017 "@item stringbuf-wide\n"
1018 "@code{#t} if this stringbuf's characters are stored in a\n"
1019 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1020 "buffer\n"
1021 "@end table")
1022 #define FUNC_NAME s_scm_sys_symbol_dump
1023 {
1024 SCM e1, e2, e3, e4, e5, e6, e7;
1025 SCM buf;
1026 SCM_VALIDATE_SYMBOL (1, sym);
1027 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1028 sym);
1029 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1030 scm_from_ulong (scm_i_symbol_hash (sym)));
1031 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1032 scm_symbol_interned_p (sym));
1033 buf = SYMBOL_STRINGBUF (sym);
1034
1035 /* Stringbuf info */
1036 if (!STRINGBUF_WIDE (buf))
1037 {
1038 size_t len = STRINGBUF_LENGTH (buf);
1039 char *cbuf;
1040 SCM sbc = scm_i_make_string (len, &cbuf, 0);
1041 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1042 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1043 sbc);
1044 }
1045 else
1046 {
1047 size_t len = STRINGBUF_LENGTH (buf);
1048 scm_t_wchar *cbuf;
1049 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1050 u32_cpy ((scm_t_uint32 *) cbuf,
1051 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1052 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1053 sbc);
1054 }
1055 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1056 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1057 if (STRINGBUF_SHARED (buf))
1058 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1059 SCM_BOOL_T);
1060 else
1061 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1062 SCM_BOOL_F);
1063 if (STRINGBUF_WIDE (buf))
1064 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1065 SCM_BOOL_T);
1066 else
1067 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1068 SCM_BOOL_F);
1069 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1070
1071 }
1072 #undef FUNC_NAME
1073
1074 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1075
1076 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1077 #define FUNC_NAME s_scm_sys_stringbuf_hist
1078 {
1079 int i;
1080 for (i = 0; i < 1000; i++)
1081 if (lenhist[i])
1082 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1083 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1084 return SCM_UNSPECIFIED;
1085 }
1086 #undef FUNC_NAME
1087
1088 #endif
1089
1090 \f
1091
1092 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1093 (SCM obj),
1094 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1095 #define FUNC_NAME s_scm_string_p
1096 {
1097 return scm_from_bool (IS_STRING (obj));
1098 }
1099 #undef FUNC_NAME
1100
1101
1102 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1103
1104 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1105 (SCM chrs),
1106 "@deffnx {Scheme Procedure} list->string chrs\n"
1107 "Return a newly allocated string composed of the arguments,\n"
1108 "@var{chrs}.")
1109 #define FUNC_NAME s_scm_string
1110 {
1111 SCM result = SCM_BOOL_F;
1112 SCM rest;
1113 size_t len;
1114 size_t p = 0;
1115 long i;
1116 int wide = 0;
1117
1118 /* Verify that this is a list of chars. */
1119 i = scm_ilength (chrs);
1120 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1121
1122 len = (size_t) i;
1123 rest = chrs;
1124
1125 while (len > 0 && scm_is_pair (rest))
1126 {
1127 SCM elt = SCM_CAR (rest);
1128 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1129 if (SCM_CHAR (elt) > 0xFF)
1130 wide = 1;
1131 rest = SCM_CDR (rest);
1132 len--;
1133 scm_remember_upto_here_1 (elt);
1134 }
1135
1136 /* Construct a string containing this list of chars. */
1137 len = (size_t) i;
1138 rest = chrs;
1139
1140 if (wide == 0)
1141 {
1142 char *buf;
1143
1144 result = scm_i_make_string (len, NULL, 0);
1145 result = scm_i_string_start_writing (result);
1146 buf = scm_i_string_writable_chars (result);
1147 while (len > 0 && scm_is_pair (rest))
1148 {
1149 SCM elt = SCM_CAR (rest);
1150 buf[p] = (unsigned char) SCM_CHAR (elt);
1151 p++;
1152 rest = SCM_CDR (rest);
1153 len--;
1154 scm_remember_upto_here_1 (elt);
1155 }
1156 }
1157 else
1158 {
1159 scm_t_wchar *buf;
1160
1161 result = scm_i_make_wide_string (len, NULL, 0);
1162 result = scm_i_string_start_writing (result);
1163 buf = scm_i_string_writable_wide_chars (result);
1164 while (len > 0 && scm_is_pair (rest))
1165 {
1166 SCM elt = SCM_CAR (rest);
1167 buf[p] = SCM_CHAR (elt);
1168 p++;
1169 rest = SCM_CDR (rest);
1170 len--;
1171 scm_remember_upto_here_1 (elt);
1172 }
1173 }
1174 scm_i_string_stop_writing ();
1175
1176 if (len > 0)
1177 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1178 if (!scm_is_null (rest))
1179 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1180
1181 return result;
1182 }
1183 #undef FUNC_NAME
1184
1185 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1186 (SCM k, SCM chr),
1187 "Return a newly allocated string of\n"
1188 "length @var{k}. If @var{chr} is given, then all elements of\n"
1189 "the string are initialized to @var{chr}, otherwise the contents\n"
1190 "of the string are all set to @code{#\nul}.")
1191 #define FUNC_NAME s_scm_make_string
1192 {
1193 return scm_c_make_string (scm_to_size_t (k), chr);
1194 }
1195 #undef FUNC_NAME
1196
1197 SCM
1198 scm_c_make_string (size_t len, SCM chr)
1199 #define FUNC_NAME NULL
1200 {
1201 size_t p;
1202 char *contents = NULL;
1203 SCM res = scm_i_make_string (len, &contents, 0);
1204
1205 /* If no char is given, initialize string contents to NULL. */
1206 if (SCM_UNBNDP (chr))
1207 memset (contents, 0, len);
1208 else
1209 {
1210 SCM_VALIDATE_CHAR (0, chr);
1211 res = scm_i_string_start_writing (res);
1212 for (p = 0; p < len; p++)
1213 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1214 scm_i_string_stop_writing ();
1215 }
1216
1217 return res;
1218 }
1219 #undef FUNC_NAME
1220
1221 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1222 (SCM string),
1223 "Return the number of characters in @var{string}.")
1224 #define FUNC_NAME s_scm_string_length
1225 {
1226 SCM_VALIDATE_STRING (1, string);
1227 return scm_from_size_t (STRING_LENGTH (string));
1228 }
1229 #undef FUNC_NAME
1230
1231 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1232 (SCM string),
1233 "Return the bytes used to represent a character in @var{string}."
1234 "This will return 1 or 4.")
1235 #define FUNC_NAME s_scm_string_bytes_per_char
1236 {
1237 SCM_VALIDATE_STRING (1, string);
1238 if (!scm_i_is_narrow_string (string))
1239 return scm_from_int (4);
1240
1241 return scm_from_int (1);
1242 }
1243 #undef FUNC_NAME
1244
1245 size_t
1246 scm_c_string_length (SCM string)
1247 {
1248 if (!IS_STRING (string))
1249 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1250 return STRING_LENGTH (string);
1251 }
1252
1253 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1254 (SCM str, SCM k),
1255 "Return character @var{k} of @var{str} using zero-origin\n"
1256 "indexing. @var{k} must be a valid index of @var{str}.")
1257 #define FUNC_NAME s_scm_string_ref
1258 {
1259 size_t len;
1260 unsigned long idx;
1261
1262 SCM_VALIDATE_STRING (1, str);
1263
1264 len = scm_i_string_length (str);
1265 if (SCM_LIKELY (len > 0))
1266 idx = scm_to_unsigned_integer (k, 0, len - 1);
1267 else
1268 scm_out_of_range (NULL, k);
1269
1270 if (scm_i_is_narrow_string (str))
1271 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1272 else
1273 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1274 }
1275 #undef FUNC_NAME
1276
1277 SCM
1278 scm_c_string_ref (SCM str, size_t p)
1279 {
1280 if (p >= scm_i_string_length (str))
1281 scm_out_of_range (NULL, scm_from_size_t (p));
1282 if (scm_i_is_narrow_string (str))
1283 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1284 else
1285 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1286
1287 }
1288
1289 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1290 (SCM str, SCM k, SCM chr),
1291 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1292 "an unspecified value. @var{k} must be a valid index of\n"
1293 "@var{str}.")
1294 #define FUNC_NAME s_scm_string_set_x
1295 {
1296 size_t len;
1297 unsigned long idx;
1298
1299 SCM_VALIDATE_STRING (1, str);
1300
1301 len = scm_i_string_length (str);
1302 if (SCM_LIKELY (len > 0))
1303 idx = scm_to_unsigned_integer (k, 0, len - 1);
1304 else
1305 scm_out_of_range (NULL, k);
1306
1307 SCM_VALIDATE_CHAR (3, chr);
1308 str = scm_i_string_start_writing (str);
1309 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1310 scm_i_string_stop_writing ();
1311
1312 return SCM_UNSPECIFIED;
1313 }
1314 #undef FUNC_NAME
1315
1316 void
1317 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1318 {
1319 if (p >= scm_i_string_length (str))
1320 scm_out_of_range (NULL, scm_from_size_t (p));
1321 str = scm_i_string_start_writing (str);
1322 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1323 scm_i_string_stop_writing ();
1324 }
1325
1326 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1327 (SCM str, SCM start, SCM end),
1328 "Return a newly allocated string formed from the characters\n"
1329 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1330 "ending with index @var{end} (exclusive).\n"
1331 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1332 "exact integers satisfying:\n\n"
1333 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1334 #define FUNC_NAME s_scm_substring
1335 {
1336 size_t len, from, to;
1337
1338 SCM_VALIDATE_STRING (1, str);
1339 len = scm_i_string_length (str);
1340 from = scm_to_unsigned_integer (start, 0, len);
1341 if (SCM_UNBNDP (end))
1342 to = len;
1343 else
1344 to = scm_to_unsigned_integer (end, from, len);
1345 return scm_i_substring (str, from, to);
1346 }
1347 #undef FUNC_NAME
1348
1349 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1350 (SCM str, SCM start, SCM end),
1351 "Return a newly allocated string formed from the characters\n"
1352 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1353 "ending with index @var{end} (exclusive).\n"
1354 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1355 "exact integers satisfying:\n"
1356 "\n"
1357 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1358 "\n"
1359 "The returned string is read-only.\n")
1360 #define FUNC_NAME s_scm_substring_read_only
1361 {
1362 size_t len, from, to;
1363
1364 SCM_VALIDATE_STRING (1, str);
1365 len = scm_i_string_length (str);
1366 from = scm_to_unsigned_integer (start, 0, len);
1367 if (SCM_UNBNDP (end))
1368 to = len;
1369 else
1370 to = scm_to_unsigned_integer (end, from, len);
1371 return scm_i_substring_read_only (str, from, to);
1372 }
1373 #undef FUNC_NAME
1374
1375 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1376 (SCM str, SCM start, SCM end),
1377 "Return a newly allocated string formed from the characters\n"
1378 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1379 "ending with index @var{end} (exclusive).\n"
1380 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1381 "exact integers satisfying:\n\n"
1382 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1383 #define FUNC_NAME s_scm_substring_copy
1384 {
1385 /* For the Scheme version, START is mandatory, but for the C
1386 version, it is optional. See scm_string_copy in srfi-13.c for a
1387 rationale.
1388 */
1389
1390 size_t from, to;
1391
1392 SCM_VALIDATE_STRING (1, str);
1393 scm_i_get_substring_spec (scm_i_string_length (str),
1394 start, &from, end, &to);
1395 return scm_i_substring_copy (str, from, to);
1396 }
1397 #undef FUNC_NAME
1398
1399 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1400 (SCM str, SCM start, SCM end),
1401 "Return string that indirectly refers to the characters\n"
1402 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1403 "ending with index @var{end} (exclusive).\n"
1404 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1405 "exact integers satisfying:\n\n"
1406 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1407 #define FUNC_NAME s_scm_substring_shared
1408 {
1409 size_t len, from, to;
1410
1411 SCM_VALIDATE_STRING (1, str);
1412 len = scm_i_string_length (str);
1413 from = scm_to_unsigned_integer (start, 0, len);
1414 if (SCM_UNBNDP (end))
1415 to = len;
1416 else
1417 to = scm_to_unsigned_integer (end, from, len);
1418 return scm_i_substring_shared (str, from, to);
1419 }
1420 #undef FUNC_NAME
1421
1422 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1423 (SCM args),
1424 "Return a newly allocated string whose characters form the\n"
1425 "concatenation of the given strings, @var{args}.")
1426 #define FUNC_NAME s_scm_string_append
1427 {
1428 SCM res;
1429 size_t total = 0;
1430 size_t len;
1431 int wide = 0;
1432 SCM l, s;
1433 size_t i;
1434 union
1435 {
1436 char *narrow;
1437 scm_t_wchar *wide;
1438 } data;
1439
1440 SCM_VALIDATE_REST_ARGUMENT (args);
1441 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1442 {
1443 s = SCM_CAR (l);
1444 SCM_VALIDATE_STRING (SCM_ARGn, s);
1445 len = scm_i_string_length (s);
1446 if (((size_t) -1) - total < len)
1447 scm_num_overflow (s_scm_string_append);
1448 total += len;
1449 if (!scm_i_is_narrow_string (s))
1450 wide = 1;
1451 }
1452 data.narrow = NULL;
1453 if (!wide)
1454 res = scm_i_make_string (total, &data.narrow, 0);
1455 else
1456 res = scm_i_make_wide_string (total, &data.wide, 0);
1457
1458 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1459 {
1460 size_t len;
1461 s = SCM_CAR (l);
1462 SCM_VALIDATE_STRING (SCM_ARGn, s);
1463 len = scm_i_string_length (s);
1464 if (len > total)
1465 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1466 if (!wide)
1467 {
1468 memcpy (data.narrow, scm_i_string_chars (s), len);
1469 data.narrow += len;
1470 }
1471 else
1472 {
1473 if (scm_i_is_narrow_string (s))
1474 {
1475 const char *src = scm_i_string_chars (s);
1476 for (i = 0; i < len; i++)
1477 data.wide[i] = (unsigned char) src[i];
1478 }
1479 else
1480 u32_cpy ((scm_t_uint32 *) data.wide,
1481 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1482 data.wide += len;
1483 }
1484 total -= len;
1485 scm_remember_upto_here_1 (s);
1486 }
1487 if (total != 0)
1488 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1489 return res;
1490 }
1491 #undef FUNC_NAME
1492
1493
1494 \f
1495 /* Charset conversion error handling. */
1496
1497 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1498 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1499
1500 /* Raise an exception informing that character CHR could not be written
1501 to PORT in its current encoding. */
1502 void
1503 scm_encoding_error (const char *subr, int err, const char *message,
1504 SCM port, SCM chr)
1505 {
1506 scm_throw (scm_encoding_error_key,
1507 scm_list_n (scm_from_latin1_string (subr),
1508 scm_from_latin1_string (message),
1509 scm_from_int (err),
1510 port, chr,
1511 SCM_UNDEFINED));
1512 }
1513
1514 /* Raise an exception informing of an encoding error on PORT. This
1515 means that a character could not be written in PORT's encoding. */
1516 void
1517 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1518 {
1519 scm_throw (scm_decoding_error_key,
1520 scm_list_n (scm_from_latin1_string (subr),
1521 scm_from_latin1_string (message),
1522 scm_from_int (err),
1523 port,
1524 SCM_UNDEFINED));
1525 }
1526
1527 \f
1528 /* String conversion to/from C. */
1529
1530 static void
1531 decoding_error (const char *func_name, int errno_save,
1532 const char *str, size_t len)
1533 {
1534 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1535 handler. */
1536 SCM bv;
1537 signed char *buf;
1538
1539 buf = scm_gc_malloc_pointerless (len, "bytevector");
1540 memcpy (buf, str, len);
1541 bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1542
1543 scm_decoding_error (func_name, errno_save,
1544 "input locale conversion error", bv);
1545 }
1546
1547 SCM
1548 scm_from_stringn (const char *str, size_t len, const char *encoding,
1549 scm_t_string_failed_conversion_handler handler)
1550 {
1551 size_t u32len, i;
1552 scm_t_wchar *u32;
1553 int wide = 0;
1554 SCM res;
1555
1556 /* The order of these checks is important. */
1557 if (!str && len != 0)
1558 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1559 if (len == (size_t) -1)
1560 len = strlen (str);
1561
1562 if (c_strcasecmp (encoding, "ISO-8859-1") == 0 || len == 0)
1563 return scm_from_latin1_stringn (str, len);
1564 else if (c_strcasecmp (encoding, "UTF-8") == 0
1565 && handler == SCM_FAILED_CONVERSION_ERROR)
1566 return scm_from_utf8_stringn (str, len);
1567
1568 u32len = 0;
1569 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1570 (enum iconv_ilseq_handler)
1571 handler,
1572 str, len,
1573 NULL,
1574 NULL, &u32len);
1575
1576 if (SCM_UNLIKELY (u32 == NULL))
1577 decoding_error (__func__, errno, str, len);
1578
1579 i = 0;
1580 while (i < u32len)
1581 if (u32[i++] > 0xFF)
1582 {
1583 wide = 1;
1584 break;
1585 }
1586
1587 if (!wide)
1588 {
1589 char *dst;
1590 res = scm_i_make_string (u32len, &dst, 0);
1591 for (i = 0; i < u32len; i ++)
1592 dst[i] = (unsigned char) u32[i];
1593 dst[u32len] = '\0';
1594 }
1595 else
1596 {
1597 scm_t_wchar *wdst;
1598 res = scm_i_make_wide_string (u32len, &wdst, 0);
1599 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1600 wdst[u32len] = 0;
1601 }
1602
1603 free (u32);
1604 return res;
1605 }
1606
1607 SCM
1608 scm_from_locale_string (const char *str)
1609 {
1610 return scm_from_locale_stringn (str, -1);
1611 }
1612
1613 SCM
1614 scm_from_locale_stringn (const char *str, size_t len)
1615 {
1616 return scm_from_stringn (str, len, locale_charset (),
1617 scm_i_default_port_conversion_handler ());
1618 }
1619
1620 SCM
1621 scm_from_latin1_string (const char *str)
1622 {
1623 return scm_from_latin1_stringn (str, -1);
1624 }
1625
1626 SCM
1627 scm_from_latin1_stringn (const char *str, size_t len)
1628 {
1629 char *buf;
1630 SCM result;
1631
1632 if (len == (size_t) -1)
1633 len = strlen (str);
1634
1635 /* Make a narrow string and copy STR as is. */
1636 result = scm_i_make_string (len, &buf, 0);
1637 memcpy (buf, str, len);
1638
1639 return result;
1640 }
1641
1642 SCM
1643 scm_from_utf8_string (const char *str)
1644 {
1645 return scm_from_utf8_stringn (str, -1);
1646 }
1647
1648 SCM
1649 scm_from_utf8_stringn (const char *str, size_t len)
1650 {
1651 size_t i, char_len;
1652 const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1653 int ascii = 1, narrow = 1;
1654 SCM res;
1655
1656 if (len == (size_t) -1)
1657 len = strlen (str);
1658
1659 i = 0;
1660 char_len = 0;
1661
1662 while (i < len)
1663 {
1664 if (ustr[i] <= 127)
1665 {
1666 char_len++;
1667 i++;
1668 }
1669 else
1670 {
1671 ucs4_t c;
1672 int nbytes;
1673
1674 ascii = 0;
1675
1676 nbytes = u8_mbtouc (&c, ustr + i, len - i);
1677
1678 if (c == 0xfffd)
1679 /* Bad UTF-8. */
1680 decoding_error (__func__, errno, str, len);
1681
1682 if (c > 255)
1683 narrow = 0;
1684
1685 char_len++;
1686 i += nbytes;
1687 }
1688 }
1689
1690 if (ascii)
1691 {
1692 char *dst;
1693 res = scm_i_make_string (char_len, &dst, 0);
1694 memcpy (dst, str, len);
1695 }
1696 else if (narrow)
1697 {
1698 char *dst;
1699 size_t j;
1700 ucs4_t c;
1701
1702 res = scm_i_make_string (char_len, &dst, 0);
1703
1704 for (i = 0, j = 0; i < len; j++)
1705 {
1706 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1707 dst[j] = (signed char) c;
1708 }
1709 }
1710 else
1711 {
1712 scm_t_wchar *dst;
1713 size_t j;
1714 ucs4_t c;
1715
1716 res = scm_i_make_wide_string (char_len, &dst, 0);
1717
1718 for (i = 0, j = 0; i < len; j++)
1719 {
1720 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1721 dst[j] = c;
1722 }
1723 }
1724
1725 return res;
1726 }
1727
1728 SCM
1729 scm_from_utf32_string (const scm_t_wchar *str)
1730 {
1731 return scm_from_utf32_stringn (str, -1);
1732 }
1733
1734 SCM
1735 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1736 {
1737 SCM result;
1738 scm_t_wchar *buf;
1739
1740 if (len == (size_t) -1)
1741 len = u32_strlen ((uint32_t *) str);
1742
1743 result = scm_i_make_wide_string (len, &buf, 0);
1744 memcpy (buf, str, len * sizeof (scm_t_wchar));
1745 scm_i_try_narrow_string (result);
1746
1747 return result;
1748 }
1749
1750 SCM
1751 scm_from_port_string (const char *str, SCM port)
1752 {
1753 return scm_from_port_stringn (str, -1, port);
1754 }
1755
1756 SCM
1757 scm_from_port_stringn (const char *str, size_t len, SCM port)
1758 {
1759 scm_t_port *pt = SCM_PTAB_ENTRY (port);
1760 scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
1761
1762 if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
1763 return scm_from_latin1_stringn (str, len);
1764 else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
1765 && (pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR
1766 || (u8_check ((uint8_t *) str, len) == NULL)))
1767 return scm_from_utf8_stringn (str, len);
1768 else
1769 return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
1770 }
1771
1772 /* Create a new scheme string from the C string STR. The memory of
1773 STR may be used directly as storage for the new string. */
1774 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1775 would be to register a finalizer to eventually free(3) STR, which isn't
1776 worth it. Should we just deprecate the `scm_take_' functions? */
1777 SCM
1778 scm_take_locale_stringn (char *str, size_t len)
1779 {
1780 SCM res;
1781
1782 res = scm_from_locale_stringn (str, len);
1783 free (str);
1784
1785 return res;
1786 }
1787
1788 SCM
1789 scm_take_locale_string (char *str)
1790 {
1791 return scm_take_locale_stringn (str, -1);
1792 }
1793
1794 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1795 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1796 Set *LENP to the size of the resulting string.
1797
1798 FIXME: This is a hack we should get rid of. See
1799 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1800 for details. */
1801 static void
1802 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1803 {
1804 char *before, *after;
1805 size_t i, j;
1806
1807 before = buf;
1808 after = buf;
1809 i = 0;
1810 j = 0;
1811 while (i < *lenp)
1812 {
1813 if ((i <= *lenp - 6)
1814 && before[i] == '\\'
1815 && before[i + 1] == 'u'
1816 && before[i + 2] == '0' && before[i + 3] == '0')
1817 {
1818 /* Convert \u00NN to \xNN */
1819 after[j] = '\\';
1820 after[j + 1] = 'x';
1821 after[j + 2] = tolower ((int) before[i + 4]);
1822 after[j + 3] = tolower ((int) before[i + 5]);
1823 i += 6;
1824 j += 4;
1825 }
1826 else if ((i <= *lenp - 10)
1827 && before[i] == '\\'
1828 && before[i + 1] == 'U'
1829 && before[i + 2] == '0' && before[i + 3] == '0')
1830 {
1831 /* Convert \U00NNNNNN to \UNNNNNN */
1832 after[j] = '\\';
1833 after[j + 1] = 'U';
1834 after[j + 2] = tolower ((int) before[i + 4]);
1835 after[j + 3] = tolower ((int) before[i + 5]);
1836 after[j + 4] = tolower ((int) before[i + 6]);
1837 after[j + 5] = tolower ((int) before[i + 7]);
1838 after[j + 6] = tolower ((int) before[i + 8]);
1839 after[j + 7] = tolower ((int) before[i + 9]);
1840 i += 10;
1841 j += 8;
1842 }
1843 else
1844 {
1845 after[j] = before[i];
1846 i++;
1847 j++;
1848 }
1849 }
1850 *lenp = j;
1851 }
1852
1853 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1854 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1855 of the resulting string. BUF must be large enough to handle the
1856 worst case when `\uXXXX' escapes (6 characters) are replaced by
1857 `\xXXXX;' (7 characters). */
1858 static void
1859 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1860 {
1861 char *before, *after;
1862 size_t i, j;
1863 /* The worst case is if the input string contains all 4-digit hex escapes.
1864 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1865 size_t max_out_len = (*lenp * 7) / 6 + 1;
1866 size_t nzeros, ndigits;
1867
1868 before = buf;
1869 after = alloca (max_out_len);
1870 i = 0;
1871 j = 0;
1872 while (i < *lenp)
1873 {
1874 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1875 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1876 {
1877 if (before[i + 1] == 'u')
1878 ndigits = 4;
1879 else if (before[i + 1] == 'U')
1880 ndigits = 8;
1881 else
1882 abort ();
1883
1884 /* Add the R6RS hex escape initial sequence. */
1885 after[j] = '\\';
1886 after[j + 1] = 'x';
1887
1888 /* Move string positions to the start of the hex numbers. */
1889 i += 2;
1890 j += 2;
1891
1892 /* Find the number of initial zeros in this hex number. */
1893 nzeros = 0;
1894 while (before[i + nzeros] == '0' && nzeros < ndigits)
1895 nzeros++;
1896
1897 /* Copy the number, skipping initial zeros, and then move the string
1898 positions. */
1899 if (nzeros == ndigits)
1900 {
1901 after[j] = '0';
1902 i += ndigits;
1903 j += 1;
1904 }
1905 else
1906 {
1907 int pos;
1908 for (pos = 0; pos < ndigits - nzeros; pos++)
1909 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1910 i += ndigits;
1911 j += (ndigits - nzeros);
1912 }
1913
1914 /* Add terminating semicolon. */
1915 after[j] = ';';
1916 j++;
1917 }
1918 else
1919 {
1920 after[j] = before[i];
1921 i++;
1922 j++;
1923 }
1924 }
1925 *lenp = j;
1926 memcpy (before, after, j);
1927 }
1928
1929 char *
1930 scm_to_locale_string (SCM str)
1931 {
1932 return scm_to_locale_stringn (str, NULL);
1933 }
1934
1935 char *
1936 scm_to_locale_stringn (SCM str, size_t *lenp)
1937 {
1938 return scm_to_stringn (str, lenp,
1939 locale_charset (),
1940 scm_i_default_port_conversion_handler ());
1941 }
1942
1943 char *
1944 scm_to_latin1_string (SCM str)
1945 {
1946 return scm_to_latin1_stringn (str, NULL);
1947 }
1948
1949 char *
1950 scm_to_latin1_stringn (SCM str, size_t *lenp)
1951 #define FUNC_NAME "scm_to_latin1_stringn"
1952 {
1953 char *result;
1954
1955 SCM_VALIDATE_STRING (1, str);
1956
1957 if (scm_i_is_narrow_string (str))
1958 {
1959 size_t len = scm_i_string_length (str);
1960
1961 if (lenp)
1962 *lenp = len;
1963
1964 result = scm_strndup (scm_i_string_data (str), len);
1965 }
1966 else
1967 result = scm_to_stringn (str, lenp, NULL,
1968 SCM_FAILED_CONVERSION_ERROR);
1969
1970 return result;
1971 }
1972 #undef FUNC_NAME
1973
1974 char *
1975 scm_to_utf8_string (SCM str)
1976 {
1977 return scm_to_utf8_stringn (str, NULL);
1978 }
1979
1980 static size_t
1981 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1982 {
1983 size_t ret, i;
1984 for (i = 0, ret = 0; i < len; i++)
1985 ret += (str[i] < 128) ? 1 : 2;
1986 return ret;
1987 }
1988
1989 static scm_t_uint8*
1990 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1991 scm_t_uint8 *u8_result, size_t *u8_lenp)
1992 {
1993 size_t i, n;
1994 size_t u8_len = latin1_u8_strlen (str, latin_len);
1995
1996 if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1997 u8_result = scm_malloc (u8_len + 1);
1998 if (u8_lenp)
1999 *u8_lenp = u8_len;
2000
2001 for (i = 0, n = 0; i < latin_len; i++)
2002 n += u8_uctomb (u8_result + n, str[i], u8_len - n);
2003 if (n != u8_len)
2004 abort ();
2005 u8_result[n] = 0;
2006
2007 return u8_result;
2008 }
2009
2010 /* UTF-8 code table
2011
2012 (Note that this includes code points that are not allowed by Unicode,
2013 but since this function has no way to report an error, and its
2014 purpose is to determine the size of destination buffers for
2015 libunicode conversion functions, we err on the safe side and handle
2016 everything that libunicode might conceivably handle, now or in the
2017 future.)
2018
2019 Char. number range | UTF-8 octet sequence
2020 (hexadecimal) | (binary)
2021 --------------------+------------------------------------------------------
2022 0000 0000-0000 007F | 0xxxxxxx
2023 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2024 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
2025 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2026 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2027 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2028 */
2029
2030 static size_t
2031 u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
2032 {
2033 size_t ret, i;
2034
2035 for (i = 0, ret = 0; i < len; i++)
2036 {
2037 scm_t_uint32 c = str[i];
2038
2039 if (c <= 0x7f)
2040 ret += 1;
2041 else if (c <= 0x7ff)
2042 ret += 2;
2043 else if (c <= 0xffff)
2044 ret += 3;
2045 else if (c <= 0x1fffff)
2046 ret += 4;
2047 else if (c <= 0x3ffffff)
2048 ret += 5;
2049 else
2050 ret += 6;
2051 }
2052
2053 return ret;
2054 }
2055
2056 char *
2057 scm_to_utf8_stringn (SCM str, size_t *lenp)
2058 #define FUNC_NAME "scm_to_utf8_stringn"
2059 {
2060 SCM_VALIDATE_STRING (1, str);
2061
2062 if (scm_i_is_narrow_string (str))
2063 return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
2064 scm_i_string_length (str),
2065 NULL, lenp);
2066 else
2067 {
2068 scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2069 scm_t_uint8 *buf, *ret;
2070 size_t num_chars = scm_i_string_length (str);
2071 size_t num_bytes_predicted, num_bytes_actual;
2072
2073 num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2074
2075 if (lenp)
2076 {
2077 *lenp = num_bytes_predicted;
2078 buf = scm_malloc (num_bytes_predicted);
2079 }
2080 else
2081 {
2082 buf = scm_malloc (num_bytes_predicted + 1);
2083 buf[num_bytes_predicted] = 0;
2084 }
2085
2086 num_bytes_actual = num_bytes_predicted;
2087 ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
2088
2089 if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
2090 return (char *) ret;
2091
2092 /* An error: a bad codepoint. */
2093 {
2094 int saved_errno = errno;
2095
2096 free (buf);
2097 if (!saved_errno)
2098 abort ();
2099
2100 scm_decoding_error ("scm_to_utf8_stringn", errno,
2101 "invalid codepoint in string", str);
2102
2103 /* Not reached. */
2104 return NULL;
2105 }
2106 }
2107 }
2108 #undef FUNC_NAME
2109
2110 scm_t_wchar *
2111 scm_to_utf32_string (SCM str)
2112 {
2113 return scm_to_utf32_stringn (str, NULL);
2114 }
2115
2116 scm_t_wchar *
2117 scm_to_utf32_stringn (SCM str, size_t *lenp)
2118 #define FUNC_NAME "scm_to_utf32_stringn"
2119 {
2120 scm_t_wchar *result;
2121
2122 SCM_VALIDATE_STRING (1, str);
2123
2124 if (scm_i_is_narrow_string (str))
2125 {
2126 scm_t_uint8 *codepoints;
2127 size_t i, len;
2128
2129 codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2130 len = scm_i_string_length (str);
2131 if (lenp)
2132 *lenp = len;
2133
2134 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2135 for (i = 0; i < len; i++)
2136 result[i] = codepoints[i];
2137 result[len] = 0;
2138 }
2139 else
2140 {
2141 size_t len;
2142
2143 len = scm_i_string_length (str);
2144 if (lenp)
2145 *lenp = len;
2146
2147 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2148 memcpy (result, scm_i_string_wide_chars (str),
2149 len * sizeof (scm_t_wchar));
2150 result[len] = 0;
2151 }
2152
2153 return result;
2154 }
2155 #undef FUNC_NAME
2156
2157 char *
2158 scm_to_port_string (SCM str, SCM port)
2159 {
2160 return scm_to_port_stringn (str, NULL, port);
2161 }
2162
2163 char *
2164 scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
2165 {
2166 scm_t_port *pt = SCM_PTAB_ENTRY (port);
2167 scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
2168
2169 if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
2170 && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
2171 return scm_to_latin1_stringn (str, lenp);
2172 else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
2173 return scm_to_utf8_stringn (str, lenp);
2174 else
2175 return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
2176 }
2177
2178 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2179 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2180 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2181 defined by HANDLER. */
2182 char *
2183 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2184 scm_t_string_failed_conversion_handler handler)
2185 {
2186 char *buf;
2187 size_t ilen, len, i;
2188 int ret;
2189 const char *enc;
2190
2191 if (!scm_is_string (str))
2192 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2193 ilen = scm_i_string_length (str);
2194
2195 if (ilen == 0)
2196 {
2197 buf = scm_malloc (1);
2198 buf[0] = '\0';
2199 if (lenp)
2200 *lenp = 0;
2201 return buf;
2202 }
2203
2204 if (lenp == NULL)
2205 for (i = 0; i < ilen; i++)
2206 if (scm_i_string_ref (str, i) == '\0')
2207 scm_misc_error (NULL,
2208 "string contains #\\nul character: ~S",
2209 scm_list_1 (str));
2210
2211 if (scm_i_is_narrow_string (str)
2212 && c_strcasecmp (encoding, "ISO-8859-1") == 0)
2213 {
2214 /* If using native Latin-1 encoding, just copy the string
2215 contents. */
2216 if (lenp)
2217 {
2218 buf = scm_malloc (ilen);
2219 memcpy (buf, scm_i_string_chars (str), ilen);
2220 *lenp = ilen;
2221 return buf;
2222 }
2223 else
2224 {
2225 buf = scm_malloc (ilen + 1);
2226 memcpy (buf, scm_i_string_chars (str), ilen);
2227 buf[ilen] = '\0';
2228 return buf;
2229 }
2230 }
2231
2232
2233 buf = NULL;
2234 len = 0;
2235 enc = encoding;
2236 if (enc == NULL)
2237 enc = "ISO-8859-1";
2238 if (scm_i_is_narrow_string (str))
2239 {
2240 ret = mem_iconveh (scm_i_string_chars (str), ilen,
2241 "ISO-8859-1", enc,
2242 (enum iconv_ilseq_handler) handler, NULL,
2243 &buf, &len);
2244
2245 if (ret != 0)
2246 scm_encoding_error (__func__, errno,
2247 "cannot convert narrow string to output locale",
2248 SCM_BOOL_F,
2249 /* FIXME: Faulty character unknown. */
2250 SCM_BOOL_F);
2251 }
2252 else
2253 {
2254 buf = u32_conv_to_encoding (enc,
2255 (enum iconv_ilseq_handler) handler,
2256 (scm_t_uint32 *) scm_i_string_wide_chars (str),
2257 ilen,
2258 NULL,
2259 NULL, &len);
2260 if (buf == NULL)
2261 scm_encoding_error (__func__, errno,
2262 "cannot convert wide string to output locale",
2263 SCM_BOOL_F,
2264 /* FIXME: Faulty character unknown. */
2265 SCM_BOOL_F);
2266 }
2267 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2268 {
2269 if (SCM_R6RS_ESCAPES_P)
2270 {
2271 /* The worst case is if the input string contains all 4-digit
2272 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2273 (seven characters). Make BUF large enough to hold
2274 that. */
2275 buf = scm_realloc (buf, (len * 7) / 6 + 1);
2276 unistring_escapes_to_r6rs_escapes (buf, &len);
2277 }
2278 else
2279 unistring_escapes_to_guile_escapes (buf, &len);
2280
2281 buf = scm_realloc (buf, len);
2282 }
2283 if (lenp)
2284 *lenp = len;
2285 else
2286 {
2287 buf = scm_realloc (buf, len + 1);
2288 buf[len] = '\0';
2289 }
2290
2291 scm_remember_upto_here_1 (str);
2292 return buf;
2293 }
2294
2295 size_t
2296 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2297 {
2298 size_t len;
2299 char *result = NULL;
2300 if (!scm_is_string (str))
2301 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2302 result = scm_to_locale_stringn (str, &len);
2303
2304 memcpy (buf, result, (len > max_len) ? max_len : len);
2305 free (result);
2306
2307 scm_remember_upto_here_1 (str);
2308 return len;
2309 }
2310
2311 \f
2312 /* Unicode string normalization. */
2313
2314 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2315 libguile/i18n.c. It would be useful to have this factored out into a more
2316 convenient location, but its use of alloca makes that tricky to do. */
2317
2318 static SCM
2319 normalize_str (SCM string, uninorm_t form)
2320 {
2321 SCM ret;
2322 scm_t_uint32 *w_str;
2323 scm_t_wchar *cbuf;
2324 size_t rlen, len = scm_i_string_length (string);
2325
2326 if (scm_i_is_narrow_string (string))
2327 {
2328 size_t i;
2329 const char *buf = scm_i_string_chars (string);
2330
2331 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2332
2333 for (i = 0; i < len; i ++)
2334 w_str[i] = (unsigned char) buf[i];
2335 w_str[len] = 0;
2336 }
2337 else
2338 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2339
2340 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2341
2342 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2343 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2344 free (w_str);
2345
2346 scm_i_try_narrow_string (ret);
2347
2348 return ret;
2349 }
2350
2351 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2352 (SCM string),
2353 "Returns the NFC normalized form of @var{string}.")
2354 #define FUNC_NAME s_scm_string_normalize_nfc
2355 {
2356 SCM_VALIDATE_STRING (1, string);
2357 return normalize_str (string, UNINORM_NFC);
2358 }
2359 #undef FUNC_NAME
2360
2361 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2362 (SCM string),
2363 "Returns the NFD normalized form of @var{string}.")
2364 #define FUNC_NAME s_scm_string_normalize_nfd
2365 {
2366 SCM_VALIDATE_STRING (1, string);
2367 return normalize_str (string, UNINORM_NFD);
2368 }
2369 #undef FUNC_NAME
2370
2371 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2372 (SCM string),
2373 "Returns the NFKC normalized form of @var{string}.")
2374 #define FUNC_NAME s_scm_string_normalize_nfkc
2375 {
2376 SCM_VALIDATE_STRING (1, string);
2377 return normalize_str (string, UNINORM_NFKC);
2378 }
2379 #undef FUNC_NAME
2380
2381 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2382 (SCM string),
2383 "Returns the NFKD normalized form of @var{string}.")
2384 #define FUNC_NAME s_scm_string_normalize_nfkd
2385 {
2386 SCM_VALIDATE_STRING (1, string);
2387 return normalize_str (string, UNINORM_NFKD);
2388 }
2389 #undef FUNC_NAME
2390
2391 /* converts C scm_array of strings to SCM scm_list of strings.
2392 If argc < 0, a null terminated scm_array is assumed.
2393 The current locale encoding is assumed */
2394 SCM
2395 scm_makfromstrs (int argc, char **argv)
2396 {
2397 int i = argc;
2398 SCM lst = SCM_EOL;
2399 if (0 > i)
2400 for (i = 0; argv[i]; i++);
2401 while (i--)
2402 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2403 return lst;
2404 }
2405
2406 /* Return a newly allocated array of char pointers to each of the strings
2407 in args, with a terminating NULL pointer. The strings are encoded using
2408 the current locale. */
2409
2410 char **
2411 scm_i_allocate_string_pointers (SCM list)
2412 #define FUNC_NAME "scm_i_allocate_string_pointers"
2413 {
2414 char **result;
2415 int list_len = scm_ilength (list);
2416 int i;
2417
2418 if (list_len < 0)
2419 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2420
2421 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2422 "string pointers");
2423 result[list_len] = NULL;
2424
2425 /* The list might have been modified in another thread, so
2426 we check LIST before each access.
2427 */
2428 for (i = 0; i < list_len && scm_is_pair (list); i++)
2429 {
2430 SCM str = SCM_CAR (list);
2431 size_t len; /* String length in bytes */
2432 char *c_str = scm_to_locale_stringn (str, &len);
2433
2434 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2435 scm_malloc to allocate the returned string, which must be
2436 explicitly deallocated. This forces us to copy the string a
2437 second time into a new buffer. Ideally there would be variants
2438 of scm_to_*_stringn that can return garbage-collected buffers. */
2439
2440 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2441 memcpy (result[i], c_str, len);
2442 result[i][len] = '\0';
2443 free (c_str);
2444
2445 list = SCM_CDR (list);
2446 }
2447
2448 return result;
2449 }
2450 #undef FUNC_NAME
2451
2452 void
2453 scm_i_get_substring_spec (size_t len,
2454 SCM start, size_t *cstart,
2455 SCM end, size_t *cend)
2456 {
2457 if (SCM_UNBNDP (start))
2458 *cstart = 0;
2459 else
2460 *cstart = scm_to_unsigned_integer (start, 0, len);
2461
2462 if (SCM_UNBNDP (end))
2463 *cend = len;
2464 else
2465 *cend = scm_to_unsigned_integer (end, *cstart, len);
2466 }
2467
2468 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2469
2470 void
2471 scm_init_strings ()
2472 {
2473 scm_nullstr = scm_i_make_string (0, NULL, 0);
2474
2475 #include "libguile/strings.x"
2476 }
2477
2478
2479 /*
2480 Local Variables:
2481 c-file-style: "gnu"
2482 End:
2483 */