Merge remote-tracking branch 'origin/stable-2.0'
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <alloca.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <uninorm.h>
30 #include <unistr.h>
31 #include <uniconv.h>
32
33 #include "striconveh.h"
34
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
44
45 \f
46
47 /* {Strings}
48 */
49
50
51 /* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
68 *
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
72 */
73
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
76
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
81
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
85
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
91
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
93
94 #define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
103
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist[1001];
106 #endif
107
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
110 static SCM
111 make_stringbuf (size_t len)
112 {
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
120 SCM buf;
121
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127 #endif
128
129 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
138 }
139
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
142 static SCM
143 make_wide_stringbuf (size_t len)
144 {
145 SCM buf;
146 size_t raw_len;
147
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153 #endif
154
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
165 }
166
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169 static SCM
170 wide_stringbuf (SCM buf)
171 {
172 SCM new_buf;
173
174 if (STRINGBUF_WIDE (buf))
175 new_buf = buf;
176 else
177 {
178 size_t i, len;
179 scm_t_wchar *mem;
180
181 len = STRINGBUF_LENGTH (buf);
182
183 new_buf = make_wide_stringbuf (len);
184
185 mem = STRINGBUF_WIDE_CHARS (new_buf);
186 for (i = 0; i < len; i++)
187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
188 mem[len] = 0;
189 }
190
191 return new_buf;
192 }
193
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196 static SCM
197 narrow_stringbuf (SCM buf)
198 {
199 SCM new_buf;
200
201 if (!STRINGBUF_WIDE (buf))
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
208
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
211
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
216
217 new_buf = make_stringbuf (len);
218
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
226 }
227
228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
229
230 \f
231 /* Copy-on-write strings.
232 */
233
234 #define STRING_TAG scm_tc7_string
235
236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
239
240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
244
245 /* Read-only strings.
246 */
247
248 #define RO_STRING_TAG scm_tc7_ro_string
249 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
251 /* Mutation-sharing substrings
252 */
253
254 #define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257 /* START and LENGTH as for STRINGs. */
258
259 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
261 SCM scm_nullstr;
262
263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
267 SCM
268 scm_i_make_string (size_t len, char **charsp, int read_only_p)
269 {
270 SCM buf = make_stringbuf (len);
271 SCM res;
272 if (charsp)
273 *charsp = (char *) STRINGBUF_CHARS (buf);
274 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
275 SCM_UNPACK (buf),
276 (scm_t_bits) 0, (scm_t_bits) len);
277 return res;
278 }
279
280 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
281 characters. CHARSP, if not NULL, will be set to location of the
282 character array. If READ_ONLY_P, the returned string is read-only;
283 otherwise it is writable. */
284 SCM
285 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
286 {
287 SCM buf = make_wide_stringbuf (len);
288 SCM res;
289 if (charsp)
290 *charsp = STRINGBUF_WIDE_CHARS (buf);
291 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
292 SCM_UNPACK (buf),
293 (scm_t_bits) 0, (scm_t_bits) len);
294 return res;
295 }
296
297 static void
298 validate_substring_args (SCM str, size_t start, size_t end)
299 {
300 if (!IS_STRING (str))
301 scm_wrong_type_arg_msg (NULL, 0, str, "string");
302 if (start > STRING_LENGTH (str))
303 scm_out_of_range (NULL, scm_from_size_t (start));
304 if (end > STRING_LENGTH (str) || end < start)
305 scm_out_of_range (NULL, scm_from_size_t (end));
306 }
307
308 static inline void
309 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
310 {
311 *start = STRING_START (*str);
312 if (IS_SH_STRING (*str))
313 {
314 *str = SH_STRING_STRING (*str);
315 *start += STRING_START (*str);
316 }
317 *buf = STRING_STRINGBUF (*str);
318 }
319
320 SCM
321 scm_i_substring (SCM str, size_t start, size_t end)
322 {
323 SCM buf;
324 size_t str_start;
325 get_str_buf_start (&str, &buf, &str_start);
326 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
327 SET_STRINGBUF_SHARED (buf);
328 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
329 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
330 (scm_t_bits)str_start + start,
331 (scm_t_bits) end - start);
332 }
333
334 SCM
335 scm_i_substring_read_only (SCM str, size_t start, size_t end)
336 {
337 SCM buf;
338 size_t str_start;
339 get_str_buf_start (&str, &buf, &str_start);
340 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
341 SET_STRINGBUF_SHARED (buf);
342 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
343 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
344 (scm_t_bits)str_start + start,
345 (scm_t_bits) end - start);
346 }
347
348 SCM
349 scm_i_substring_copy (SCM str, size_t start, size_t end)
350 {
351 size_t len = end - start;
352 SCM buf, my_buf;
353 size_t str_start;
354 get_str_buf_start (&str, &buf, &str_start);
355 if (scm_i_is_narrow_string (str))
356 {
357 my_buf = make_stringbuf (len);
358 memcpy (STRINGBUF_CHARS (my_buf),
359 STRINGBUF_CHARS (buf) + str_start + start, len);
360 }
361 else
362 {
363 my_buf = make_wide_stringbuf (len);
364 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
365 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
366 + start), len);
367 /* Even though this string is wide, the substring may be narrow.
368 Consider adding code to narrow the string. */
369 }
370 scm_remember_upto_here_1 (buf);
371 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
372 (scm_t_bits) 0, (scm_t_bits) len);
373 }
374
375 SCM
376 scm_i_substring_shared (SCM str, size_t start, size_t end)
377 {
378 if (start == 0 && end == STRING_LENGTH (str))
379 return str;
380 else
381 {
382 size_t len = end - start;
383 if (IS_SH_STRING (str))
384 {
385 start += STRING_START (str);
386 str = SH_STRING_STRING (str);
387 }
388 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
389 (scm_t_bits)start, (scm_t_bits) len);
390 }
391 }
392
393 SCM
394 scm_c_substring (SCM str, size_t start, size_t end)
395 {
396 validate_substring_args (str, start, end);
397 return scm_i_substring (str, start, end);
398 }
399
400 SCM
401 scm_c_substring_read_only (SCM str, size_t start, size_t end)
402 {
403 validate_substring_args (str, start, end);
404 return scm_i_substring_read_only (str, start, end);
405 }
406
407 SCM
408 scm_c_substring_copy (SCM str, size_t start, size_t end)
409 {
410 validate_substring_args (str, start, end);
411 return scm_i_substring_copy (str, start, end);
412 }
413
414 SCM
415 scm_c_substring_shared (SCM str, size_t start, size_t end)
416 {
417 validate_substring_args (str, start, end);
418 return scm_i_substring_shared (str, start, end);
419 }
420
421 \f
422 /* Internal accessors
423 */
424
425 /* Returns the number of characters in STR. This may be different
426 than the memory size of the string storage. */
427 size_t
428 scm_i_string_length (SCM str)
429 {
430 return STRING_LENGTH (str);
431 }
432
433 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
434 encoding. False if it is 'wide', having a 32-bit UCS-4
435 encoding. */
436 int
437 scm_i_is_narrow_string (SCM str)
438 {
439 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
440 }
441
442 /* Try to coerce a string to be narrow. It if is narrow already, do
443 nothing. If it is wide, shrink it to narrow if none of its
444 characters are above 0xFF. Return true if the string is narrow or
445 was made to be narrow. */
446 int
447 scm_i_try_narrow_string (SCM str)
448 {
449 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
450
451 return scm_i_is_narrow_string (str);
452 }
453
454 /* Return a pointer to the raw data of the string, which can be either Latin-1
455 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
456 const void *
457 scm_i_string_data (SCM str)
458 {
459 SCM buf;
460 size_t start;
461 const char *data;
462
463 get_str_buf_start (&str, &buf, &start);
464
465 data = STRINGBUF_CONTENTS (buf);
466 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
467
468 return data;
469 }
470
471 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
472 STR. */
473 const char *
474 scm_i_string_chars (SCM str)
475 {
476 SCM buf;
477 size_t start;
478 get_str_buf_start (&str, &buf, &start);
479 if (scm_i_is_narrow_string (str))
480 return (const char *) STRINGBUF_CHARS (buf) + start;
481 else
482 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
483 scm_list_1 (str));
484 return NULL;
485 }
486
487 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
488 STR. */
489 const scm_t_wchar *
490 scm_i_string_wide_chars (SCM str)
491 {
492 SCM buf;
493 size_t start;
494
495 get_str_buf_start (&str, &buf, &start);
496 if (!scm_i_is_narrow_string (str))
497 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
498 else
499 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
500 scm_list_1 (str));
501 }
502
503 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
504 a new string buffer, so that it can be modified without modifying
505 other strings. Also, lock the string mutex. Later, one must call
506 scm_i_string_stop_writing to unlock the mutex. */
507 SCM
508 scm_i_string_start_writing (SCM orig_str)
509 {
510 SCM buf, str = orig_str;
511 size_t start;
512
513 get_str_buf_start (&str, &buf, &start);
514 if (IS_RO_STRING (str))
515 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
516
517 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
518 if (STRINGBUF_SHARED (buf))
519 {
520 /* Clone the stringbuf. */
521 size_t len = STRING_LENGTH (str);
522 SCM new_buf;
523
524 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
525
526 if (scm_i_is_narrow_string (str))
527 {
528 new_buf = make_stringbuf (len);
529 memcpy (STRINGBUF_CHARS (new_buf),
530 STRINGBUF_CHARS (buf) + STRING_START (str), len);
531
532 }
533 else
534 {
535 new_buf = make_wide_stringbuf (len);
536 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
537 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
538 + STRING_START (str)), len);
539 }
540
541 SET_STRING_STRINGBUF (str, new_buf);
542 start -= STRING_START (str);
543
544 /* FIXME: The following operations are not atomic, so other threads
545 looking at STR may see an inconsistent state. Nevertheless it can't
546 hurt much since (i) accessing STR while it is being mutated can't
547 yield a crash, and (ii) concurrent accesses to STR should be
548 protected by a mutex at the application level. The latter may not
549 apply when STR != ORIG_STR, though. */
550 SET_STRING_START (str, 0);
551 SET_STRING_STRINGBUF (str, new_buf);
552
553 buf = new_buf;
554
555 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
556 }
557 return orig_str;
558 }
559
560 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
561 char *
562 scm_i_string_writable_chars (SCM str)
563 {
564 SCM buf;
565 size_t start;
566
567 get_str_buf_start (&str, &buf, &start);
568 if (scm_i_is_narrow_string (str))
569 return (char *) STRINGBUF_CHARS (buf) + start;
570 else
571 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
572 scm_list_1 (str));
573 return NULL;
574 }
575
576 /* Return a pointer to the UCS-4 codepoints of a string. */
577 static scm_t_wchar *
578 scm_i_string_writable_wide_chars (SCM str)
579 {
580 SCM buf;
581 size_t start;
582
583 get_str_buf_start (&str, &buf, &start);
584 if (!scm_i_is_narrow_string (str))
585 return STRINGBUF_WIDE_CHARS (buf) + start;
586 else
587 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
588 scm_list_1 (str));
589 }
590
591 /* Unlock the string mutex that was locked when
592 scm_i_string_start_writing was called. */
593 void
594 scm_i_string_stop_writing (void)
595 {
596 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
597 }
598
599 /* Return the Xth character of STR as a UCS-4 codepoint. */
600 scm_t_wchar
601 scm_i_string_ref (SCM str, size_t x)
602 {
603 if (scm_i_is_narrow_string (str))
604 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
605 else
606 return scm_i_string_wide_chars (str)[x];
607 }
608
609 /* Returns index+1 of the first char in STR that matches C, or
610 0 if the char is not found. */
611 int
612 scm_i_string_contains_char (SCM str, char ch)
613 {
614 size_t i;
615 size_t len = scm_i_string_length (str);
616
617 i = 0;
618 if (scm_i_is_narrow_string (str))
619 {
620 while (i < len)
621 {
622 if (scm_i_string_chars (str)[i] == ch)
623 return i+1;
624 i++;
625 }
626 }
627 else
628 {
629 while (i < len)
630 {
631 if (scm_i_string_wide_chars (str)[i]
632 == (unsigned char) ch)
633 return i+1;
634 i++;
635 }
636 }
637 return 0;
638 }
639
640 int
641 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
642 {
643 if (scm_i_is_narrow_string (sstr))
644 {
645 const char *a = scm_i_string_chars (sstr) + start_x;
646 const char *b = cstr;
647 return strncmp (a, b, strlen(b));
648 }
649 else
650 {
651 size_t i;
652 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
653 const char *b = cstr;
654 for (i = 0; i < strlen (b); i++)
655 {
656 if (a[i] != (unsigned char) b[i])
657 return 1;
658 }
659 }
660 return 0;
661 }
662
663 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
664 void
665 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
666 {
667 if (chr > 0xFF && scm_i_is_narrow_string (str))
668 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
669
670 if (scm_i_is_narrow_string (str))
671 {
672 char *dst = scm_i_string_writable_chars (str);
673 dst[p] = chr;
674 }
675 else
676 {
677 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
678 dst[p] = chr;
679 }
680 }
681
682 \f
683 /* Symbols.
684
685 Basic symbol creation and accessing is done here, the rest is in
686 symbols.[hc]. This has been done to keep stringbufs and the
687 internals of strings and string-like objects confined to this file.
688 */
689
690 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
691
692 SCM
693 scm_i_make_symbol (SCM name, scm_t_bits flags,
694 unsigned long hash, SCM props)
695 {
696 SCM buf;
697 size_t start = STRING_START (name);
698 size_t length = STRING_LENGTH (name);
699
700 if (IS_SH_STRING (name))
701 {
702 name = SH_STRING_STRING (name);
703 start += STRING_START (name);
704 }
705 buf = SYMBOL_STRINGBUF (name);
706
707 if (start == 0 && length == STRINGBUF_LENGTH (buf))
708 {
709 /* reuse buf. */
710 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
711 SET_STRINGBUF_SHARED (buf);
712 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
713 }
714 else
715 {
716 /* make new buf. */
717 if (scm_i_is_narrow_string (name))
718 {
719 SCM new_buf = make_stringbuf (length);
720 memcpy (STRINGBUF_CHARS (new_buf),
721 STRINGBUF_CHARS (buf) + start, length);
722 buf = new_buf;
723 }
724 else
725 {
726 SCM new_buf = make_wide_stringbuf (length);
727 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
728 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
729 length);
730 buf = new_buf;
731 }
732 }
733 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
734 (scm_t_bits) hash, SCM_UNPACK (props));
735 }
736
737 SCM
738 scm_i_c_make_symbol (const char *name, size_t len,
739 scm_t_bits flags, unsigned long hash, SCM props)
740 {
741 SCM buf = make_stringbuf (len);
742 memcpy (STRINGBUF_CHARS (buf), name, len);
743
744 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
745 (scm_t_bits) hash, SCM_UNPACK (props));
746 }
747
748 /* Returns the number of characters in SYM. This may be different
749 from the memory size of SYM. */
750 size_t
751 scm_i_symbol_length (SCM sym)
752 {
753 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
754 }
755
756 size_t
757 scm_c_symbol_length (SCM sym)
758 #define FUNC_NAME "scm_c_symbol_length"
759 {
760 SCM_VALIDATE_SYMBOL (1, sym);
761
762 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
763 }
764 #undef FUNC_NAME
765
766 /* True if the name of SYM is stored as a Latin-1 encoded string.
767 False if it is stored as a 32-bit UCS-4-encoded string. */
768 int
769 scm_i_is_narrow_symbol (SCM sym)
770 {
771 SCM buf;
772
773 buf = SYMBOL_STRINGBUF (sym);
774 return !STRINGBUF_WIDE (buf);
775 }
776
777 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
778 contains the name of SYM. */
779 const char *
780 scm_i_symbol_chars (SCM sym)
781 {
782 SCM buf;
783
784 buf = SYMBOL_STRINGBUF (sym);
785 if (!STRINGBUF_WIDE (buf))
786 return (const char *) STRINGBUF_CHARS (buf);
787 else
788 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
789 scm_list_1 (sym));
790 }
791
792 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
793 symbol's name. */
794 const scm_t_wchar *
795 scm_i_symbol_wide_chars (SCM sym)
796 {
797 SCM buf;
798
799 buf = SYMBOL_STRINGBUF (sym);
800 if (STRINGBUF_WIDE (buf))
801 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
802 else
803 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
804 scm_list_1 (sym));
805 }
806
807 SCM
808 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
809 {
810 SCM buf = SYMBOL_STRINGBUF (sym);
811 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
812 SET_STRINGBUF_SHARED (buf);
813 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
814 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
815 (scm_t_bits)start, (scm_t_bits) end - start);
816 }
817
818 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
819 scm_t_wchar
820 scm_i_symbol_ref (SCM sym, size_t x)
821 {
822 if (scm_i_is_narrow_symbol (sym))
823 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
824 else
825 return scm_i_symbol_wide_chars (sym)[x];
826 }
827
828 /* Debugging
829 */
830
831 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
832 "Returns an association list containing debugging information\n"
833 "for @var{str}. The association list has the following entries."
834 "@table @code\n"
835 "@item string\n"
836 "The string itself.\n"
837 "@item start\n"
838 "The start index of the string into its stringbuf\n"
839 "@item length\n"
840 "The length of the string\n"
841 "@item shared\n"
842 "If this string is a substring, it returns its parent string.\n"
843 "Otherwise, it returns @code{#f}\n"
844 "@item read-only\n"
845 "@code{#t} if the string is read-only\n"
846 "@item stringbuf-chars\n"
847 "A new string containing this string's stringbuf's characters\n"
848 "@item stringbuf-length\n"
849 "The number of characters in this stringbuf\n"
850 "@item stringbuf-shared\n"
851 "@code{#t} if this stringbuf is shared\n"
852 "@item stringbuf-wide\n"
853 "@code{#t} if this stringbuf's characters are stored in a\n"
854 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
855 "buffer\n"
856 "@end table")
857 #define FUNC_NAME s_scm_sys_string_dump
858 {
859 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
860 SCM buf;
861 SCM_VALIDATE_STRING (1, str);
862
863 /* String info */
864 e1 = scm_cons (scm_from_latin1_symbol ("string"),
865 str);
866 e2 = scm_cons (scm_from_latin1_symbol ("start"),
867 scm_from_size_t (STRING_START (str)));
868 e3 = scm_cons (scm_from_latin1_symbol ("length"),
869 scm_from_size_t (STRING_LENGTH (str)));
870
871 if (IS_SH_STRING (str))
872 {
873 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
874 SH_STRING_STRING (str));
875 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
876 }
877 else
878 {
879 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
880 SCM_BOOL_F);
881 buf = STRING_STRINGBUF (str);
882 }
883
884 if (IS_RO_STRING (str))
885 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
886 SCM_BOOL_T);
887 else
888 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
889 SCM_BOOL_F);
890
891 /* Stringbuf info */
892 if (!STRINGBUF_WIDE (buf))
893 {
894 size_t len = STRINGBUF_LENGTH (buf);
895 char *cbuf;
896 SCM sbc = scm_i_make_string (len, &cbuf, 0);
897 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
898 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
899 sbc);
900 }
901 else
902 {
903 size_t len = STRINGBUF_LENGTH (buf);
904 scm_t_wchar *cbuf;
905 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
906 u32_cpy ((scm_t_uint32 *) cbuf,
907 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
908 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
909 sbc);
910 }
911 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
912 scm_from_size_t (STRINGBUF_LENGTH (buf)));
913 if (STRINGBUF_SHARED (buf))
914 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
915 SCM_BOOL_T);
916 else
917 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
918 SCM_BOOL_F);
919 if (STRINGBUF_WIDE (buf))
920 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
921 SCM_BOOL_T);
922 else
923 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
924 SCM_BOOL_F);
925
926 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
927 }
928 #undef FUNC_NAME
929
930 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
931 "Returns an association list containing debugging information\n"
932 "for @var{sym}. The association list has the following entries."
933 "@table @code\n"
934 "@item symbol\n"
935 "The symbol itself\n"
936 "@item hash\n"
937 "Its hash value\n"
938 "@item interned\n"
939 "@code{#t} if it is an interned symbol\n"
940 "@item stringbuf-chars\n"
941 "A new string containing this symbols's stringbuf's characters\n"
942 "@item stringbuf-length\n"
943 "The number of characters in this stringbuf\n"
944 "@item stringbuf-shared\n"
945 "@code{#t} if this stringbuf is shared\n"
946 "@item stringbuf-wide\n"
947 "@code{#t} if this stringbuf's characters are stored in a\n"
948 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
949 "buffer\n"
950 "@end table")
951 #define FUNC_NAME s_scm_sys_symbol_dump
952 {
953 SCM e1, e2, e3, e4, e5, e6, e7;
954 SCM buf;
955 SCM_VALIDATE_SYMBOL (1, sym);
956 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
957 sym);
958 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
959 scm_from_ulong (scm_i_symbol_hash (sym)));
960 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
961 scm_symbol_interned_p (sym));
962 buf = SYMBOL_STRINGBUF (sym);
963
964 /* Stringbuf info */
965 if (!STRINGBUF_WIDE (buf))
966 {
967 size_t len = STRINGBUF_LENGTH (buf);
968 char *cbuf;
969 SCM sbc = scm_i_make_string (len, &cbuf, 0);
970 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
971 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
972 sbc);
973 }
974 else
975 {
976 size_t len = STRINGBUF_LENGTH (buf);
977 scm_t_wchar *cbuf;
978 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
979 u32_cpy ((scm_t_uint32 *) cbuf,
980 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
981 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
982 sbc);
983 }
984 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
985 scm_from_size_t (STRINGBUF_LENGTH (buf)));
986 if (STRINGBUF_SHARED (buf))
987 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
988 SCM_BOOL_T);
989 else
990 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
991 SCM_BOOL_F);
992 if (STRINGBUF_WIDE (buf))
993 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
994 SCM_BOOL_T);
995 else
996 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
997 SCM_BOOL_F);
998 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
999
1000 }
1001 #undef FUNC_NAME
1002
1003 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1004
1005 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1006 #define FUNC_NAME s_scm_sys_stringbuf_hist
1007 {
1008 int i;
1009 for (i = 0; i < 1000; i++)
1010 if (lenhist[i])
1011 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1012 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1013 return SCM_UNSPECIFIED;
1014 }
1015 #undef FUNC_NAME
1016
1017 #endif
1018
1019 \f
1020
1021 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1022 (SCM obj),
1023 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1024 #define FUNC_NAME s_scm_string_p
1025 {
1026 return scm_from_bool (IS_STRING (obj));
1027 }
1028 #undef FUNC_NAME
1029
1030
1031 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1032
1033 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1034 (SCM chrs),
1035 "@deffnx {Scheme Procedure} list->string chrs\n"
1036 "Return a newly allocated string composed of the arguments,\n"
1037 "@var{chrs}.")
1038 #define FUNC_NAME s_scm_string
1039 {
1040 SCM result = SCM_BOOL_F;
1041 SCM rest;
1042 size_t len;
1043 size_t p = 0;
1044 long i;
1045 int wide = 0;
1046
1047 /* Verify that this is a list of chars. */
1048 i = scm_ilength (chrs);
1049 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1050
1051 len = (size_t) i;
1052 rest = chrs;
1053
1054 while (len > 0 && scm_is_pair (rest))
1055 {
1056 SCM elt = SCM_CAR (rest);
1057 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1058 if (SCM_CHAR (elt) > 0xFF)
1059 wide = 1;
1060 rest = SCM_CDR (rest);
1061 len--;
1062 scm_remember_upto_here_1 (elt);
1063 }
1064
1065 /* Construct a string containing this list of chars. */
1066 len = (size_t) i;
1067 rest = chrs;
1068
1069 if (wide == 0)
1070 {
1071 char *buf;
1072
1073 result = scm_i_make_string (len, NULL, 0);
1074 result = scm_i_string_start_writing (result);
1075 buf = scm_i_string_writable_chars (result);
1076 while (len > 0 && scm_is_pair (rest))
1077 {
1078 SCM elt = SCM_CAR (rest);
1079 buf[p] = (unsigned char) SCM_CHAR (elt);
1080 p++;
1081 rest = SCM_CDR (rest);
1082 len--;
1083 scm_remember_upto_here_1 (elt);
1084 }
1085 }
1086 else
1087 {
1088 scm_t_wchar *buf;
1089
1090 result = scm_i_make_wide_string (len, NULL, 0);
1091 result = scm_i_string_start_writing (result);
1092 buf = scm_i_string_writable_wide_chars (result);
1093 while (len > 0 && scm_is_pair (rest))
1094 {
1095 SCM elt = SCM_CAR (rest);
1096 buf[p] = SCM_CHAR (elt);
1097 p++;
1098 rest = SCM_CDR (rest);
1099 len--;
1100 scm_remember_upto_here_1 (elt);
1101 }
1102 }
1103 scm_i_string_stop_writing ();
1104
1105 if (len > 0)
1106 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1107 if (!scm_is_null (rest))
1108 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1109
1110 return result;
1111 }
1112 #undef FUNC_NAME
1113
1114 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1115 (SCM k, SCM chr),
1116 "Return a newly allocated string of\n"
1117 "length @var{k}. If @var{chr} is given, then all elements of\n"
1118 "the string are initialized to @var{chr}, otherwise the contents\n"
1119 "of the @var{string} are all set to @var{#\nul}.")
1120 #define FUNC_NAME s_scm_make_string
1121 {
1122 return scm_c_make_string (scm_to_size_t (k), chr);
1123 }
1124 #undef FUNC_NAME
1125
1126 SCM
1127 scm_c_make_string (size_t len, SCM chr)
1128 #define FUNC_NAME NULL
1129 {
1130 size_t p;
1131 char *contents = NULL;
1132 SCM res = scm_i_make_string (len, &contents, 0);
1133
1134 /* If no char is given, initialize string contents to NULL. */
1135 if (SCM_UNBNDP (chr))
1136 memset (contents, 0, len);
1137 else
1138 {
1139 SCM_VALIDATE_CHAR (0, chr);
1140 res = scm_i_string_start_writing (res);
1141 for (p = 0; p < len; p++)
1142 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1143 scm_i_string_stop_writing ();
1144 }
1145
1146 return res;
1147 }
1148 #undef FUNC_NAME
1149
1150 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1151 (SCM string),
1152 "Return the number of characters in @var{string}.")
1153 #define FUNC_NAME s_scm_string_length
1154 {
1155 SCM_VALIDATE_STRING (1, string);
1156 return scm_from_size_t (STRING_LENGTH (string));
1157 }
1158 #undef FUNC_NAME
1159
1160 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1161 (SCM string),
1162 "Return the bytes used to represent a character in @var{string}."
1163 "This will return 1 or 4.")
1164 #define FUNC_NAME s_scm_string_bytes_per_char
1165 {
1166 SCM_VALIDATE_STRING (1, string);
1167 if (!scm_i_is_narrow_string (string))
1168 return scm_from_int (4);
1169
1170 return scm_from_int (1);
1171 }
1172 #undef FUNC_NAME
1173
1174 size_t
1175 scm_c_string_length (SCM string)
1176 {
1177 if (!IS_STRING (string))
1178 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1179 return STRING_LENGTH (string);
1180 }
1181
1182 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1183 (SCM str, SCM k),
1184 "Return character @var{k} of @var{str} using zero-origin\n"
1185 "indexing. @var{k} must be a valid index of @var{str}.")
1186 #define FUNC_NAME s_scm_string_ref
1187 {
1188 size_t len;
1189 unsigned long idx;
1190
1191 SCM_VALIDATE_STRING (1, str);
1192
1193 len = scm_i_string_length (str);
1194 if (SCM_LIKELY (len > 0))
1195 idx = scm_to_unsigned_integer (k, 0, len - 1);
1196 else
1197 scm_out_of_range (NULL, k);
1198
1199 if (scm_i_is_narrow_string (str))
1200 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1201 else
1202 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1203 }
1204 #undef FUNC_NAME
1205
1206 SCM
1207 scm_c_string_ref (SCM str, size_t p)
1208 {
1209 if (p >= scm_i_string_length (str))
1210 scm_out_of_range (NULL, scm_from_size_t (p));
1211 if (scm_i_is_narrow_string (str))
1212 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1213 else
1214 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1215
1216 }
1217
1218 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1219 (SCM str, SCM k, SCM chr),
1220 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1221 "an unspecified value. @var{k} must be a valid index of\n"
1222 "@var{str}.")
1223 #define FUNC_NAME s_scm_string_set_x
1224 {
1225 size_t len;
1226 unsigned long idx;
1227
1228 SCM_VALIDATE_STRING (1, str);
1229
1230 len = scm_i_string_length (str);
1231 if (SCM_LIKELY (len > 0))
1232 idx = scm_to_unsigned_integer (k, 0, len - 1);
1233 else
1234 scm_out_of_range (NULL, k);
1235
1236 SCM_VALIDATE_CHAR (3, chr);
1237 str = scm_i_string_start_writing (str);
1238 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1239 scm_i_string_stop_writing ();
1240
1241 return SCM_UNSPECIFIED;
1242 }
1243 #undef FUNC_NAME
1244
1245 void
1246 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1247 {
1248 if (p >= scm_i_string_length (str))
1249 scm_out_of_range (NULL, scm_from_size_t (p));
1250 str = scm_i_string_start_writing (str);
1251 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1252 scm_i_string_stop_writing ();
1253 }
1254
1255 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1256 (SCM str, SCM start, SCM end),
1257 "Return a newly allocated string formed from the characters\n"
1258 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1259 "ending with index @var{end} (exclusive).\n"
1260 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1261 "exact integers satisfying:\n\n"
1262 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1263 #define FUNC_NAME s_scm_substring
1264 {
1265 size_t len, from, to;
1266
1267 SCM_VALIDATE_STRING (1, str);
1268 len = scm_i_string_length (str);
1269 from = scm_to_unsigned_integer (start, 0, len);
1270 if (SCM_UNBNDP (end))
1271 to = len;
1272 else
1273 to = scm_to_unsigned_integer (end, from, len);
1274 return scm_i_substring (str, from, to);
1275 }
1276 #undef FUNC_NAME
1277
1278 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1279 (SCM str, SCM start, SCM end),
1280 "Return a newly allocated string formed from the characters\n"
1281 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1282 "ending with index @var{end} (exclusive).\n"
1283 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1284 "exact integers satisfying:\n"
1285 "\n"
1286 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1287 "\n"
1288 "The returned string is read-only.\n")
1289 #define FUNC_NAME s_scm_substring_read_only
1290 {
1291 size_t len, from, to;
1292
1293 SCM_VALIDATE_STRING (1, str);
1294 len = scm_i_string_length (str);
1295 from = scm_to_unsigned_integer (start, 0, len);
1296 if (SCM_UNBNDP (end))
1297 to = len;
1298 else
1299 to = scm_to_unsigned_integer (end, from, len);
1300 return scm_i_substring_read_only (str, from, to);
1301 }
1302 #undef FUNC_NAME
1303
1304 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1305 (SCM str, SCM start, SCM end),
1306 "Return a newly allocated string formed from the characters\n"
1307 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1308 "ending with index @var{end} (exclusive).\n"
1309 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1310 "exact integers satisfying:\n\n"
1311 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1312 #define FUNC_NAME s_scm_substring_copy
1313 {
1314 /* For the Scheme version, START is mandatory, but for the C
1315 version, it is optional. See scm_string_copy in srfi-13.c for a
1316 rationale.
1317 */
1318
1319 size_t from, to;
1320
1321 SCM_VALIDATE_STRING (1, str);
1322 scm_i_get_substring_spec (scm_i_string_length (str),
1323 start, &from, end, &to);
1324 return scm_i_substring_copy (str, from, to);
1325 }
1326 #undef FUNC_NAME
1327
1328 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1329 (SCM str, SCM start, SCM end),
1330 "Return string that indirectly refers to the characters\n"
1331 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1332 "ending with index @var{end} (exclusive).\n"
1333 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1334 "exact integers satisfying:\n\n"
1335 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1336 #define FUNC_NAME s_scm_substring_shared
1337 {
1338 size_t len, from, to;
1339
1340 SCM_VALIDATE_STRING (1, str);
1341 len = scm_i_string_length (str);
1342 from = scm_to_unsigned_integer (start, 0, len);
1343 if (SCM_UNBNDP (end))
1344 to = len;
1345 else
1346 to = scm_to_unsigned_integer (end, from, len);
1347 return scm_i_substring_shared (str, from, to);
1348 }
1349 #undef FUNC_NAME
1350
1351 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1352 (SCM args),
1353 "Return a newly allocated string whose characters form the\n"
1354 "concatenation of the given strings, @var{args}.")
1355 #define FUNC_NAME s_scm_string_append
1356 {
1357 SCM res;
1358 size_t len = 0;
1359 int wide = 0;
1360 SCM l, s;
1361 size_t i;
1362 union
1363 {
1364 char *narrow;
1365 scm_t_wchar *wide;
1366 } data;
1367
1368 SCM_VALIDATE_REST_ARGUMENT (args);
1369 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1370 {
1371 s = SCM_CAR (l);
1372 SCM_VALIDATE_STRING (SCM_ARGn, s);
1373 len += scm_i_string_length (s);
1374 if (!scm_i_is_narrow_string (s))
1375 wide = 1;
1376 }
1377 data.narrow = NULL;
1378 if (!wide)
1379 res = scm_i_make_string (len, &data.narrow, 0);
1380 else
1381 res = scm_i_make_wide_string (len, &data.wide, 0);
1382
1383 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1384 {
1385 size_t len;
1386 s = SCM_CAR (l);
1387 SCM_VALIDATE_STRING (SCM_ARGn, s);
1388 len = scm_i_string_length (s);
1389 if (!wide)
1390 {
1391 memcpy (data.narrow, scm_i_string_chars (s), len);
1392 data.narrow += len;
1393 }
1394 else
1395 {
1396 if (scm_i_is_narrow_string (s))
1397 {
1398 for (i = 0; i < scm_i_string_length (s); i++)
1399 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1400 }
1401 else
1402 u32_cpy ((scm_t_uint32 *) data.wide,
1403 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1404 data.wide += len;
1405 }
1406 scm_remember_upto_here_1 (s);
1407 }
1408 return res;
1409 }
1410 #undef FUNC_NAME
1411
1412
1413 \f
1414 /* Charset conversion error handling. */
1415
1416 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1417 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1418
1419 /* Raise an exception informing that character CHR could not be written
1420 to PORT in its current encoding. */
1421 void
1422 scm_encoding_error (const char *subr, int err, const char *message,
1423 SCM port, SCM chr)
1424 {
1425 scm_throw (scm_encoding_error_key,
1426 scm_list_n (scm_from_latin1_string (subr),
1427 scm_from_latin1_string (message),
1428 scm_from_int (err),
1429 port, chr,
1430 SCM_UNDEFINED));
1431 }
1432
1433 /* Raise an exception informing of an encoding error on PORT. This
1434 means that a character could not be written in PORT's encoding. */
1435 void
1436 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1437 {
1438 scm_throw (scm_decoding_error_key,
1439 scm_list_n (scm_from_latin1_string (subr),
1440 scm_from_latin1_string (message),
1441 scm_from_int (err),
1442 port,
1443 SCM_UNDEFINED));
1444 }
1445
1446 \f
1447 /* String conversion to/from C. */
1448
1449 static void
1450 decoding_error (const char *func_name, int errno_save,
1451 const char *str, size_t len)
1452 {
1453 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1454 handler. */
1455 SCM bv;
1456 signed char *buf;
1457
1458 buf = scm_gc_malloc_pointerless (len, "bytevector");
1459 memcpy (buf, str, len);
1460 bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1461
1462 scm_decoding_error (func_name, errno_save,
1463 "input locale conversion error", bv);
1464 }
1465
1466 SCM
1467 scm_from_stringn (const char *str, size_t len, const char *encoding,
1468 scm_t_string_failed_conversion_handler handler)
1469 {
1470 size_t u32len, i;
1471 scm_t_wchar *u32;
1472 int wide = 0;
1473 SCM res;
1474
1475 /* The order of these checks is important. */
1476 if (!str && len != 0)
1477 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1478 if (len == (size_t) -1)
1479 len = strlen (str);
1480 if (len == 0)
1481 return scm_nullstr;
1482
1483 if (encoding == NULL)
1484 {
1485 /* If encoding is null, use Latin-1. */
1486 char *buf;
1487 res = scm_i_make_string (len, &buf, 0);
1488 memcpy (buf, str, len);
1489 return res;
1490 }
1491
1492 u32len = 0;
1493 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1494 (enum iconv_ilseq_handler)
1495 handler,
1496 str, len,
1497 NULL,
1498 NULL, &u32len);
1499
1500 if (SCM_UNLIKELY (u32 == NULL))
1501 decoding_error (__func__, errno, str, len);
1502
1503 i = 0;
1504 while (i < u32len)
1505 if (u32[i++] > 0xFF)
1506 {
1507 wide = 1;
1508 break;
1509 }
1510
1511 if (!wide)
1512 {
1513 char *dst;
1514 res = scm_i_make_string (u32len, &dst, 0);
1515 for (i = 0; i < u32len; i ++)
1516 dst[i] = (unsigned char) u32[i];
1517 dst[u32len] = '\0';
1518 }
1519 else
1520 {
1521 scm_t_wchar *wdst;
1522 res = scm_i_make_wide_string (u32len, &wdst, 0);
1523 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1524 wdst[u32len] = 0;
1525 }
1526
1527 free (u32);
1528 return res;
1529 }
1530
1531 SCM
1532 scm_from_locale_string (const char *str)
1533 {
1534 return scm_from_locale_stringn (str, -1);
1535 }
1536
1537 SCM
1538 scm_from_locale_stringn (const char *str, size_t len)
1539 {
1540 return scm_from_stringn (str, len, locale_charset (),
1541 scm_i_get_conversion_strategy (SCM_BOOL_F));
1542 }
1543
1544 SCM
1545 scm_from_latin1_string (const char *str)
1546 {
1547 return scm_from_latin1_stringn (str, -1);
1548 }
1549
1550 SCM
1551 scm_from_latin1_stringn (const char *str, size_t len)
1552 {
1553 char *buf;
1554 SCM result;
1555
1556 if (len == (size_t) -1)
1557 len = strlen (str);
1558
1559 /* Make a narrow string and copy STR as is. */
1560 result = scm_i_make_string (len, &buf, 0);
1561 memcpy (buf, str, len);
1562
1563 return result;
1564 }
1565
1566 SCM
1567 scm_from_utf8_string (const char *str)
1568 {
1569 return scm_from_utf8_stringn (str, -1);
1570 }
1571
1572 SCM
1573 scm_from_utf8_stringn (const char *str, size_t len)
1574 {
1575 size_t i, char_len;
1576 const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1577 int ascii = 1, narrow = 1;
1578 SCM res;
1579
1580 if (len == (size_t) -1)
1581 len = strlen (str);
1582
1583 i = 0;
1584 char_len = 0;
1585
1586 while (i < len)
1587 {
1588 if (ustr[i] <= 127)
1589 {
1590 char_len++;
1591 i++;
1592 }
1593 else
1594 {
1595 ucs4_t c;
1596 int nbytes;
1597
1598 ascii = 0;
1599
1600 nbytes = u8_mbtouc (&c, ustr + i, len - i);
1601
1602 if (nbytes < 0)
1603 /* Bad UTF-8. */
1604 decoding_error (__func__, errno, str, len);
1605
1606 if (c > 255)
1607 narrow = 0;
1608
1609 char_len++;
1610 i += nbytes;
1611 }
1612 }
1613
1614 if (ascii)
1615 {
1616 char *dst;
1617 res = scm_i_make_string (char_len, &dst, 0);
1618 memcpy (dst, str, len);
1619 }
1620 else if (narrow)
1621 {
1622 char *dst;
1623 size_t j;
1624 ucs4_t c;
1625
1626 res = scm_i_make_string (char_len, &dst, 0);
1627
1628 for (i = 0, j = 0; i < len; i++, j++)
1629 {
1630 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1631 dst[j] = (signed char) c;
1632 }
1633 }
1634 else
1635 {
1636 scm_t_wchar *dst;
1637 size_t j;
1638 ucs4_t c;
1639
1640 res = scm_i_make_wide_string (char_len, &dst, 0);
1641
1642 for (i = 0, j = 0; i < len; i++, j++)
1643 {
1644 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1645 dst[j] = c;
1646 }
1647 }
1648
1649 return res;
1650 }
1651
1652 SCM
1653 scm_from_utf32_string (const scm_t_wchar *str)
1654 {
1655 return scm_from_utf32_stringn (str, -1);
1656 }
1657
1658 SCM
1659 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1660 {
1661 SCM result;
1662 scm_t_wchar *buf;
1663
1664 if (len == (size_t) -1)
1665 len = u32_strlen ((uint32_t *) str);
1666
1667 result = scm_i_make_wide_string (len, &buf, 0);
1668 memcpy (buf, str, len * sizeof (scm_t_wchar));
1669 scm_i_try_narrow_string (result);
1670
1671 return result;
1672 }
1673
1674 /* Create a new scheme string from the C string STR. The memory of
1675 STR may be used directly as storage for the new string. */
1676 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1677 would be to register a finalizer to eventually free(3) STR, which isn't
1678 worth it. Should we just deprecate the `scm_take_' functions? */
1679 SCM
1680 scm_take_locale_stringn (char *str, size_t len)
1681 {
1682 SCM res;
1683
1684 res = scm_from_locale_stringn (str, len);
1685 free (str);
1686
1687 return res;
1688 }
1689
1690 SCM
1691 scm_take_locale_string (char *str)
1692 {
1693 return scm_take_locale_stringn (str, -1);
1694 }
1695
1696 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1697 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1698 Set *LENP to the size of the resulting string.
1699
1700 FIXME: This is a hack we should get rid of. See
1701 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1702 for details. */
1703 static void
1704 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1705 {
1706 char *before, *after;
1707 size_t i, j;
1708
1709 before = buf;
1710 after = buf;
1711 i = 0;
1712 j = 0;
1713 while (i < *lenp)
1714 {
1715 if ((i <= *lenp - 6)
1716 && before[i] == '\\'
1717 && before[i + 1] == 'u'
1718 && before[i + 2] == '0' && before[i + 3] == '0')
1719 {
1720 /* Convert \u00NN to \xNN */
1721 after[j] = '\\';
1722 after[j + 1] = 'x';
1723 after[j + 2] = tolower ((int) before[i + 4]);
1724 after[j + 3] = tolower ((int) before[i + 5]);
1725 i += 6;
1726 j += 4;
1727 }
1728 else if ((i <= *lenp - 10)
1729 && before[i] == '\\'
1730 && before[i + 1] == 'U'
1731 && before[i + 2] == '0' && before[i + 3] == '0')
1732 {
1733 /* Convert \U00NNNNNN to \UNNNNNN */
1734 after[j] = '\\';
1735 after[j + 1] = 'U';
1736 after[j + 2] = tolower ((int) before[i + 4]);
1737 after[j + 3] = tolower ((int) before[i + 5]);
1738 after[j + 4] = tolower ((int) before[i + 6]);
1739 after[j + 5] = tolower ((int) before[i + 7]);
1740 after[j + 6] = tolower ((int) before[i + 8]);
1741 after[j + 7] = tolower ((int) before[i + 9]);
1742 i += 10;
1743 j += 8;
1744 }
1745 else
1746 {
1747 after[j] = before[i];
1748 i++;
1749 j++;
1750 }
1751 }
1752 *lenp = j;
1753 }
1754
1755 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1756 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1757 of the resulting string. BUF must be large enough to handle the
1758 worst case when `\uXXXX' escapes (6 characters) are replaced by
1759 `\xXXXX;' (7 characters). */
1760 static void
1761 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1762 {
1763 char *before, *after;
1764 size_t i, j;
1765 /* The worst case is if the input string contains all 4-digit hex escapes.
1766 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1767 size_t max_out_len = (*lenp * 7) / 6 + 1;
1768 size_t nzeros, ndigits;
1769
1770 before = buf;
1771 after = alloca (max_out_len);
1772 i = 0;
1773 j = 0;
1774 while (i < *lenp)
1775 {
1776 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1777 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1778 {
1779 if (before[i + 1] == 'u')
1780 ndigits = 4;
1781 else if (before[i + 1] == 'U')
1782 ndigits = 8;
1783 else
1784 abort ();
1785
1786 /* Add the R6RS hex escape initial sequence. */
1787 after[j] = '\\';
1788 after[j + 1] = 'x';
1789
1790 /* Move string positions to the start of the hex numbers. */
1791 i += 2;
1792 j += 2;
1793
1794 /* Find the number of initial zeros in this hex number. */
1795 nzeros = 0;
1796 while (before[i + nzeros] == '0' && nzeros < ndigits)
1797 nzeros++;
1798
1799 /* Copy the number, skipping initial zeros, and then move the string
1800 positions. */
1801 if (nzeros == ndigits)
1802 {
1803 after[j] = '0';
1804 i += ndigits;
1805 j += 1;
1806 }
1807 else
1808 {
1809 int pos;
1810 for (pos = 0; pos < ndigits - nzeros; pos++)
1811 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1812 i += ndigits;
1813 j += (ndigits - nzeros);
1814 }
1815
1816 /* Add terminating semicolon. */
1817 after[j] = ';';
1818 j++;
1819 }
1820 else
1821 {
1822 after[j] = before[i];
1823 i++;
1824 j++;
1825 }
1826 }
1827 *lenp = j;
1828 memcpy (before, after, j);
1829 }
1830
1831 char *
1832 scm_to_locale_string (SCM str)
1833 {
1834 return scm_to_locale_stringn (str, NULL);
1835 }
1836
1837 char *
1838 scm_to_locale_stringn (SCM str, size_t *lenp)
1839 {
1840 return scm_to_stringn (str, lenp,
1841 locale_charset (),
1842 scm_i_get_conversion_strategy (SCM_BOOL_F));
1843 }
1844
1845 char *
1846 scm_to_latin1_string (SCM str)
1847 {
1848 return scm_to_latin1_stringn (str, NULL);
1849 }
1850
1851 char *
1852 scm_to_latin1_stringn (SCM str, size_t *lenp)
1853 #define FUNC_NAME "scm_to_latin1_stringn"
1854 {
1855 char *result;
1856
1857 SCM_VALIDATE_STRING (1, str);
1858
1859 if (scm_i_is_narrow_string (str))
1860 {
1861 size_t len = scm_i_string_length (str);
1862
1863 if (lenp)
1864 *lenp = len;
1865
1866 result = scm_strndup (scm_i_string_data (str), len);
1867 }
1868 else
1869 result = scm_to_stringn (str, lenp, NULL,
1870 SCM_FAILED_CONVERSION_ERROR);
1871
1872 return result;
1873 }
1874 #undef FUNC_NAME
1875
1876 char *
1877 scm_to_utf8_string (SCM str)
1878 {
1879 return scm_to_utf8_stringn (str, NULL);
1880 }
1881
1882 char *
1883 scm_to_utf8_stringn (SCM str, size_t *lenp)
1884 {
1885 return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1886 }
1887
1888 scm_t_wchar *
1889 scm_to_utf32_string (SCM str)
1890 {
1891 return scm_to_utf32_stringn (str, NULL);
1892 }
1893
1894 scm_t_wchar *
1895 scm_to_utf32_stringn (SCM str, size_t *lenp)
1896 #define FUNC_NAME "scm_to_utf32_stringn"
1897 {
1898 scm_t_wchar *result;
1899
1900 SCM_VALIDATE_STRING (1, str);
1901
1902 if (scm_i_is_narrow_string (str))
1903 result = (scm_t_wchar *)
1904 scm_to_stringn (str, lenp, "UTF-32",
1905 SCM_FAILED_CONVERSION_ERROR);
1906 else
1907 {
1908 size_t len;
1909
1910 len = scm_i_string_length (str);
1911 if (lenp)
1912 *lenp = len;
1913
1914 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1915 memcpy (result, scm_i_string_wide_chars (str),
1916 len * sizeof (scm_t_wchar));
1917 result[len] = 0;
1918 }
1919
1920 return result;
1921 }
1922 #undef FUNC_NAME
1923
1924 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1925 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1926 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1927 defined by HANDLER. */
1928 char *
1929 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1930 scm_t_string_failed_conversion_handler handler)
1931 {
1932 char *buf;
1933 size_t ilen, len, i;
1934 int ret;
1935 const char *enc;
1936
1937 if (!scm_is_string (str))
1938 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1939 ilen = scm_i_string_length (str);
1940
1941 if (ilen == 0)
1942 {
1943 buf = scm_malloc (1);
1944 buf[0] = '\0';
1945 if (lenp)
1946 *lenp = 0;
1947 return buf;
1948 }
1949
1950 if (lenp == NULL)
1951 for (i = 0; i < ilen; i++)
1952 if (scm_i_string_ref (str, i) == '\0')
1953 scm_misc_error (NULL,
1954 "string contains #\\nul character: ~S",
1955 scm_list_1 (str));
1956
1957 if (scm_i_is_narrow_string (str) && (encoding == NULL))
1958 {
1959 /* If using native Latin-1 encoding, just copy the string
1960 contents. */
1961 if (lenp)
1962 {
1963 buf = scm_malloc (ilen);
1964 memcpy (buf, scm_i_string_chars (str), ilen);
1965 *lenp = ilen;
1966 return buf;
1967 }
1968 else
1969 {
1970 buf = scm_malloc (ilen + 1);
1971 memcpy (buf, scm_i_string_chars (str), ilen);
1972 buf[ilen] = '\0';
1973 return buf;
1974 }
1975 }
1976
1977
1978 buf = NULL;
1979 len = 0;
1980 enc = encoding;
1981 if (enc == NULL)
1982 enc = "ISO-8859-1";
1983 if (scm_i_is_narrow_string (str))
1984 {
1985 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1986 "ISO-8859-1", enc,
1987 (enum iconv_ilseq_handler) handler, NULL,
1988 &buf, &len);
1989
1990 if (ret != 0)
1991 scm_encoding_error (__func__, errno,
1992 "cannot convert narrow string to output locale",
1993 SCM_BOOL_F,
1994 /* FIXME: Faulty character unknown. */
1995 SCM_BOOL_F);
1996 }
1997 else
1998 {
1999 buf = u32_conv_to_encoding (enc,
2000 (enum iconv_ilseq_handler) handler,
2001 (scm_t_uint32 *) scm_i_string_wide_chars (str),
2002 ilen,
2003 NULL,
2004 NULL, &len);
2005 if (buf == NULL)
2006 scm_encoding_error (__func__, errno,
2007 "cannot convert wide string to output locale",
2008 SCM_BOOL_F,
2009 /* FIXME: Faulty character unknown. */
2010 SCM_BOOL_F);
2011 }
2012 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2013 {
2014 if (SCM_R6RS_ESCAPES_P)
2015 {
2016 /* The worst case is if the input string contains all 4-digit
2017 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2018 (seven characters). Make BUF large enough to hold
2019 that. */
2020 buf = scm_realloc (buf, (len * 7) / 6 + 1);
2021 unistring_escapes_to_r6rs_escapes (buf, &len);
2022 }
2023 else
2024 unistring_escapes_to_guile_escapes (buf, &len);
2025
2026 buf = scm_realloc (buf, len);
2027 }
2028 if (lenp)
2029 *lenp = len;
2030 else
2031 {
2032 buf = scm_realloc (buf, len + 1);
2033 buf[len] = '\0';
2034 }
2035
2036 scm_remember_upto_here_1 (str);
2037 return buf;
2038 }
2039
2040 size_t
2041 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2042 {
2043 size_t len;
2044 char *result = NULL;
2045 if (!scm_is_string (str))
2046 scm_wrong_type_arg_msg (NULL, 0, str, "string");
2047 result = scm_to_locale_stringn (str, &len);
2048
2049 memcpy (buf, result, (len > max_len) ? max_len : len);
2050 free (result);
2051
2052 scm_remember_upto_here_1 (str);
2053 return len;
2054 }
2055
2056 \f
2057 /* Unicode string normalization. */
2058
2059 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2060 libguile/i18n.c. It would be useful to have this factored out into a more
2061 convenient location, but its use of alloca makes that tricky to do. */
2062
2063 static SCM
2064 normalize_str (SCM string, uninorm_t form)
2065 {
2066 SCM ret;
2067 scm_t_uint32 *w_str;
2068 scm_t_wchar *cbuf;
2069 size_t rlen, len = scm_i_string_length (string);
2070
2071 if (scm_i_is_narrow_string (string))
2072 {
2073 size_t i;
2074 const char *buf = scm_i_string_chars (string);
2075
2076 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2077
2078 for (i = 0; i < len; i ++)
2079 w_str[i] = (unsigned char) buf[i];
2080 w_str[len] = 0;
2081 }
2082 else
2083 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2084
2085 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2086
2087 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2088 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2089 free (w_str);
2090
2091 scm_i_try_narrow_string (ret);
2092
2093 return ret;
2094 }
2095
2096 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2097 (SCM string),
2098 "Returns the NFC normalized form of @var{string}.")
2099 #define FUNC_NAME s_scm_string_normalize_nfc
2100 {
2101 SCM_VALIDATE_STRING (1, string);
2102 return normalize_str (string, UNINORM_NFC);
2103 }
2104 #undef FUNC_NAME
2105
2106 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2107 (SCM string),
2108 "Returns the NFD normalized form of @var{string}.")
2109 #define FUNC_NAME s_scm_string_normalize_nfd
2110 {
2111 SCM_VALIDATE_STRING (1, string);
2112 return normalize_str (string, UNINORM_NFD);
2113 }
2114 #undef FUNC_NAME
2115
2116 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2117 (SCM string),
2118 "Returns the NFKC normalized form of @var{string}.")
2119 #define FUNC_NAME s_scm_string_normalize_nfkc
2120 {
2121 SCM_VALIDATE_STRING (1, string);
2122 return normalize_str (string, UNINORM_NFKC);
2123 }
2124 #undef FUNC_NAME
2125
2126 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2127 (SCM string),
2128 "Returns the NFKD normalized form of @var{string}.")
2129 #define FUNC_NAME s_scm_string_normalize_nfkd
2130 {
2131 SCM_VALIDATE_STRING (1, string);
2132 return normalize_str (string, UNINORM_NFKD);
2133 }
2134 #undef FUNC_NAME
2135
2136 /* converts C scm_array of strings to SCM scm_list of strings.
2137 If argc < 0, a null terminated scm_array is assumed.
2138 The current locale encoding is assumed */
2139 SCM
2140 scm_makfromstrs (int argc, char **argv)
2141 {
2142 int i = argc;
2143 SCM lst = SCM_EOL;
2144 if (0 > i)
2145 for (i = 0; argv[i]; i++);
2146 while (i--)
2147 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2148 return lst;
2149 }
2150
2151 /* Return a newly allocated array of char pointers to each of the strings
2152 in args, with a terminating NULL pointer. The strings are encoded using
2153 the current locale. */
2154
2155 char **
2156 scm_i_allocate_string_pointers (SCM list)
2157 #define FUNC_NAME "scm_i_allocate_string_pointers"
2158 {
2159 char **result;
2160 int list_len = scm_ilength (list);
2161 int i;
2162
2163 if (list_len < 0)
2164 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2165
2166 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2167 "string pointers");
2168 result[list_len] = NULL;
2169
2170 /* The list might have been modified in another thread, so
2171 we check LIST before each access.
2172 */
2173 for (i = 0; i < list_len && scm_is_pair (list); i++)
2174 {
2175 SCM str = SCM_CAR (list);
2176 size_t len; /* String length in bytes */
2177 char *c_str = scm_to_locale_stringn (str, &len);
2178
2179 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2180 scm_malloc to allocate the returned string, which must be
2181 explicitly deallocated. This forces us to copy the string a
2182 second time into a new buffer. Ideally there would be variants
2183 of scm_to_*_stringn that can return garbage-collected buffers. */
2184
2185 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2186 memcpy (result[i], c_str, len);
2187 result[i][len] = '\0';
2188 free (c_str);
2189
2190 list = SCM_CDR (list);
2191 }
2192
2193 return result;
2194 }
2195 #undef FUNC_NAME
2196
2197 void
2198 scm_i_get_substring_spec (size_t len,
2199 SCM start, size_t *cstart,
2200 SCM end, size_t *cend)
2201 {
2202 if (SCM_UNBNDP (start))
2203 *cstart = 0;
2204 else
2205 *cstart = scm_to_unsigned_integer (start, 0, len);
2206
2207 if (SCM_UNBNDP (end))
2208 *cend = len;
2209 else
2210 *cend = scm_to_unsigned_integer (end, *cstart, len);
2211 }
2212
2213 static SCM
2214 string_handle_ref (scm_t_array_handle *h, size_t index)
2215 {
2216 return scm_c_string_ref (h->array, index);
2217 }
2218
2219 static void
2220 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2221 {
2222 scm_c_string_set_x (h->array, index, val);
2223 }
2224
2225 static void
2226 string_get_handle (SCM v, scm_t_array_handle *h)
2227 {
2228 h->array = v;
2229 h->ndims = 1;
2230 h->dims = &h->dim0;
2231 h->dim0.lbnd = 0;
2232 h->dim0.ubnd = scm_c_string_length (v) - 1;
2233 h->dim0.inc = 1;
2234 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2235 h->elements = h->writable_elements = NULL;
2236 }
2237
2238 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2239 string_handle_ref, string_handle_set,
2240 string_get_handle)
2241 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2242
2243 void
2244 scm_init_strings ()
2245 {
2246 scm_nullstr = scm_i_make_string (0, NULL, 1);
2247
2248 #include "libguile/strings.x"
2249 }
2250
2251
2252 /*
2253 Local Variables:
2254 c-file-style: "gnu"
2255 End:
2256 */