scm_i_substring_copy tries to narrow the substring
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <alloca.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <uninorm.h>
30 #include <unistr.h>
31 #include <uniconv.h>
32
33 #include "striconveh.h"
34
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
44
45 \f
46
47 /* {Strings}
48 */
49
50
51 /* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
68 *
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
72 */
73
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
76
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
81
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
85
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
91
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
93
94 #define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
103
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist[1001];
106 #endif
107
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
110 static SCM
111 make_stringbuf (size_t len)
112 {
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
120 SCM buf;
121
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127 #endif
128
129 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
138 }
139
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
142 static SCM
143 make_wide_stringbuf (size_t len)
144 {
145 SCM buf;
146 size_t raw_len;
147
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153 #endif
154
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
165 }
166
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169 static SCM
170 wide_stringbuf (SCM buf)
171 {
172 SCM new_buf;
173
174 if (STRINGBUF_WIDE (buf))
175 new_buf = buf;
176 else
177 {
178 size_t i, len;
179 scm_t_wchar *mem;
180
181 len = STRINGBUF_LENGTH (buf);
182
183 new_buf = make_wide_stringbuf (len);
184
185 mem = STRINGBUF_WIDE_CHARS (new_buf);
186 for (i = 0; i < len; i++)
187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
188 mem[len] = 0;
189 }
190
191 return new_buf;
192 }
193
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196 static SCM
197 narrow_stringbuf (SCM buf)
198 {
199 SCM new_buf;
200
201 if (!STRINGBUF_WIDE (buf))
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
208
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
211
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
216
217 new_buf = make_stringbuf (len);
218
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
226 }
227
228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
229
230 \f
231 /* Copy-on-write strings.
232 */
233
234 #define STRING_TAG scm_tc7_string
235
236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
239
240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243 #define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
244
245 /* Read-only strings.
246 */
247
248 #define RO_STRING_TAG scm_tc7_ro_string
249 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
251 /* Mutation-sharing substrings
252 */
253
254 #define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257 /* START and LENGTH as for STRINGs. */
258
259 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
261 SCM scm_nullstr;
262
263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
267 SCM
268 scm_i_make_string (size_t len, char **charsp, int read_only_p)
269 {
270 SCM buf = make_stringbuf (len);
271 SCM res;
272 if (charsp)
273 *charsp = (char *) STRINGBUF_CHARS (buf);
274 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
275 SCM_UNPACK (buf),
276 (scm_t_bits) 0, (scm_t_bits) len);
277 return res;
278 }
279
280 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
281 characters. CHARSP, if not NULL, will be set to location of the
282 character array. If READ_ONLY_P, the returned string is read-only;
283 otherwise it is writable. */
284 SCM
285 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
286 {
287 SCM buf = make_wide_stringbuf (len);
288 SCM res;
289 if (charsp)
290 *charsp = STRINGBUF_WIDE_CHARS (buf);
291 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
292 SCM_UNPACK (buf),
293 (scm_t_bits) 0, (scm_t_bits) len);
294 return res;
295 }
296
297 static void
298 validate_substring_args (SCM str, size_t start, size_t end)
299 {
300 if (!IS_STRING (str))
301 scm_wrong_type_arg_msg (NULL, 0, str, "string");
302 if (start > STRING_LENGTH (str))
303 scm_out_of_range (NULL, scm_from_size_t (start));
304 if (end > STRING_LENGTH (str) || end < start)
305 scm_out_of_range (NULL, scm_from_size_t (end));
306 }
307
308 static inline void
309 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
310 {
311 *start = STRING_START (*str);
312 if (IS_SH_STRING (*str))
313 {
314 *str = SH_STRING_STRING (*str);
315 *start += STRING_START (*str);
316 }
317 *buf = STRING_STRINGBUF (*str);
318 }
319
320 SCM
321 scm_i_substring (SCM str, size_t start, size_t end)
322 {
323 SCM buf;
324 size_t str_start;
325 get_str_buf_start (&str, &buf, &str_start);
326 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
327 SET_STRINGBUF_SHARED (buf);
328 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
329 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
330 (scm_t_bits)str_start + start,
331 (scm_t_bits) end - start);
332 }
333
334 SCM
335 scm_i_substring_read_only (SCM str, size_t start, size_t end)
336 {
337 SCM buf;
338 size_t str_start;
339 get_str_buf_start (&str, &buf, &str_start);
340 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
341 SET_STRINGBUF_SHARED (buf);
342 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
343 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
344 (scm_t_bits)str_start + start,
345 (scm_t_bits) end - start);
346 }
347
348 SCM
349 scm_i_substring_copy (SCM str, size_t start, size_t end)
350 {
351 size_t len = end - start;
352 SCM buf, my_buf, substr;
353 size_t str_start;
354 int wide = 0;
355 get_str_buf_start (&str, &buf, &str_start);
356 if (scm_i_is_narrow_string (str))
357 {
358 my_buf = make_stringbuf (len);
359 memcpy (STRINGBUF_CHARS (my_buf),
360 STRINGBUF_CHARS (buf) + str_start + start, len);
361 }
362 else
363 {
364 my_buf = make_wide_stringbuf (len);
365 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
366 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
367 + start), len);
368 wide = 1;
369 }
370 scm_remember_upto_here_1 (buf);
371 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
372 (scm_t_bits) 0, (scm_t_bits) len);
373 if (wide)
374 scm_i_try_narrow_string (substr);
375 return substr;
376 }
377
378 SCM
379 scm_i_substring_shared (SCM str, size_t start, size_t end)
380 {
381 if (start == 0 && end == STRING_LENGTH (str))
382 return str;
383 else
384 {
385 size_t len = end - start;
386 if (IS_SH_STRING (str))
387 {
388 start += STRING_START (str);
389 str = SH_STRING_STRING (str);
390 }
391 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
392 (scm_t_bits)start, (scm_t_bits) len);
393 }
394 }
395
396 SCM
397 scm_c_substring (SCM str, size_t start, size_t end)
398 {
399 validate_substring_args (str, start, end);
400 return scm_i_substring (str, start, end);
401 }
402
403 SCM
404 scm_c_substring_read_only (SCM str, size_t start, size_t end)
405 {
406 validate_substring_args (str, start, end);
407 return scm_i_substring_read_only (str, start, end);
408 }
409
410 SCM
411 scm_c_substring_copy (SCM str, size_t start, size_t end)
412 {
413 validate_substring_args (str, start, end);
414 return scm_i_substring_copy (str, start, end);
415 }
416
417 SCM
418 scm_c_substring_shared (SCM str, size_t start, size_t end)
419 {
420 validate_substring_args (str, start, end);
421 return scm_i_substring_shared (str, start, end);
422 }
423
424 \f
425 /* Internal accessors
426 */
427
428 /* Returns the number of characters in STR. This may be different
429 than the memory size of the string storage. */
430 size_t
431 scm_i_string_length (SCM str)
432 {
433 return STRING_LENGTH (str);
434 }
435
436 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
437 encoding. False if it is 'wide', having a 32-bit UCS-4
438 encoding. */
439 int
440 scm_i_is_narrow_string (SCM str)
441 {
442 if (IS_SH_STRING (str))
443 str = SH_STRING_STRING (str);
444
445 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
446 }
447
448 /* Try to coerce a string to be narrow. It if is narrow already, do
449 nothing. If it is wide, shrink it to narrow if none of its
450 characters are above 0xFF. Return true if the string is narrow or
451 was made to be narrow. */
452 int
453 scm_i_try_narrow_string (SCM str)
454 {
455 if (IS_SH_STRING (str))
456 str = SH_STRING_STRING (str);
457
458 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
459
460 return scm_i_is_narrow_string (str);
461 }
462
463 /* Return a pointer to the raw data of the string, which can be either Latin-1
464 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
465 const void *
466 scm_i_string_data (SCM str)
467 {
468 SCM buf;
469 size_t start;
470 const char *data;
471
472 get_str_buf_start (&str, &buf, &start);
473
474 data = STRINGBUF_CONTENTS (buf);
475 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
476
477 return data;
478 }
479
480 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
481 STR. */
482 const char *
483 scm_i_string_chars (SCM str)
484 {
485 SCM buf;
486 size_t start;
487 get_str_buf_start (&str, &buf, &start);
488 if (scm_i_is_narrow_string (str))
489 return (const char *) STRINGBUF_CHARS (buf) + start;
490 else
491 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
492 scm_list_1 (str));
493 return NULL;
494 }
495
496 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
497 STR. */
498 const scm_t_wchar *
499 scm_i_string_wide_chars (SCM str)
500 {
501 SCM buf;
502 size_t start;
503
504 get_str_buf_start (&str, &buf, &start);
505 if (!scm_i_is_narrow_string (str))
506 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
507 else
508 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
509 scm_list_1 (str));
510 }
511
512 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
513 a new string buffer, so that it can be modified without modifying
514 other strings. Also, lock the string mutex. Later, one must call
515 scm_i_string_stop_writing to unlock the mutex. */
516 SCM
517 scm_i_string_start_writing (SCM orig_str)
518 {
519 SCM buf, str = orig_str;
520 size_t start;
521
522 get_str_buf_start (&str, &buf, &start);
523 if (IS_RO_STRING (str))
524 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
525
526 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
527 if (STRINGBUF_SHARED (buf))
528 {
529 /* Clone the stringbuf. */
530 size_t len = STRING_LENGTH (str);
531 SCM new_buf;
532
533 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
534
535 if (scm_i_is_narrow_string (str))
536 {
537 new_buf = make_stringbuf (len);
538 memcpy (STRINGBUF_CHARS (new_buf),
539 STRINGBUF_CHARS (buf) + STRING_START (str), len);
540
541 }
542 else
543 {
544 new_buf = make_wide_stringbuf (len);
545 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
546 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
547 + STRING_START (str)), len);
548 }
549
550 SET_STRING_STRINGBUF (str, new_buf);
551 start -= STRING_START (str);
552
553 /* FIXME: The following operations are not atomic, so other threads
554 looking at STR may see an inconsistent state. Nevertheless it can't
555 hurt much since (i) accessing STR while it is being mutated can't
556 yield a crash, and (ii) concurrent accesses to STR should be
557 protected by a mutex at the application level. The latter may not
558 apply when STR != ORIG_STR, though. */
559 SET_STRING_START (str, 0);
560 SET_STRING_STRINGBUF (str, new_buf);
561
562 buf = new_buf;
563
564 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
565 }
566 return orig_str;
567 }
568
569 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
570 char *
571 scm_i_string_writable_chars (SCM str)
572 {
573 SCM buf;
574 size_t start;
575
576 get_str_buf_start (&str, &buf, &start);
577 if (scm_i_is_narrow_string (str))
578 return (char *) STRINGBUF_CHARS (buf) + start;
579 else
580 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
581 scm_list_1 (str));
582 return NULL;
583 }
584
585 /* Return a pointer to the UCS-4 codepoints of a string. */
586 static scm_t_wchar *
587 scm_i_string_writable_wide_chars (SCM str)
588 {
589 SCM buf;
590 size_t start;
591
592 get_str_buf_start (&str, &buf, &start);
593 if (!scm_i_is_narrow_string (str))
594 return STRINGBUF_WIDE_CHARS (buf) + start;
595 else
596 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
597 scm_list_1 (str));
598 }
599
600 /* Unlock the string mutex that was locked when
601 scm_i_string_start_writing was called. */
602 void
603 scm_i_string_stop_writing (void)
604 {
605 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
606 }
607
608 /* Return the Xth character of STR as a UCS-4 codepoint. */
609 scm_t_wchar
610 scm_i_string_ref (SCM str, size_t x)
611 {
612 if (scm_i_is_narrow_string (str))
613 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
614 else
615 return scm_i_string_wide_chars (str)[x];
616 }
617
618 /* Returns index+1 of the first char in STR that matches C, or
619 0 if the char is not found. */
620 int
621 scm_i_string_contains_char (SCM str, char ch)
622 {
623 size_t i;
624 size_t len = scm_i_string_length (str);
625
626 i = 0;
627 if (scm_i_is_narrow_string (str))
628 {
629 while (i < len)
630 {
631 if (scm_i_string_chars (str)[i] == ch)
632 return i+1;
633 i++;
634 }
635 }
636 else
637 {
638 while (i < len)
639 {
640 if (scm_i_string_wide_chars (str)[i]
641 == (unsigned char) ch)
642 return i+1;
643 i++;
644 }
645 }
646 return 0;
647 }
648
649 int
650 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
651 {
652 if (scm_i_is_narrow_string (sstr))
653 {
654 const char *a = scm_i_string_chars (sstr) + start_x;
655 const char *b = cstr;
656 return strncmp (a, b, strlen(b));
657 }
658 else
659 {
660 size_t i;
661 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
662 const char *b = cstr;
663 for (i = 0; i < strlen (b); i++)
664 {
665 if (a[i] != (unsigned char) b[i])
666 return 1;
667 }
668 }
669 return 0;
670 }
671
672 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
673 void
674 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
675 {
676 if (IS_SH_STRING (str))
677 {
678 p += STRING_START (str);
679 str = SH_STRING_STRING (str);
680 }
681
682 if (chr > 0xFF && scm_i_is_narrow_string (str))
683 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
684
685 if (scm_i_is_narrow_string (str))
686 {
687 char *dst = scm_i_string_writable_chars (str);
688 dst[p] = chr;
689 }
690 else
691 {
692 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
693 dst[p] = chr;
694 }
695 }
696
697 \f
698 /* Symbols.
699
700 Basic symbol creation and accessing is done here, the rest is in
701 symbols.[hc]. This has been done to keep stringbufs and the
702 internals of strings and string-like objects confined to this file.
703 */
704
705 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
706
707 SCM
708 scm_i_make_symbol (SCM name, scm_t_bits flags,
709 unsigned long hash, SCM props)
710 {
711 SCM buf;
712 size_t start = STRING_START (name);
713 size_t length = STRING_LENGTH (name);
714
715 if (IS_SH_STRING (name))
716 {
717 name = SH_STRING_STRING (name);
718 start += STRING_START (name);
719 }
720 buf = SYMBOL_STRINGBUF (name);
721
722 if (start == 0 && length == STRINGBUF_LENGTH (buf))
723 {
724 /* reuse buf. */
725 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
726 SET_STRINGBUF_SHARED (buf);
727 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
728 }
729 else
730 {
731 /* make new buf. */
732 if (scm_i_is_narrow_string (name))
733 {
734 SCM new_buf = make_stringbuf (length);
735 memcpy (STRINGBUF_CHARS (new_buf),
736 STRINGBUF_CHARS (buf) + start, length);
737 buf = new_buf;
738 }
739 else
740 {
741 SCM new_buf = make_wide_stringbuf (length);
742 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
743 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
744 length);
745 buf = new_buf;
746 }
747 }
748 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
749 (scm_t_bits) hash, SCM_UNPACK (props));
750 }
751
752 SCM
753 scm_i_c_make_symbol (const char *name, size_t len,
754 scm_t_bits flags, unsigned long hash, SCM props)
755 {
756 SCM buf = make_stringbuf (len);
757 memcpy (STRINGBUF_CHARS (buf), name, len);
758
759 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
760 (scm_t_bits) hash, SCM_UNPACK (props));
761 }
762
763 /* Returns the number of characters in SYM. This may be different
764 from the memory size of SYM. */
765 size_t
766 scm_i_symbol_length (SCM sym)
767 {
768 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
769 }
770
771 size_t
772 scm_c_symbol_length (SCM sym)
773 #define FUNC_NAME "scm_c_symbol_length"
774 {
775 SCM_VALIDATE_SYMBOL (1, sym);
776
777 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
778 }
779 #undef FUNC_NAME
780
781 /* True if the name of SYM is stored as a Latin-1 encoded string.
782 False if it is stored as a 32-bit UCS-4-encoded string. */
783 int
784 scm_i_is_narrow_symbol (SCM sym)
785 {
786 SCM buf;
787
788 buf = SYMBOL_STRINGBUF (sym);
789 return !STRINGBUF_WIDE (buf);
790 }
791
792 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
793 contains the name of SYM. */
794 const char *
795 scm_i_symbol_chars (SCM sym)
796 {
797 SCM buf;
798
799 buf = SYMBOL_STRINGBUF (sym);
800 if (!STRINGBUF_WIDE (buf))
801 return (const char *) STRINGBUF_CHARS (buf);
802 else
803 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
804 scm_list_1 (sym));
805 }
806
807 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
808 symbol's name. */
809 const scm_t_wchar *
810 scm_i_symbol_wide_chars (SCM sym)
811 {
812 SCM buf;
813
814 buf = SYMBOL_STRINGBUF (sym);
815 if (STRINGBUF_WIDE (buf))
816 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
817 else
818 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
819 scm_list_1 (sym));
820 }
821
822 SCM
823 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
824 {
825 SCM buf = SYMBOL_STRINGBUF (sym);
826 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
827 SET_STRINGBUF_SHARED (buf);
828 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
829 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
830 (scm_t_bits)start, (scm_t_bits) end - start);
831 }
832
833 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
834 scm_t_wchar
835 scm_i_symbol_ref (SCM sym, size_t x)
836 {
837 if (scm_i_is_narrow_symbol (sym))
838 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
839 else
840 return scm_i_symbol_wide_chars (sym)[x];
841 }
842
843 /* Debugging
844 */
845
846 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
847 "Returns an association list containing debugging information\n"
848 "for @var{str}. The association list has the following entries."
849 "@table @code\n"
850 "@item string\n"
851 "The string itself.\n"
852 "@item start\n"
853 "The start index of the string into its stringbuf\n"
854 "@item length\n"
855 "The length of the string\n"
856 "@item shared\n"
857 "If this string is a substring, it returns its parent string.\n"
858 "Otherwise, it returns @code{#f}\n"
859 "@item read-only\n"
860 "@code{#t} if the string is read-only\n"
861 "@item stringbuf-chars\n"
862 "A new string containing this string's stringbuf's characters\n"
863 "@item stringbuf-length\n"
864 "The number of characters in this stringbuf\n"
865 "@item stringbuf-shared\n"
866 "@code{#t} if this stringbuf is shared\n"
867 "@item stringbuf-wide\n"
868 "@code{#t} if this stringbuf's characters are stored in a\n"
869 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
870 "buffer\n"
871 "@end table")
872 #define FUNC_NAME s_scm_sys_string_dump
873 {
874 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
875 SCM buf;
876 SCM_VALIDATE_STRING (1, str);
877
878 /* String info */
879 e1 = scm_cons (scm_from_latin1_symbol ("string"),
880 str);
881 e2 = scm_cons (scm_from_latin1_symbol ("start"),
882 scm_from_size_t (STRING_START (str)));
883 e3 = scm_cons (scm_from_latin1_symbol ("length"),
884 scm_from_size_t (STRING_LENGTH (str)));
885
886 if (IS_SH_STRING (str))
887 {
888 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
889 SH_STRING_STRING (str));
890 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
891 }
892 else
893 {
894 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
895 SCM_BOOL_F);
896 buf = STRING_STRINGBUF (str);
897 }
898
899 if (IS_RO_STRING (str))
900 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
901 SCM_BOOL_T);
902 else
903 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
904 SCM_BOOL_F);
905
906 /* Stringbuf info */
907 if (!STRINGBUF_WIDE (buf))
908 {
909 size_t len = STRINGBUF_LENGTH (buf);
910 char *cbuf;
911 SCM sbc = scm_i_make_string (len, &cbuf, 0);
912 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
913 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
914 sbc);
915 }
916 else
917 {
918 size_t len = STRINGBUF_LENGTH (buf);
919 scm_t_wchar *cbuf;
920 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
921 u32_cpy ((scm_t_uint32 *) cbuf,
922 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
923 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
924 sbc);
925 }
926 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
927 scm_from_size_t (STRINGBUF_LENGTH (buf)));
928 if (STRINGBUF_SHARED (buf))
929 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
930 SCM_BOOL_T);
931 else
932 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
933 SCM_BOOL_F);
934 if (STRINGBUF_WIDE (buf))
935 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
936 SCM_BOOL_T);
937 else
938 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
939 SCM_BOOL_F);
940
941 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
942 }
943 #undef FUNC_NAME
944
945 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
946 "Returns an association list containing debugging information\n"
947 "for @var{sym}. The association list has the following entries."
948 "@table @code\n"
949 "@item symbol\n"
950 "The symbol itself\n"
951 "@item hash\n"
952 "Its hash value\n"
953 "@item interned\n"
954 "@code{#t} if it is an interned symbol\n"
955 "@item stringbuf-chars\n"
956 "A new string containing this symbols's stringbuf's characters\n"
957 "@item stringbuf-length\n"
958 "The number of characters in this stringbuf\n"
959 "@item stringbuf-shared\n"
960 "@code{#t} if this stringbuf is shared\n"
961 "@item stringbuf-wide\n"
962 "@code{#t} if this stringbuf's characters are stored in a\n"
963 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
964 "buffer\n"
965 "@end table")
966 #define FUNC_NAME s_scm_sys_symbol_dump
967 {
968 SCM e1, e2, e3, e4, e5, e6, e7;
969 SCM buf;
970 SCM_VALIDATE_SYMBOL (1, sym);
971 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
972 sym);
973 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
974 scm_from_ulong (scm_i_symbol_hash (sym)));
975 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
976 scm_symbol_interned_p (sym));
977 buf = SYMBOL_STRINGBUF (sym);
978
979 /* Stringbuf info */
980 if (!STRINGBUF_WIDE (buf))
981 {
982 size_t len = STRINGBUF_LENGTH (buf);
983 char *cbuf;
984 SCM sbc = scm_i_make_string (len, &cbuf, 0);
985 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
986 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
987 sbc);
988 }
989 else
990 {
991 size_t len = STRINGBUF_LENGTH (buf);
992 scm_t_wchar *cbuf;
993 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
994 u32_cpy ((scm_t_uint32 *) cbuf,
995 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
996 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
997 sbc);
998 }
999 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1000 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1001 if (STRINGBUF_SHARED (buf))
1002 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1003 SCM_BOOL_T);
1004 else
1005 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1006 SCM_BOOL_F);
1007 if (STRINGBUF_WIDE (buf))
1008 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1009 SCM_BOOL_T);
1010 else
1011 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1012 SCM_BOOL_F);
1013 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1014
1015 }
1016 #undef FUNC_NAME
1017
1018 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1019
1020 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1021 #define FUNC_NAME s_scm_sys_stringbuf_hist
1022 {
1023 int i;
1024 for (i = 0; i < 1000; i++)
1025 if (lenhist[i])
1026 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1027 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1028 return SCM_UNSPECIFIED;
1029 }
1030 #undef FUNC_NAME
1031
1032 #endif
1033
1034 \f
1035
1036 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1037 (SCM obj),
1038 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1039 #define FUNC_NAME s_scm_string_p
1040 {
1041 return scm_from_bool (IS_STRING (obj));
1042 }
1043 #undef FUNC_NAME
1044
1045
1046 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1047
1048 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1049 (SCM chrs),
1050 "@deffnx {Scheme Procedure} list->string chrs\n"
1051 "Return a newly allocated string composed of the arguments,\n"
1052 "@var{chrs}.")
1053 #define FUNC_NAME s_scm_string
1054 {
1055 SCM result = SCM_BOOL_F;
1056 SCM rest;
1057 size_t len;
1058 size_t p = 0;
1059 long i;
1060 int wide = 0;
1061
1062 /* Verify that this is a list of chars. */
1063 i = scm_ilength (chrs);
1064 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1065
1066 len = (size_t) i;
1067 rest = chrs;
1068
1069 while (len > 0 && scm_is_pair (rest))
1070 {
1071 SCM elt = SCM_CAR (rest);
1072 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1073 if (SCM_CHAR (elt) > 0xFF)
1074 wide = 1;
1075 rest = SCM_CDR (rest);
1076 len--;
1077 scm_remember_upto_here_1 (elt);
1078 }
1079
1080 /* Construct a string containing this list of chars. */
1081 len = (size_t) i;
1082 rest = chrs;
1083
1084 if (wide == 0)
1085 {
1086 char *buf;
1087
1088 result = scm_i_make_string (len, NULL, 0);
1089 result = scm_i_string_start_writing (result);
1090 buf = scm_i_string_writable_chars (result);
1091 while (len > 0 && scm_is_pair (rest))
1092 {
1093 SCM elt = SCM_CAR (rest);
1094 buf[p] = (unsigned char) SCM_CHAR (elt);
1095 p++;
1096 rest = SCM_CDR (rest);
1097 len--;
1098 scm_remember_upto_here_1 (elt);
1099 }
1100 }
1101 else
1102 {
1103 scm_t_wchar *buf;
1104
1105 result = scm_i_make_wide_string (len, NULL, 0);
1106 result = scm_i_string_start_writing (result);
1107 buf = scm_i_string_writable_wide_chars (result);
1108 while (len > 0 && scm_is_pair (rest))
1109 {
1110 SCM elt = SCM_CAR (rest);
1111 buf[p] = SCM_CHAR (elt);
1112 p++;
1113 rest = SCM_CDR (rest);
1114 len--;
1115 scm_remember_upto_here_1 (elt);
1116 }
1117 }
1118 scm_i_string_stop_writing ();
1119
1120 if (len > 0)
1121 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1122 if (!scm_is_null (rest))
1123 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1124
1125 return result;
1126 }
1127 #undef FUNC_NAME
1128
1129 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1130 (SCM k, SCM chr),
1131 "Return a newly allocated string of\n"
1132 "length @var{k}. If @var{chr} is given, then all elements of\n"
1133 "the string are initialized to @var{chr}, otherwise the contents\n"
1134 "of the @var{string} are all set to @var{#\nul}.")
1135 #define FUNC_NAME s_scm_make_string
1136 {
1137 return scm_c_make_string (scm_to_size_t (k), chr);
1138 }
1139 #undef FUNC_NAME
1140
1141 SCM
1142 scm_c_make_string (size_t len, SCM chr)
1143 #define FUNC_NAME NULL
1144 {
1145 size_t p;
1146 char *contents = NULL;
1147 SCM res = scm_i_make_string (len, &contents, 0);
1148
1149 /* If no char is given, initialize string contents to NULL. */
1150 if (SCM_UNBNDP (chr))
1151 memset (contents, 0, len);
1152 else
1153 {
1154 SCM_VALIDATE_CHAR (0, chr);
1155 res = scm_i_string_start_writing (res);
1156 for (p = 0; p < len; p++)
1157 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1158 scm_i_string_stop_writing ();
1159 }
1160
1161 return res;
1162 }
1163 #undef FUNC_NAME
1164
1165 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1166 (SCM string),
1167 "Return the number of characters in @var{string}.")
1168 #define FUNC_NAME s_scm_string_length
1169 {
1170 SCM_VALIDATE_STRING (1, string);
1171 return scm_from_size_t (STRING_LENGTH (string));
1172 }
1173 #undef FUNC_NAME
1174
1175 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1176 (SCM string),
1177 "Return the bytes used to represent a character in @var{string}."
1178 "This will return 1 or 4.")
1179 #define FUNC_NAME s_scm_string_bytes_per_char
1180 {
1181 SCM_VALIDATE_STRING (1, string);
1182 if (!scm_i_is_narrow_string (string))
1183 return scm_from_int (4);
1184
1185 return scm_from_int (1);
1186 }
1187 #undef FUNC_NAME
1188
1189 size_t
1190 scm_c_string_length (SCM string)
1191 {
1192 if (!IS_STRING (string))
1193 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1194 return STRING_LENGTH (string);
1195 }
1196
1197 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1198 (SCM str, SCM k),
1199 "Return character @var{k} of @var{str} using zero-origin\n"
1200 "indexing. @var{k} must be a valid index of @var{str}.")
1201 #define FUNC_NAME s_scm_string_ref
1202 {
1203 size_t len;
1204 unsigned long idx;
1205
1206 SCM_VALIDATE_STRING (1, str);
1207
1208 len = scm_i_string_length (str);
1209 if (SCM_LIKELY (len > 0))
1210 idx = scm_to_unsigned_integer (k, 0, len - 1);
1211 else
1212 scm_out_of_range (NULL, k);
1213
1214 if (scm_i_is_narrow_string (str))
1215 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1216 else
1217 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1218 }
1219 #undef FUNC_NAME
1220
1221 SCM
1222 scm_c_string_ref (SCM str, size_t p)
1223 {
1224 if (p >= scm_i_string_length (str))
1225 scm_out_of_range (NULL, scm_from_size_t (p));
1226 if (scm_i_is_narrow_string (str))
1227 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1228 else
1229 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1230
1231 }
1232
1233 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1234 (SCM str, SCM k, SCM chr),
1235 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1236 "an unspecified value. @var{k} must be a valid index of\n"
1237 "@var{str}.")
1238 #define FUNC_NAME s_scm_string_set_x
1239 {
1240 size_t len;
1241 unsigned long idx;
1242
1243 SCM_VALIDATE_STRING (1, str);
1244
1245 len = scm_i_string_length (str);
1246 if (SCM_LIKELY (len > 0))
1247 idx = scm_to_unsigned_integer (k, 0, len - 1);
1248 else
1249 scm_out_of_range (NULL, k);
1250
1251 SCM_VALIDATE_CHAR (3, chr);
1252 str = scm_i_string_start_writing (str);
1253 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1254 scm_i_string_stop_writing ();
1255
1256 return SCM_UNSPECIFIED;
1257 }
1258 #undef FUNC_NAME
1259
1260 void
1261 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1262 {
1263 if (p >= scm_i_string_length (str))
1264 scm_out_of_range (NULL, scm_from_size_t (p));
1265 str = scm_i_string_start_writing (str);
1266 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1267 scm_i_string_stop_writing ();
1268 }
1269
1270 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1271 (SCM str, SCM start, SCM end),
1272 "Return a newly allocated string formed from the characters\n"
1273 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1274 "ending with index @var{end} (exclusive).\n"
1275 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1276 "exact integers satisfying:\n\n"
1277 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1278 #define FUNC_NAME s_scm_substring
1279 {
1280 size_t len, from, to;
1281
1282 SCM_VALIDATE_STRING (1, str);
1283 len = scm_i_string_length (str);
1284 from = scm_to_unsigned_integer (start, 0, len);
1285 if (SCM_UNBNDP (end))
1286 to = len;
1287 else
1288 to = scm_to_unsigned_integer (end, from, len);
1289 return scm_i_substring (str, from, to);
1290 }
1291 #undef FUNC_NAME
1292
1293 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1294 (SCM str, SCM start, SCM end),
1295 "Return a newly allocated string formed from the characters\n"
1296 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1297 "ending with index @var{end} (exclusive).\n"
1298 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1299 "exact integers satisfying:\n"
1300 "\n"
1301 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1302 "\n"
1303 "The returned string is read-only.\n")
1304 #define FUNC_NAME s_scm_substring_read_only
1305 {
1306 size_t len, from, to;
1307
1308 SCM_VALIDATE_STRING (1, str);
1309 len = scm_i_string_length (str);
1310 from = scm_to_unsigned_integer (start, 0, len);
1311 if (SCM_UNBNDP (end))
1312 to = len;
1313 else
1314 to = scm_to_unsigned_integer (end, from, len);
1315 return scm_i_substring_read_only (str, from, to);
1316 }
1317 #undef FUNC_NAME
1318
1319 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1320 (SCM str, SCM start, SCM end),
1321 "Return a newly allocated string formed from the characters\n"
1322 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1323 "ending with index @var{end} (exclusive).\n"
1324 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1325 "exact integers satisfying:\n\n"
1326 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1327 #define FUNC_NAME s_scm_substring_copy
1328 {
1329 /* For the Scheme version, START is mandatory, but for the C
1330 version, it is optional. See scm_string_copy in srfi-13.c for a
1331 rationale.
1332 */
1333
1334 size_t from, to;
1335
1336 SCM_VALIDATE_STRING (1, str);
1337 scm_i_get_substring_spec (scm_i_string_length (str),
1338 start, &from, end, &to);
1339 return scm_i_substring_copy (str, from, to);
1340 }
1341 #undef FUNC_NAME
1342
1343 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1344 (SCM str, SCM start, SCM end),
1345 "Return string that indirectly refers to the characters\n"
1346 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1347 "ending with index @var{end} (exclusive).\n"
1348 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1349 "exact integers satisfying:\n\n"
1350 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1351 #define FUNC_NAME s_scm_substring_shared
1352 {
1353 size_t len, from, to;
1354
1355 SCM_VALIDATE_STRING (1, str);
1356 len = scm_i_string_length (str);
1357 from = scm_to_unsigned_integer (start, 0, len);
1358 if (SCM_UNBNDP (end))
1359 to = len;
1360 else
1361 to = scm_to_unsigned_integer (end, from, len);
1362 return scm_i_substring_shared (str, from, to);
1363 }
1364 #undef FUNC_NAME
1365
1366 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1367 (SCM args),
1368 "Return a newly allocated string whose characters form the\n"
1369 "concatenation of the given strings, @var{args}.")
1370 #define FUNC_NAME s_scm_string_append
1371 {
1372 SCM res;
1373 size_t len = 0;
1374 int wide = 0;
1375 SCM l, s;
1376 size_t i;
1377 union
1378 {
1379 char *narrow;
1380 scm_t_wchar *wide;
1381 } data;
1382
1383 SCM_VALIDATE_REST_ARGUMENT (args);
1384 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1385 {
1386 s = SCM_CAR (l);
1387 SCM_VALIDATE_STRING (SCM_ARGn, s);
1388 len += scm_i_string_length (s);
1389 if (!scm_i_is_narrow_string (s))
1390 wide = 1;
1391 }
1392 data.narrow = NULL;
1393 if (!wide)
1394 res = scm_i_make_string (len, &data.narrow, 0);
1395 else
1396 res = scm_i_make_wide_string (len, &data.wide, 0);
1397
1398 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1399 {
1400 size_t len;
1401 s = SCM_CAR (l);
1402 SCM_VALIDATE_STRING (SCM_ARGn, s);
1403 len = scm_i_string_length (s);
1404 if (!wide)
1405 {
1406 memcpy (data.narrow, scm_i_string_chars (s), len);
1407 data.narrow += len;
1408 }
1409 else
1410 {
1411 if (scm_i_is_narrow_string (s))
1412 {
1413 for (i = 0; i < scm_i_string_length (s); i++)
1414 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1415 }
1416 else
1417 u32_cpy ((scm_t_uint32 *) data.wide,
1418 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1419 data.wide += len;
1420 }
1421 scm_remember_upto_here_1 (s);
1422 }
1423 return res;
1424 }
1425 #undef FUNC_NAME
1426
1427
1428 \f
1429 /* Charset conversion error handling. */
1430
1431 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1432 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1433
1434 /* Raise an exception informing that character CHR could not be written
1435 to PORT in its current encoding. */
1436 void
1437 scm_encoding_error (const char *subr, int err, const char *message,
1438 SCM port, SCM chr)
1439 {
1440 scm_throw (scm_encoding_error_key,
1441 scm_list_n (scm_from_latin1_string (subr),
1442 scm_from_latin1_string (message),
1443 scm_from_int (err),
1444 port, chr,
1445 SCM_UNDEFINED));
1446 }
1447
1448 /* Raise an exception informing of an encoding error on PORT. This
1449 means that a character could not be written in PORT's encoding. */
1450 void
1451 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1452 {
1453 scm_throw (scm_decoding_error_key,
1454 scm_list_n (scm_from_latin1_string (subr),
1455 scm_from_latin1_string (message),
1456 scm_from_int (err),
1457 port,
1458 SCM_UNDEFINED));
1459 }
1460
1461 \f
1462 /* String conversion to/from C. */
1463
1464 SCM
1465 scm_from_stringn (const char *str, size_t len, const char *encoding,
1466 scm_t_string_failed_conversion_handler handler)
1467 {
1468 size_t u32len, i;
1469 scm_t_wchar *u32;
1470 int wide = 0;
1471 SCM res;
1472
1473 /* The order of these checks is important. */
1474 if (!str && len != 0)
1475 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1476 if (len == (size_t) -1)
1477 len = strlen (str);
1478
1479 if (encoding == NULL || len == 0)
1480 {
1481 /* If encoding is null (or the string is empty), use Latin-1. */
1482 char *buf;
1483 res = scm_i_make_string (len, &buf, 0);
1484 memcpy (buf, str, len);
1485 return res;
1486 }
1487
1488 u32len = 0;
1489 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1490 (enum iconv_ilseq_handler)
1491 handler,
1492 str, len,
1493 NULL,
1494 NULL, &u32len);
1495
1496 if (SCM_UNLIKELY (u32 == NULL))
1497 {
1498 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1499 handler. */
1500 SCM bv;
1501 signed char *buf;
1502
1503 buf = scm_gc_malloc_pointerless (len, "bytevector");
1504 memcpy (buf, str, len);
1505 bv = scm_c_take_gc_bytevector (buf, len);
1506
1507 scm_decoding_error (__func__, errno,
1508 "input locale conversion error", bv);
1509 }
1510
1511 i = 0;
1512 while (i < u32len)
1513 if (u32[i++] > 0xFF)
1514 {
1515 wide = 1;
1516 break;
1517 }
1518
1519 if (!wide)
1520 {
1521 char *dst;
1522 res = scm_i_make_string (u32len, &dst, 0);
1523 for (i = 0; i < u32len; i ++)
1524 dst[i] = (unsigned char) u32[i];
1525 dst[u32len] = '\0';
1526 }
1527 else
1528 {
1529 scm_t_wchar *wdst;
1530 res = scm_i_make_wide_string (u32len, &wdst, 0);
1531 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1532 wdst[u32len] = 0;
1533 }
1534
1535 free (u32);
1536 return res;
1537 }
1538
1539 SCM
1540 scm_from_locale_string (const char *str)
1541 {
1542 return scm_from_locale_stringn (str, -1);
1543 }
1544
1545 SCM
1546 scm_from_locale_stringn (const char *str, size_t len)
1547 {
1548 return scm_from_stringn (str, len, locale_charset (),
1549 scm_i_get_conversion_strategy (SCM_BOOL_F));
1550 }
1551
1552 SCM
1553 scm_from_latin1_string (const char *str)
1554 {
1555 return scm_from_latin1_stringn (str, -1);
1556 }
1557
1558 SCM
1559 scm_from_latin1_stringn (const char *str, size_t len)
1560 {
1561 char *buf;
1562 SCM result;
1563
1564 if (len == (size_t) -1)
1565 len = strlen (str);
1566
1567 /* Make a narrow string and copy STR as is. */
1568 result = scm_i_make_string (len, &buf, 0);
1569 memcpy (buf, str, len);
1570
1571 return result;
1572 }
1573
1574 SCM
1575 scm_from_utf8_string (const char *str)
1576 {
1577 return scm_from_utf8_stringn (str, -1);
1578 }
1579
1580 SCM
1581 scm_from_utf8_stringn (const char *str, size_t len)
1582 {
1583 return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1584 }
1585
1586 SCM
1587 scm_from_utf32_string (const scm_t_wchar *str)
1588 {
1589 return scm_from_utf32_stringn (str, -1);
1590 }
1591
1592 SCM
1593 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1594 {
1595 SCM result;
1596 scm_t_wchar *buf;
1597
1598 if (len == (size_t) -1)
1599 len = u32_strlen ((uint32_t *) str);
1600
1601 result = scm_i_make_wide_string (len, &buf, 0);
1602 memcpy (buf, str, len * sizeof (scm_t_wchar));
1603 scm_i_try_narrow_string (result);
1604
1605 return result;
1606 }
1607
1608 /* Create a new scheme string from the C string STR. The memory of
1609 STR may be used directly as storage for the new string. */
1610 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1611 would be to register a finalizer to eventually free(3) STR, which isn't
1612 worth it. Should we just deprecate the `scm_take_' functions? */
1613 SCM
1614 scm_take_locale_stringn (char *str, size_t len)
1615 {
1616 SCM res;
1617
1618 res = scm_from_locale_stringn (str, len);
1619 free (str);
1620
1621 return res;
1622 }
1623
1624 SCM
1625 scm_take_locale_string (char *str)
1626 {
1627 return scm_take_locale_stringn (str, -1);
1628 }
1629
1630 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1631 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1632 Set *LENP to the size of the resulting string.
1633
1634 FIXME: This is a hack we should get rid of. See
1635 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1636 for details. */
1637 static void
1638 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1639 {
1640 char *before, *after;
1641 size_t i, j;
1642
1643 before = buf;
1644 after = buf;
1645 i = 0;
1646 j = 0;
1647 while (i < *lenp)
1648 {
1649 if ((i <= *lenp - 6)
1650 && before[i] == '\\'
1651 && before[i + 1] == 'u'
1652 && before[i + 2] == '0' && before[i + 3] == '0')
1653 {
1654 /* Convert \u00NN to \xNN */
1655 after[j] = '\\';
1656 after[j + 1] = 'x';
1657 after[j + 2] = tolower ((int) before[i + 4]);
1658 after[j + 3] = tolower ((int) before[i + 5]);
1659 i += 6;
1660 j += 4;
1661 }
1662 else if ((i <= *lenp - 10)
1663 && before[i] == '\\'
1664 && before[i + 1] == 'U'
1665 && before[i + 2] == '0' && before[i + 3] == '0')
1666 {
1667 /* Convert \U00NNNNNN to \UNNNNNN */
1668 after[j] = '\\';
1669 after[j + 1] = 'U';
1670 after[j + 2] = tolower ((int) before[i + 4]);
1671 after[j + 3] = tolower ((int) before[i + 5]);
1672 after[j + 4] = tolower ((int) before[i + 6]);
1673 after[j + 5] = tolower ((int) before[i + 7]);
1674 after[j + 6] = tolower ((int) before[i + 8]);
1675 after[j + 7] = tolower ((int) before[i + 9]);
1676 i += 10;
1677 j += 8;
1678 }
1679 else
1680 {
1681 after[j] = before[i];
1682 i++;
1683 j++;
1684 }
1685 }
1686 *lenp = j;
1687 }
1688
1689 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1690 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1691 of the resulting string. BUF must be large enough to handle the
1692 worst case when `\uXXXX' escapes (6 characters) are replaced by
1693 `\xXXXX;' (7 characters). */
1694 static void
1695 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1696 {
1697 char *before, *after;
1698 size_t i, j;
1699 /* The worst case is if the input string contains all 4-digit hex escapes.
1700 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1701 size_t max_out_len = (*lenp * 7) / 6 + 1;
1702 size_t nzeros, ndigits;
1703
1704 before = buf;
1705 after = alloca (max_out_len);
1706 i = 0;
1707 j = 0;
1708 while (i < *lenp)
1709 {
1710 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1711 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1712 {
1713 if (before[i + 1] == 'u')
1714 ndigits = 4;
1715 else if (before[i + 1] == 'U')
1716 ndigits = 8;
1717 else
1718 abort ();
1719
1720 /* Add the R6RS hex escape initial sequence. */
1721 after[j] = '\\';
1722 after[j + 1] = 'x';
1723
1724 /* Move string positions to the start of the hex numbers. */
1725 i += 2;
1726 j += 2;
1727
1728 /* Find the number of initial zeros in this hex number. */
1729 nzeros = 0;
1730 while (before[i + nzeros] == '0' && nzeros < ndigits)
1731 nzeros++;
1732
1733 /* Copy the number, skipping initial zeros, and then move the string
1734 positions. */
1735 if (nzeros == ndigits)
1736 {
1737 after[j] = '0';
1738 i += ndigits;
1739 j += 1;
1740 }
1741 else
1742 {
1743 int pos;
1744 for (pos = 0; pos < ndigits - nzeros; pos++)
1745 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1746 i += ndigits;
1747 j += (ndigits - nzeros);
1748 }
1749
1750 /* Add terminating semicolon. */
1751 after[j] = ';';
1752 j++;
1753 }
1754 else
1755 {
1756 after[j] = before[i];
1757 i++;
1758 j++;
1759 }
1760 }
1761 *lenp = j;
1762 memcpy (before, after, j);
1763 }
1764
1765 char *
1766 scm_to_locale_string (SCM str)
1767 {
1768 return scm_to_locale_stringn (str, NULL);
1769 }
1770
1771 char *
1772 scm_to_locale_stringn (SCM str, size_t *lenp)
1773 {
1774 return scm_to_stringn (str, lenp,
1775 locale_charset (),
1776 scm_i_get_conversion_strategy (SCM_BOOL_F));
1777 }
1778
1779 char *
1780 scm_to_latin1_string (SCM str)
1781 {
1782 return scm_to_latin1_stringn (str, NULL);
1783 }
1784
1785 char *
1786 scm_to_latin1_stringn (SCM str, size_t *lenp)
1787 #define FUNC_NAME "scm_to_latin1_stringn"
1788 {
1789 char *result;
1790
1791 SCM_VALIDATE_STRING (1, str);
1792
1793 if (scm_i_is_narrow_string (str))
1794 {
1795 size_t len = scm_i_string_length (str);
1796
1797 if (lenp)
1798 *lenp = len;
1799
1800 result = scm_strndup (scm_i_string_data (str), len);
1801 }
1802 else
1803 result = scm_to_stringn (str, lenp, NULL,
1804 SCM_FAILED_CONVERSION_ERROR);
1805
1806 return result;
1807 }
1808 #undef FUNC_NAME
1809
1810 char *
1811 scm_to_utf8_string (SCM str)
1812 {
1813 return scm_to_utf8_stringn (str, NULL);
1814 }
1815
1816 char *
1817 scm_to_utf8_stringn (SCM str, size_t *lenp)
1818 {
1819 return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1820 }
1821
1822 scm_t_wchar *
1823 scm_to_utf32_string (SCM str)
1824 {
1825 return scm_to_utf32_stringn (str, NULL);
1826 }
1827
1828 scm_t_wchar *
1829 scm_to_utf32_stringn (SCM str, size_t *lenp)
1830 #define FUNC_NAME "scm_to_utf32_stringn"
1831 {
1832 scm_t_wchar *result;
1833
1834 SCM_VALIDATE_STRING (1, str);
1835
1836 if (scm_i_is_narrow_string (str))
1837 result = (scm_t_wchar *)
1838 scm_to_stringn (str, lenp, "UTF-32",
1839 SCM_FAILED_CONVERSION_ERROR);
1840 else
1841 {
1842 size_t len;
1843
1844 len = scm_i_string_length (str);
1845 if (lenp)
1846 *lenp = len;
1847
1848 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1849 memcpy (result, scm_i_string_wide_chars (str),
1850 len * sizeof (scm_t_wchar));
1851 result[len] = 0;
1852 }
1853
1854 return result;
1855 }
1856 #undef FUNC_NAME
1857
1858 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1859 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1860 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1861 defined by HANDLER. */
1862 char *
1863 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1864 scm_t_string_failed_conversion_handler handler)
1865 {
1866 char *buf;
1867 size_t ilen, len, i;
1868 int ret;
1869 const char *enc;
1870
1871 if (!scm_is_string (str))
1872 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1873 ilen = scm_i_string_length (str);
1874
1875 if (ilen == 0)
1876 {
1877 buf = scm_malloc (1);
1878 buf[0] = '\0';
1879 if (lenp)
1880 *lenp = 0;
1881 return buf;
1882 }
1883
1884 if (lenp == NULL)
1885 for (i = 0; i < ilen; i++)
1886 if (scm_i_string_ref (str, i) == '\0')
1887 scm_misc_error (NULL,
1888 "string contains #\\nul character: ~S",
1889 scm_list_1 (str));
1890
1891 if (scm_i_is_narrow_string (str) && (encoding == NULL))
1892 {
1893 /* If using native Latin-1 encoding, just copy the string
1894 contents. */
1895 if (lenp)
1896 {
1897 buf = scm_malloc (ilen);
1898 memcpy (buf, scm_i_string_chars (str), ilen);
1899 *lenp = ilen;
1900 return buf;
1901 }
1902 else
1903 {
1904 buf = scm_malloc (ilen + 1);
1905 memcpy (buf, scm_i_string_chars (str), ilen);
1906 buf[ilen] = '\0';
1907 return buf;
1908 }
1909 }
1910
1911
1912 buf = NULL;
1913 len = 0;
1914 enc = encoding;
1915 if (enc == NULL)
1916 enc = "ISO-8859-1";
1917 if (scm_i_is_narrow_string (str))
1918 {
1919 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1920 "ISO-8859-1", enc,
1921 (enum iconv_ilseq_handler) handler, NULL,
1922 &buf, &len);
1923
1924 if (ret != 0)
1925 scm_encoding_error (__func__, errno,
1926 "cannot convert narrow string to output locale",
1927 SCM_BOOL_F,
1928 /* FIXME: Faulty character unknown. */
1929 SCM_BOOL_F);
1930 }
1931 else
1932 {
1933 buf = u32_conv_to_encoding (enc,
1934 (enum iconv_ilseq_handler) handler,
1935 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1936 ilen,
1937 NULL,
1938 NULL, &len);
1939 if (buf == NULL)
1940 scm_encoding_error (__func__, errno,
1941 "cannot convert wide string to output locale",
1942 SCM_BOOL_F,
1943 /* FIXME: Faulty character unknown. */
1944 SCM_BOOL_F);
1945 }
1946 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1947 {
1948 if (SCM_R6RS_ESCAPES_P)
1949 {
1950 /* The worst case is if the input string contains all 4-digit
1951 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
1952 (seven characters). Make BUF large enough to hold
1953 that. */
1954 buf = scm_realloc (buf, (len * 7) / 6 + 1);
1955 unistring_escapes_to_r6rs_escapes (buf, &len);
1956 }
1957 else
1958 unistring_escapes_to_guile_escapes (buf, &len);
1959
1960 buf = scm_realloc (buf, len);
1961 }
1962 if (lenp)
1963 *lenp = len;
1964 else
1965 {
1966 buf = scm_realloc (buf, len + 1);
1967 buf[len] = '\0';
1968 }
1969
1970 scm_remember_upto_here_1 (str);
1971 return buf;
1972 }
1973
1974 size_t
1975 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1976 {
1977 size_t len;
1978 char *result = NULL;
1979 if (!scm_is_string (str))
1980 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1981 result = scm_to_locale_stringn (str, &len);
1982
1983 memcpy (buf, result, (len > max_len) ? max_len : len);
1984 free (result);
1985
1986 scm_remember_upto_here_1 (str);
1987 return len;
1988 }
1989
1990 \f
1991 /* Unicode string normalization. */
1992
1993 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1994 libguile/i18n.c. It would be useful to have this factored out into a more
1995 convenient location, but its use of alloca makes that tricky to do. */
1996
1997 static SCM
1998 normalize_str (SCM string, uninorm_t form)
1999 {
2000 SCM ret;
2001 scm_t_uint32 *w_str;
2002 scm_t_wchar *cbuf;
2003 size_t rlen, len = scm_i_string_length (string);
2004
2005 if (scm_i_is_narrow_string (string))
2006 {
2007 size_t i;
2008 const char *buf = scm_i_string_chars (string);
2009
2010 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2011
2012 for (i = 0; i < len; i ++)
2013 w_str[i] = (unsigned char) buf[i];
2014 w_str[len] = 0;
2015 }
2016 else
2017 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2018
2019 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2020
2021 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2022 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2023 free (w_str);
2024
2025 scm_i_try_narrow_string (ret);
2026
2027 return ret;
2028 }
2029
2030 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2031 (SCM string),
2032 "Returns the NFC normalized form of @var{string}.")
2033 #define FUNC_NAME s_scm_string_normalize_nfc
2034 {
2035 SCM_VALIDATE_STRING (1, string);
2036 return normalize_str (string, UNINORM_NFC);
2037 }
2038 #undef FUNC_NAME
2039
2040 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2041 (SCM string),
2042 "Returns the NFD normalized form of @var{string}.")
2043 #define FUNC_NAME s_scm_string_normalize_nfd
2044 {
2045 SCM_VALIDATE_STRING (1, string);
2046 return normalize_str (string, UNINORM_NFD);
2047 }
2048 #undef FUNC_NAME
2049
2050 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2051 (SCM string),
2052 "Returns the NFKC normalized form of @var{string}.")
2053 #define FUNC_NAME s_scm_string_normalize_nfkc
2054 {
2055 SCM_VALIDATE_STRING (1, string);
2056 return normalize_str (string, UNINORM_NFKC);
2057 }
2058 #undef FUNC_NAME
2059
2060 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2061 (SCM string),
2062 "Returns the NFKD normalized form of @var{string}.")
2063 #define FUNC_NAME s_scm_string_normalize_nfkd
2064 {
2065 SCM_VALIDATE_STRING (1, string);
2066 return normalize_str (string, UNINORM_NFKD);
2067 }
2068 #undef FUNC_NAME
2069
2070 /* converts C scm_array of strings to SCM scm_list of strings.
2071 If argc < 0, a null terminated scm_array is assumed.
2072 The current locale encoding is assumed */
2073 SCM
2074 scm_makfromstrs (int argc, char **argv)
2075 {
2076 int i = argc;
2077 SCM lst = SCM_EOL;
2078 if (0 > i)
2079 for (i = 0; argv[i]; i++);
2080 while (i--)
2081 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2082 return lst;
2083 }
2084
2085 /* Return a newly allocated array of char pointers to each of the strings
2086 in args, with a terminating NULL pointer. The strings are encoded using
2087 the current locale. */
2088
2089 char **
2090 scm_i_allocate_string_pointers (SCM list)
2091 #define FUNC_NAME "scm_i_allocate_string_pointers"
2092 {
2093 char **result;
2094 int list_len = scm_ilength (list);
2095 int i;
2096
2097 if (list_len < 0)
2098 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2099
2100 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2101 "string pointers");
2102 result[list_len] = NULL;
2103
2104 /* The list might have been modified in another thread, so
2105 we check LIST before each access.
2106 */
2107 for (i = 0; i < list_len && scm_is_pair (list); i++)
2108 {
2109 SCM str = SCM_CAR (list);
2110 size_t len; /* String length in bytes */
2111 char *c_str = scm_to_locale_stringn (str, &len);
2112
2113 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2114 scm_malloc to allocate the returned string, which must be
2115 explicitly deallocated. This forces us to copy the string a
2116 second time into a new buffer. Ideally there would be variants
2117 of scm_to_*_stringn that can return garbage-collected buffers. */
2118
2119 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2120 memcpy (result[i], c_str, len);
2121 result[i][len] = '\0';
2122 free (c_str);
2123
2124 list = SCM_CDR (list);
2125 }
2126
2127 return result;
2128 }
2129 #undef FUNC_NAME
2130
2131 void
2132 scm_i_get_substring_spec (size_t len,
2133 SCM start, size_t *cstart,
2134 SCM end, size_t *cend)
2135 {
2136 if (SCM_UNBNDP (start))
2137 *cstart = 0;
2138 else
2139 *cstart = scm_to_unsigned_integer (start, 0, len);
2140
2141 if (SCM_UNBNDP (end))
2142 *cend = len;
2143 else
2144 *cend = scm_to_unsigned_integer (end, *cstart, len);
2145 }
2146
2147 #if SCM_ENABLE_DEPRECATED
2148
2149 /* When these definitions are removed, it becomes reasonable to use
2150 read-only strings for string literals. For that, change the reader
2151 to create string literals with scm_c_substring_read_only instead of
2152 with scm_c_substring_copy.
2153 */
2154
2155 int
2156 scm_i_deprecated_stringp (SCM str)
2157 {
2158 scm_c_issue_deprecation_warning
2159 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2160
2161 return scm_is_string (str);
2162 }
2163
2164 char *
2165 scm_i_deprecated_string_chars (SCM str)
2166 {
2167 char *chars;
2168
2169 scm_c_issue_deprecation_warning
2170 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2171
2172 /* We don't accept shared substrings here since they are not
2173 null-terminated.
2174 */
2175 if (IS_SH_STRING (str))
2176 scm_misc_error (NULL,
2177 "SCM_STRING_CHARS does not work with shared substrings",
2178 SCM_EOL);
2179
2180 /* We explicitly test for read-only strings to produce a better
2181 error message.
2182 */
2183
2184 if (IS_RO_STRING (str))
2185 scm_misc_error (NULL,
2186 "SCM_STRING_CHARS does not work with read-only strings",
2187 SCM_EOL);
2188
2189 /* The following is still wrong, of course...
2190 */
2191 str = scm_i_string_start_writing (str);
2192 chars = scm_i_string_writable_chars (str);
2193 scm_i_string_stop_writing ();
2194 return chars;
2195 }
2196
2197 size_t
2198 scm_i_deprecated_string_length (SCM str)
2199 {
2200 scm_c_issue_deprecation_warning
2201 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2202 return scm_c_string_length (str);
2203 }
2204
2205 #endif
2206
2207 static SCM
2208 string_handle_ref (scm_t_array_handle *h, size_t index)
2209 {
2210 return scm_c_string_ref (h->array, index);
2211 }
2212
2213 static void
2214 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2215 {
2216 scm_c_string_set_x (h->array, index, val);
2217 }
2218
2219 static void
2220 string_get_handle (SCM v, scm_t_array_handle *h)
2221 {
2222 h->array = v;
2223 h->ndims = 1;
2224 h->dims = &h->dim0;
2225 h->dim0.lbnd = 0;
2226 h->dim0.ubnd = scm_c_string_length (v) - 1;
2227 h->dim0.inc = 1;
2228 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2229 h->elements = h->writable_elements = NULL;
2230 }
2231
2232 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2233 string_handle_ref, string_handle_set,
2234 string_get_handle)
2235 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2236
2237 void
2238 scm_init_strings ()
2239 {
2240 scm_nullstr = scm_i_make_string (0, NULL, 0);
2241
2242 #include "libguile/strings.x"
2243 }
2244
2245
2246 /*
2247 Local Variables:
2248 c-file-style: "gnu"
2249 End:
2250 */