Avoid calling `u32_conv_from_encoding' on the null string
[bpt/guile.git] / libguile / strings.c
1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #include <alloca.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <uninorm.h>
30 #include <unistr.h>
31 #include <uniconv.h>
32
33 #include "striconveh.h"
34
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
44
45 \f
46
47 /* {Strings}
48 */
49
50
51 /* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
68 *
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
72 */
73
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
76
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
81
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
85
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
91
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
93
94 #define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
103
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist[1001];
106 #endif
107
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
110 static SCM
111 make_stringbuf (size_t len)
112 {
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
120 SCM buf;
121
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127 #endif
128
129 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
138 }
139
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
142 static SCM
143 make_wide_stringbuf (size_t len)
144 {
145 SCM buf;
146 size_t raw_len;
147
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153 #endif
154
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
165 }
166
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169 static SCM
170 wide_stringbuf (SCM buf)
171 {
172 SCM new_buf;
173
174 if (STRINGBUF_WIDE (buf))
175 new_buf = buf;
176 else
177 {
178 size_t i, len;
179 scm_t_wchar *mem;
180
181 len = STRINGBUF_LENGTH (buf);
182
183 new_buf = make_wide_stringbuf (len);
184
185 mem = STRINGBUF_WIDE_CHARS (new_buf);
186 for (i = 0; i < len; i++)
187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
188 mem[len] = 0;
189 }
190
191 return new_buf;
192 }
193
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196 static SCM
197 narrow_stringbuf (SCM buf)
198 {
199 SCM new_buf;
200
201 if (!STRINGBUF_WIDE (buf))
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
208
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
211
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
216
217 new_buf = make_stringbuf (len);
218
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
226 }
227
228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
229
230 \f
231 /* Copy-on-write strings.
232 */
233
234 #define STRING_TAG scm_tc7_string
235
236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
239
240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243 #define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
244
245 /* Read-only strings.
246 */
247
248 #define RO_STRING_TAG scm_tc7_ro_string
249 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
251 /* Mutation-sharing substrings
252 */
253
254 #define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257 /* START and LENGTH as for STRINGs. */
258
259 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
261 SCM scm_nullstr;
262
263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
267 SCM
268 scm_i_make_string (size_t len, char **charsp, int read_only_p)
269 {
270 SCM buf = make_stringbuf (len);
271 SCM res;
272 if (charsp)
273 *charsp = (char *) STRINGBUF_CHARS (buf);
274 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
275 SCM_UNPACK (buf),
276 (scm_t_bits) 0, (scm_t_bits) len);
277 return res;
278 }
279
280 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
281 characters. CHARSP, if not NULL, will be set to location of the
282 character array. If READ_ONLY_P, the returned string is read-only;
283 otherwise it is writable. */
284 SCM
285 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
286 {
287 SCM buf = make_wide_stringbuf (len);
288 SCM res;
289 if (charsp)
290 *charsp = STRINGBUF_WIDE_CHARS (buf);
291 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
292 SCM_UNPACK (buf),
293 (scm_t_bits) 0, (scm_t_bits) len);
294 return res;
295 }
296
297 static void
298 validate_substring_args (SCM str, size_t start, size_t end)
299 {
300 if (!IS_STRING (str))
301 scm_wrong_type_arg_msg (NULL, 0, str, "string");
302 if (start > STRING_LENGTH (str))
303 scm_out_of_range (NULL, scm_from_size_t (start));
304 if (end > STRING_LENGTH (str) || end < start)
305 scm_out_of_range (NULL, scm_from_size_t (end));
306 }
307
308 static inline void
309 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
310 {
311 *start = STRING_START (*str);
312 if (IS_SH_STRING (*str))
313 {
314 *str = SH_STRING_STRING (*str);
315 *start += STRING_START (*str);
316 }
317 *buf = STRING_STRINGBUF (*str);
318 }
319
320 SCM
321 scm_i_substring (SCM str, size_t start, size_t end)
322 {
323 SCM buf;
324 size_t str_start;
325 get_str_buf_start (&str, &buf, &str_start);
326 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
327 SET_STRINGBUF_SHARED (buf);
328 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
329 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
330 (scm_t_bits)str_start + start,
331 (scm_t_bits) end - start);
332 }
333
334 SCM
335 scm_i_substring_read_only (SCM str, size_t start, size_t end)
336 {
337 SCM buf;
338 size_t str_start;
339 get_str_buf_start (&str, &buf, &str_start);
340 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
341 SET_STRINGBUF_SHARED (buf);
342 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
343 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
344 (scm_t_bits)str_start + start,
345 (scm_t_bits) end - start);
346 }
347
348 SCM
349 scm_i_substring_copy (SCM str, size_t start, size_t end)
350 {
351 size_t len = end - start;
352 SCM buf, my_buf;
353 size_t str_start;
354 get_str_buf_start (&str, &buf, &str_start);
355 if (scm_i_is_narrow_string (str))
356 {
357 my_buf = make_stringbuf (len);
358 memcpy (STRINGBUF_CHARS (my_buf),
359 STRINGBUF_CHARS (buf) + str_start + start, len);
360 }
361 else
362 {
363 my_buf = make_wide_stringbuf (len);
364 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
365 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
366 + start), len);
367 /* Even though this string is wide, the substring may be narrow.
368 Consider adding code to narrow the string. */
369 }
370 scm_remember_upto_here_1 (buf);
371 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
372 (scm_t_bits) 0, (scm_t_bits) len);
373 }
374
375 SCM
376 scm_i_substring_shared (SCM str, size_t start, size_t end)
377 {
378 if (start == 0 && end == STRING_LENGTH (str))
379 return str;
380 else
381 {
382 size_t len = end - start;
383 if (IS_SH_STRING (str))
384 {
385 start += STRING_START (str);
386 str = SH_STRING_STRING (str);
387 }
388 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
389 (scm_t_bits)start, (scm_t_bits) len);
390 }
391 }
392
393 SCM
394 scm_c_substring (SCM str, size_t start, size_t end)
395 {
396 validate_substring_args (str, start, end);
397 return scm_i_substring (str, start, end);
398 }
399
400 SCM
401 scm_c_substring_read_only (SCM str, size_t start, size_t end)
402 {
403 validate_substring_args (str, start, end);
404 return scm_i_substring_read_only (str, start, end);
405 }
406
407 SCM
408 scm_c_substring_copy (SCM str, size_t start, size_t end)
409 {
410 validate_substring_args (str, start, end);
411 return scm_i_substring_copy (str, start, end);
412 }
413
414 SCM
415 scm_c_substring_shared (SCM str, size_t start, size_t end)
416 {
417 validate_substring_args (str, start, end);
418 return scm_i_substring_shared (str, start, end);
419 }
420
421 \f
422 /* Internal accessors
423 */
424
425 /* Returns the number of characters in STR. This may be different
426 than the memory size of the string storage. */
427 size_t
428 scm_i_string_length (SCM str)
429 {
430 return STRING_LENGTH (str);
431 }
432
433 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
434 encoding. False if it is 'wide', having a 32-bit UCS-4
435 encoding. */
436 int
437 scm_i_is_narrow_string (SCM str)
438 {
439 if (IS_SH_STRING (str))
440 str = SH_STRING_STRING (str);
441
442 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
443 }
444
445 /* Try to coerce a string to be narrow. It if is narrow already, do
446 nothing. If it is wide, shrink it to narrow if none of its
447 characters are above 0xFF. Return true if the string is narrow or
448 was made to be narrow. */
449 int
450 scm_i_try_narrow_string (SCM str)
451 {
452 if (IS_SH_STRING (str))
453 str = SH_STRING_STRING (str);
454
455 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
456
457 return scm_i_is_narrow_string (str);
458 }
459
460 /* Return a pointer to the raw data of the string, which can be either Latin-1
461 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
462 const void *
463 scm_i_string_data (SCM str)
464 {
465 SCM buf;
466 size_t start;
467 const char *data;
468
469 get_str_buf_start (&str, &buf, &start);
470
471 data = STRINGBUF_CONTENTS (buf);
472 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
473
474 return data;
475 }
476
477 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
478 STR. */
479 const char *
480 scm_i_string_chars (SCM str)
481 {
482 SCM buf;
483 size_t start;
484 get_str_buf_start (&str, &buf, &start);
485 if (scm_i_is_narrow_string (str))
486 return (const char *) STRINGBUF_CHARS (buf) + start;
487 else
488 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
489 scm_list_1 (str));
490 return NULL;
491 }
492
493 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
494 STR. */
495 const scm_t_wchar *
496 scm_i_string_wide_chars (SCM str)
497 {
498 SCM buf;
499 size_t start;
500
501 get_str_buf_start (&str, &buf, &start);
502 if (!scm_i_is_narrow_string (str))
503 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
504 else
505 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
506 scm_list_1 (str));
507 }
508
509 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
510 a new string buffer, so that it can be modified without modifying
511 other strings. Also, lock the string mutex. Later, one must call
512 scm_i_string_stop_writing to unlock the mutex. */
513 SCM
514 scm_i_string_start_writing (SCM orig_str)
515 {
516 SCM buf, str = orig_str;
517 size_t start;
518
519 get_str_buf_start (&str, &buf, &start);
520 if (IS_RO_STRING (str))
521 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
522
523 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
524 if (STRINGBUF_SHARED (buf))
525 {
526 /* Clone the stringbuf. */
527 size_t len = STRING_LENGTH (str);
528 SCM new_buf;
529
530 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
531
532 if (scm_i_is_narrow_string (str))
533 {
534 new_buf = make_stringbuf (len);
535 memcpy (STRINGBUF_CHARS (new_buf),
536 STRINGBUF_CHARS (buf) + STRING_START (str), len);
537
538 }
539 else
540 {
541 new_buf = make_wide_stringbuf (len);
542 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
543 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
544 + STRING_START (str)), len);
545 }
546
547 SET_STRING_STRINGBUF (str, new_buf);
548 start -= STRING_START (str);
549
550 /* FIXME: The following operations are not atomic, so other threads
551 looking at STR may see an inconsistent state. Nevertheless it can't
552 hurt much since (i) accessing STR while it is being mutated can't
553 yield a crash, and (ii) concurrent accesses to STR should be
554 protected by a mutex at the application level. The latter may not
555 apply when STR != ORIG_STR, though. */
556 SET_STRING_START (str, 0);
557 SET_STRING_STRINGBUF (str, new_buf);
558
559 buf = new_buf;
560
561 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
562 }
563 return orig_str;
564 }
565
566 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
567 char *
568 scm_i_string_writable_chars (SCM str)
569 {
570 SCM buf;
571 size_t start;
572
573 get_str_buf_start (&str, &buf, &start);
574 if (scm_i_is_narrow_string (str))
575 return (char *) STRINGBUF_CHARS (buf) + start;
576 else
577 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
578 scm_list_1 (str));
579 return NULL;
580 }
581
582 /* Return a pointer to the UCS-4 codepoints of a string. */
583 static scm_t_wchar *
584 scm_i_string_writable_wide_chars (SCM str)
585 {
586 SCM buf;
587 size_t start;
588
589 get_str_buf_start (&str, &buf, &start);
590 if (!scm_i_is_narrow_string (str))
591 return STRINGBUF_WIDE_CHARS (buf) + start;
592 else
593 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
594 scm_list_1 (str));
595 }
596
597 /* Unlock the string mutex that was locked when
598 scm_i_string_start_writing was called. */
599 void
600 scm_i_string_stop_writing (void)
601 {
602 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
603 }
604
605 /* Return the Xth character of STR as a UCS-4 codepoint. */
606 scm_t_wchar
607 scm_i_string_ref (SCM str, size_t x)
608 {
609 if (scm_i_is_narrow_string (str))
610 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
611 else
612 return scm_i_string_wide_chars (str)[x];
613 }
614
615 /* Returns index+1 of the first char in STR that matches C, or
616 0 if the char is not found. */
617 int
618 scm_i_string_contains_char (SCM str, char ch)
619 {
620 size_t i;
621 size_t len = scm_i_string_length (str);
622
623 i = 0;
624 if (scm_i_is_narrow_string (str))
625 {
626 while (i < len)
627 {
628 if (scm_i_string_chars (str)[i] == ch)
629 return i+1;
630 i++;
631 }
632 }
633 else
634 {
635 while (i < len)
636 {
637 if (scm_i_string_wide_chars (str)[i]
638 == (unsigned char) ch)
639 return i+1;
640 i++;
641 }
642 }
643 return 0;
644 }
645
646 int
647 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
648 {
649 if (scm_i_is_narrow_string (sstr))
650 {
651 const char *a = scm_i_string_chars (sstr) + start_x;
652 const char *b = cstr;
653 return strncmp (a, b, strlen(b));
654 }
655 else
656 {
657 size_t i;
658 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
659 const char *b = cstr;
660 for (i = 0; i < strlen (b); i++)
661 {
662 if (a[i] != (unsigned char) b[i])
663 return 1;
664 }
665 }
666 return 0;
667 }
668
669 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
670 void
671 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
672 {
673 if (IS_SH_STRING (str))
674 {
675 p += STRING_START (str);
676 str = SH_STRING_STRING (str);
677 }
678
679 if (chr > 0xFF && scm_i_is_narrow_string (str))
680 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
681
682 if (scm_i_is_narrow_string (str))
683 {
684 char *dst = scm_i_string_writable_chars (str);
685 dst[p] = chr;
686 }
687 else
688 {
689 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
690 dst[p] = chr;
691 }
692 }
693
694 \f
695 /* Symbols.
696
697 Basic symbol creation and accessing is done here, the rest is in
698 symbols.[hc]. This has been done to keep stringbufs and the
699 internals of strings and string-like objects confined to this file.
700 */
701
702 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
703
704 SCM
705 scm_i_make_symbol (SCM name, scm_t_bits flags,
706 unsigned long hash, SCM props)
707 {
708 SCM buf;
709 size_t start = STRING_START (name);
710 size_t length = STRING_LENGTH (name);
711
712 if (IS_SH_STRING (name))
713 {
714 name = SH_STRING_STRING (name);
715 start += STRING_START (name);
716 }
717 buf = SYMBOL_STRINGBUF (name);
718
719 if (start == 0 && length == STRINGBUF_LENGTH (buf))
720 {
721 /* reuse buf. */
722 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
723 SET_STRINGBUF_SHARED (buf);
724 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
725 }
726 else
727 {
728 /* make new buf. */
729 if (scm_i_is_narrow_string (name))
730 {
731 SCM new_buf = make_stringbuf (length);
732 memcpy (STRINGBUF_CHARS (new_buf),
733 STRINGBUF_CHARS (buf) + start, length);
734 buf = new_buf;
735 }
736 else
737 {
738 SCM new_buf = make_wide_stringbuf (length);
739 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
740 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
741 length);
742 buf = new_buf;
743 }
744 }
745 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
746 (scm_t_bits) hash, SCM_UNPACK (props));
747 }
748
749 SCM
750 scm_i_c_make_symbol (const char *name, size_t len,
751 scm_t_bits flags, unsigned long hash, SCM props)
752 {
753 SCM buf = make_stringbuf (len);
754 memcpy (STRINGBUF_CHARS (buf), name, len);
755
756 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
757 (scm_t_bits) hash, SCM_UNPACK (props));
758 }
759
760 /* Returns the number of characters in SYM. This may be different
761 from the memory size of SYM. */
762 size_t
763 scm_i_symbol_length (SCM sym)
764 {
765 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
766 }
767
768 size_t
769 scm_c_symbol_length (SCM sym)
770 #define FUNC_NAME "scm_c_symbol_length"
771 {
772 SCM_VALIDATE_SYMBOL (1, sym);
773
774 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
775 }
776 #undef FUNC_NAME
777
778 /* True if the name of SYM is stored as a Latin-1 encoded string.
779 False if it is stored as a 32-bit UCS-4-encoded string. */
780 int
781 scm_i_is_narrow_symbol (SCM sym)
782 {
783 SCM buf;
784
785 buf = SYMBOL_STRINGBUF (sym);
786 return !STRINGBUF_WIDE (buf);
787 }
788
789 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
790 contains the name of SYM. */
791 const char *
792 scm_i_symbol_chars (SCM sym)
793 {
794 SCM buf;
795
796 buf = SYMBOL_STRINGBUF (sym);
797 if (!STRINGBUF_WIDE (buf))
798 return (const char *) STRINGBUF_CHARS (buf);
799 else
800 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
801 scm_list_1 (sym));
802 }
803
804 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
805 symbol's name. */
806 const scm_t_wchar *
807 scm_i_symbol_wide_chars (SCM sym)
808 {
809 SCM buf;
810
811 buf = SYMBOL_STRINGBUF (sym);
812 if (STRINGBUF_WIDE (buf))
813 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
814 else
815 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
816 scm_list_1 (sym));
817 }
818
819 SCM
820 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
821 {
822 SCM buf = SYMBOL_STRINGBUF (sym);
823 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
824 SET_STRINGBUF_SHARED (buf);
825 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
826 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
827 (scm_t_bits)start, (scm_t_bits) end - start);
828 }
829
830 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
831 scm_t_wchar
832 scm_i_symbol_ref (SCM sym, size_t x)
833 {
834 if (scm_i_is_narrow_symbol (sym))
835 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
836 else
837 return scm_i_symbol_wide_chars (sym)[x];
838 }
839
840 /* Debugging
841 */
842
843 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
844 "Returns an association list containing debugging information\n"
845 "for @var{str}. The association list has the following entries."
846 "@table @code\n"
847 "@item string\n"
848 "The string itself.\n"
849 "@item start\n"
850 "The start index of the string into its stringbuf\n"
851 "@item length\n"
852 "The length of the string\n"
853 "@item shared\n"
854 "If this string is a substring, it returns its parent string.\n"
855 "Otherwise, it returns @code{#f}\n"
856 "@item read-only\n"
857 "@code{#t} if the string is read-only\n"
858 "@item stringbuf-chars\n"
859 "A new string containing this string's stringbuf's characters\n"
860 "@item stringbuf-length\n"
861 "The number of characters in this stringbuf\n"
862 "@item stringbuf-shared\n"
863 "@code{#t} if this stringbuf is shared\n"
864 "@item stringbuf-wide\n"
865 "@code{#t} if this stringbuf's characters are stored in a\n"
866 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
867 "buffer\n"
868 "@end table")
869 #define FUNC_NAME s_scm_sys_string_dump
870 {
871 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
872 SCM buf;
873 SCM_VALIDATE_STRING (1, str);
874
875 /* String info */
876 e1 = scm_cons (scm_from_latin1_symbol ("string"),
877 str);
878 e2 = scm_cons (scm_from_latin1_symbol ("start"),
879 scm_from_size_t (STRING_START (str)));
880 e3 = scm_cons (scm_from_latin1_symbol ("length"),
881 scm_from_size_t (STRING_LENGTH (str)));
882
883 if (IS_SH_STRING (str))
884 {
885 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
886 SH_STRING_STRING (str));
887 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
888 }
889 else
890 {
891 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
892 SCM_BOOL_F);
893 buf = STRING_STRINGBUF (str);
894 }
895
896 if (IS_RO_STRING (str))
897 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
898 SCM_BOOL_T);
899 else
900 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
901 SCM_BOOL_F);
902
903 /* Stringbuf info */
904 if (!STRINGBUF_WIDE (buf))
905 {
906 size_t len = STRINGBUF_LENGTH (buf);
907 char *cbuf;
908 SCM sbc = scm_i_make_string (len, &cbuf, 0);
909 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
910 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
911 sbc);
912 }
913 else
914 {
915 size_t len = STRINGBUF_LENGTH (buf);
916 scm_t_wchar *cbuf;
917 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
918 u32_cpy ((scm_t_uint32 *) cbuf,
919 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
920 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
921 sbc);
922 }
923 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
924 scm_from_size_t (STRINGBUF_LENGTH (buf)));
925 if (STRINGBUF_SHARED (buf))
926 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
927 SCM_BOOL_T);
928 else
929 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
930 SCM_BOOL_F);
931 if (STRINGBUF_WIDE (buf))
932 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
933 SCM_BOOL_T);
934 else
935 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
936 SCM_BOOL_F);
937
938 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
939 }
940 #undef FUNC_NAME
941
942 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
943 "Returns an association list containing debugging information\n"
944 "for @var{sym}. The association list has the following entries."
945 "@table @code\n"
946 "@item symbol\n"
947 "The symbol itself\n"
948 "@item hash\n"
949 "Its hash value\n"
950 "@item interned\n"
951 "@code{#t} if it is an interned symbol\n"
952 "@item stringbuf-chars\n"
953 "A new string containing this symbols's stringbuf's characters\n"
954 "@item stringbuf-length\n"
955 "The number of characters in this stringbuf\n"
956 "@item stringbuf-shared\n"
957 "@code{#t} if this stringbuf is shared\n"
958 "@item stringbuf-wide\n"
959 "@code{#t} if this stringbuf's characters are stored in a\n"
960 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
961 "buffer\n"
962 "@end table")
963 #define FUNC_NAME s_scm_sys_symbol_dump
964 {
965 SCM e1, e2, e3, e4, e5, e6, e7;
966 SCM buf;
967 SCM_VALIDATE_SYMBOL (1, sym);
968 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
969 sym);
970 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
971 scm_from_ulong (scm_i_symbol_hash (sym)));
972 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
973 scm_symbol_interned_p (sym));
974 buf = SYMBOL_STRINGBUF (sym);
975
976 /* Stringbuf info */
977 if (!STRINGBUF_WIDE (buf))
978 {
979 size_t len = STRINGBUF_LENGTH (buf);
980 char *cbuf;
981 SCM sbc = scm_i_make_string (len, &cbuf, 0);
982 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
983 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
984 sbc);
985 }
986 else
987 {
988 size_t len = STRINGBUF_LENGTH (buf);
989 scm_t_wchar *cbuf;
990 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
991 u32_cpy ((scm_t_uint32 *) cbuf,
992 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
993 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
994 sbc);
995 }
996 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
997 scm_from_size_t (STRINGBUF_LENGTH (buf)));
998 if (STRINGBUF_SHARED (buf))
999 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1000 SCM_BOOL_T);
1001 else
1002 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1003 SCM_BOOL_F);
1004 if (STRINGBUF_WIDE (buf))
1005 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1006 SCM_BOOL_T);
1007 else
1008 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1009 SCM_BOOL_F);
1010 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1011
1012 }
1013 #undef FUNC_NAME
1014
1015 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1016
1017 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1018 #define FUNC_NAME s_scm_sys_stringbuf_hist
1019 {
1020 int i;
1021 for (i = 0; i < 1000; i++)
1022 if (lenhist[i])
1023 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1024 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1025 return SCM_UNSPECIFIED;
1026 }
1027 #undef FUNC_NAME
1028
1029 #endif
1030
1031 \f
1032
1033 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1034 (SCM obj),
1035 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1036 #define FUNC_NAME s_scm_string_p
1037 {
1038 return scm_from_bool (IS_STRING (obj));
1039 }
1040 #undef FUNC_NAME
1041
1042
1043 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1044
1045 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1046 (SCM chrs),
1047 "@deffnx {Scheme Procedure} list->string chrs\n"
1048 "Return a newly allocated string composed of the arguments,\n"
1049 "@var{chrs}.")
1050 #define FUNC_NAME s_scm_string
1051 {
1052 SCM result = SCM_BOOL_F;
1053 SCM rest;
1054 size_t len;
1055 size_t p = 0;
1056 long i;
1057 int wide = 0;
1058
1059 /* Verify that this is a list of chars. */
1060 i = scm_ilength (chrs);
1061 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1062
1063 len = (size_t) i;
1064 rest = chrs;
1065
1066 while (len > 0 && scm_is_pair (rest))
1067 {
1068 SCM elt = SCM_CAR (rest);
1069 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1070 if (SCM_CHAR (elt) > 0xFF)
1071 wide = 1;
1072 rest = SCM_CDR (rest);
1073 len--;
1074 scm_remember_upto_here_1 (elt);
1075 }
1076
1077 /* Construct a string containing this list of chars. */
1078 len = (size_t) i;
1079 rest = chrs;
1080
1081 if (wide == 0)
1082 {
1083 char *buf;
1084
1085 result = scm_i_make_string (len, NULL, 0);
1086 result = scm_i_string_start_writing (result);
1087 buf = scm_i_string_writable_chars (result);
1088 while (len > 0 && scm_is_pair (rest))
1089 {
1090 SCM elt = SCM_CAR (rest);
1091 buf[p] = (unsigned char) SCM_CHAR (elt);
1092 p++;
1093 rest = SCM_CDR (rest);
1094 len--;
1095 scm_remember_upto_here_1 (elt);
1096 }
1097 }
1098 else
1099 {
1100 scm_t_wchar *buf;
1101
1102 result = scm_i_make_wide_string (len, NULL, 0);
1103 result = scm_i_string_start_writing (result);
1104 buf = scm_i_string_writable_wide_chars (result);
1105 while (len > 0 && scm_is_pair (rest))
1106 {
1107 SCM elt = SCM_CAR (rest);
1108 buf[p] = SCM_CHAR (elt);
1109 p++;
1110 rest = SCM_CDR (rest);
1111 len--;
1112 scm_remember_upto_here_1 (elt);
1113 }
1114 }
1115 scm_i_string_stop_writing ();
1116
1117 if (len > 0)
1118 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1119 if (!scm_is_null (rest))
1120 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1121
1122 return result;
1123 }
1124 #undef FUNC_NAME
1125
1126 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1127 (SCM k, SCM chr),
1128 "Return a newly allocated string of\n"
1129 "length @var{k}. If @var{chr} is given, then all elements of\n"
1130 "the string are initialized to @var{chr}, otherwise the contents\n"
1131 "of the @var{string} are all set to @var{#\nul}.")
1132 #define FUNC_NAME s_scm_make_string
1133 {
1134 return scm_c_make_string (scm_to_size_t (k), chr);
1135 }
1136 #undef FUNC_NAME
1137
1138 SCM
1139 scm_c_make_string (size_t len, SCM chr)
1140 #define FUNC_NAME NULL
1141 {
1142 size_t p;
1143 char *contents = NULL;
1144 SCM res = scm_i_make_string (len, &contents, 0);
1145
1146 /* If no char is given, initialize string contents to NULL. */
1147 if (SCM_UNBNDP (chr))
1148 memset (contents, 0, len);
1149 else
1150 {
1151 SCM_VALIDATE_CHAR (0, chr);
1152 res = scm_i_string_start_writing (res);
1153 for (p = 0; p < len; p++)
1154 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1155 scm_i_string_stop_writing ();
1156 }
1157
1158 return res;
1159 }
1160 #undef FUNC_NAME
1161
1162 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1163 (SCM string),
1164 "Return the number of characters in @var{string}.")
1165 #define FUNC_NAME s_scm_string_length
1166 {
1167 SCM_VALIDATE_STRING (1, string);
1168 return scm_from_size_t (STRING_LENGTH (string));
1169 }
1170 #undef FUNC_NAME
1171
1172 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1173 (SCM string),
1174 "Return the bytes used to represent a character in @var{string}."
1175 "This will return 1 or 4.")
1176 #define FUNC_NAME s_scm_string_bytes_per_char
1177 {
1178 SCM_VALIDATE_STRING (1, string);
1179 if (!scm_i_is_narrow_string (string))
1180 return scm_from_int (4);
1181
1182 return scm_from_int (1);
1183 }
1184 #undef FUNC_NAME
1185
1186 size_t
1187 scm_c_string_length (SCM string)
1188 {
1189 if (!IS_STRING (string))
1190 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1191 return STRING_LENGTH (string);
1192 }
1193
1194 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1195 (SCM str, SCM k),
1196 "Return character @var{k} of @var{str} using zero-origin\n"
1197 "indexing. @var{k} must be a valid index of @var{str}.")
1198 #define FUNC_NAME s_scm_string_ref
1199 {
1200 size_t len;
1201 unsigned long idx;
1202
1203 SCM_VALIDATE_STRING (1, str);
1204
1205 len = scm_i_string_length (str);
1206 if (SCM_LIKELY (len > 0))
1207 idx = scm_to_unsigned_integer (k, 0, len - 1);
1208 else
1209 scm_out_of_range (NULL, k);
1210
1211 if (scm_i_is_narrow_string (str))
1212 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1213 else
1214 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1215 }
1216 #undef FUNC_NAME
1217
1218 SCM
1219 scm_c_string_ref (SCM str, size_t p)
1220 {
1221 if (p >= scm_i_string_length (str))
1222 scm_out_of_range (NULL, scm_from_size_t (p));
1223 if (scm_i_is_narrow_string (str))
1224 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1225 else
1226 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1227
1228 }
1229
1230 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1231 (SCM str, SCM k, SCM chr),
1232 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1233 "an unspecified value. @var{k} must be a valid index of\n"
1234 "@var{str}.")
1235 #define FUNC_NAME s_scm_string_set_x
1236 {
1237 size_t len;
1238 unsigned long idx;
1239
1240 SCM_VALIDATE_STRING (1, str);
1241
1242 len = scm_i_string_length (str);
1243 if (SCM_LIKELY (len > 0))
1244 idx = scm_to_unsigned_integer (k, 0, len - 1);
1245 else
1246 scm_out_of_range (NULL, k);
1247
1248 SCM_VALIDATE_CHAR (3, chr);
1249 str = scm_i_string_start_writing (str);
1250 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1251 scm_i_string_stop_writing ();
1252
1253 return SCM_UNSPECIFIED;
1254 }
1255 #undef FUNC_NAME
1256
1257 void
1258 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1259 {
1260 if (p >= scm_i_string_length (str))
1261 scm_out_of_range (NULL, scm_from_size_t (p));
1262 str = scm_i_string_start_writing (str);
1263 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1264 scm_i_string_stop_writing ();
1265 }
1266
1267 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1268 (SCM str, SCM start, SCM end),
1269 "Return a newly allocated string formed from the characters\n"
1270 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1271 "ending with index @var{end} (exclusive).\n"
1272 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1273 "exact integers satisfying:\n\n"
1274 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1275 #define FUNC_NAME s_scm_substring
1276 {
1277 size_t len, from, to;
1278
1279 SCM_VALIDATE_STRING (1, str);
1280 len = scm_i_string_length (str);
1281 from = scm_to_unsigned_integer (start, 0, len);
1282 if (SCM_UNBNDP (end))
1283 to = len;
1284 else
1285 to = scm_to_unsigned_integer (end, from, len);
1286 return scm_i_substring (str, from, to);
1287 }
1288 #undef FUNC_NAME
1289
1290 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1291 (SCM str, SCM start, SCM end),
1292 "Return a newly allocated string formed from the characters\n"
1293 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1294 "ending with index @var{end} (exclusive).\n"
1295 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1296 "exact integers satisfying:\n"
1297 "\n"
1298 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1299 "\n"
1300 "The returned string is read-only.\n")
1301 #define FUNC_NAME s_scm_substring_read_only
1302 {
1303 size_t len, from, to;
1304
1305 SCM_VALIDATE_STRING (1, str);
1306 len = scm_i_string_length (str);
1307 from = scm_to_unsigned_integer (start, 0, len);
1308 if (SCM_UNBNDP (end))
1309 to = len;
1310 else
1311 to = scm_to_unsigned_integer (end, from, len);
1312 return scm_i_substring_read_only (str, from, to);
1313 }
1314 #undef FUNC_NAME
1315
1316 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1317 (SCM str, SCM start, SCM end),
1318 "Return a newly allocated string formed from the characters\n"
1319 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1320 "ending with index @var{end} (exclusive).\n"
1321 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1322 "exact integers satisfying:\n\n"
1323 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1324 #define FUNC_NAME s_scm_substring_copy
1325 {
1326 /* For the Scheme version, START is mandatory, but for the C
1327 version, it is optional. See scm_string_copy in srfi-13.c for a
1328 rationale.
1329 */
1330
1331 size_t from, to;
1332
1333 SCM_VALIDATE_STRING (1, str);
1334 scm_i_get_substring_spec (scm_i_string_length (str),
1335 start, &from, end, &to);
1336 return scm_i_substring_copy (str, from, to);
1337 }
1338 #undef FUNC_NAME
1339
1340 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1341 (SCM str, SCM start, SCM end),
1342 "Return string that indirectly refers to the characters\n"
1343 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1344 "ending with index @var{end} (exclusive).\n"
1345 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1346 "exact integers satisfying:\n\n"
1347 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1348 #define FUNC_NAME s_scm_substring_shared
1349 {
1350 size_t len, from, to;
1351
1352 SCM_VALIDATE_STRING (1, str);
1353 len = scm_i_string_length (str);
1354 from = scm_to_unsigned_integer (start, 0, len);
1355 if (SCM_UNBNDP (end))
1356 to = len;
1357 else
1358 to = scm_to_unsigned_integer (end, from, len);
1359 return scm_i_substring_shared (str, from, to);
1360 }
1361 #undef FUNC_NAME
1362
1363 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1364 (SCM args),
1365 "Return a newly allocated string whose characters form the\n"
1366 "concatenation of the given strings, @var{args}.")
1367 #define FUNC_NAME s_scm_string_append
1368 {
1369 SCM res;
1370 size_t len = 0;
1371 int wide = 0;
1372 SCM l, s;
1373 size_t i;
1374 union
1375 {
1376 char *narrow;
1377 scm_t_wchar *wide;
1378 } data;
1379
1380 SCM_VALIDATE_REST_ARGUMENT (args);
1381 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1382 {
1383 s = SCM_CAR (l);
1384 SCM_VALIDATE_STRING (SCM_ARGn, s);
1385 len += scm_i_string_length (s);
1386 if (!scm_i_is_narrow_string (s))
1387 wide = 1;
1388 }
1389 data.narrow = NULL;
1390 if (!wide)
1391 res = scm_i_make_string (len, &data.narrow, 0);
1392 else
1393 res = scm_i_make_wide_string (len, &data.wide, 0);
1394
1395 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1396 {
1397 size_t len;
1398 s = SCM_CAR (l);
1399 SCM_VALIDATE_STRING (SCM_ARGn, s);
1400 len = scm_i_string_length (s);
1401 if (!wide)
1402 {
1403 memcpy (data.narrow, scm_i_string_chars (s), len);
1404 data.narrow += len;
1405 }
1406 else
1407 {
1408 if (scm_i_is_narrow_string (s))
1409 {
1410 for (i = 0; i < scm_i_string_length (s); i++)
1411 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1412 }
1413 else
1414 u32_cpy ((scm_t_uint32 *) data.wide,
1415 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1416 data.wide += len;
1417 }
1418 scm_remember_upto_here_1 (s);
1419 }
1420 return res;
1421 }
1422 #undef FUNC_NAME
1423
1424
1425 \f
1426 /* Charset conversion error handling. */
1427
1428 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1429 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1430
1431 /* Raise an exception informing that character CHR could not be written
1432 to PORT in its current encoding. */
1433 void
1434 scm_encoding_error (const char *subr, int err, const char *message,
1435 SCM port, SCM chr)
1436 {
1437 scm_throw (scm_encoding_error_key,
1438 scm_list_n (scm_from_latin1_string (subr),
1439 scm_from_latin1_string (message),
1440 scm_from_int (err),
1441 port, chr,
1442 SCM_UNDEFINED));
1443 }
1444
1445 /* Raise an exception informing of an encoding error on PORT. This
1446 means that a character could not be written in PORT's encoding. */
1447 void
1448 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1449 {
1450 scm_throw (scm_decoding_error_key,
1451 scm_list_n (scm_from_latin1_string (subr),
1452 scm_from_latin1_string (message),
1453 scm_from_int (err),
1454 port,
1455 SCM_UNDEFINED));
1456 }
1457
1458 \f
1459 /* String conversion to/from C. */
1460
1461 SCM
1462 scm_from_stringn (const char *str, size_t len, const char *encoding,
1463 scm_t_string_failed_conversion_handler handler)
1464 {
1465 size_t u32len, i;
1466 scm_t_wchar *u32;
1467 int wide = 0;
1468 SCM res;
1469
1470 /* The order of these checks is important. */
1471 if (!str && len != 0)
1472 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1473 if (len == (size_t) -1)
1474 len = strlen (str);
1475
1476 if (encoding == NULL || len == 0)
1477 {
1478 /* If encoding is null (or the string is empty), use Latin-1. */
1479 char *buf;
1480 res = scm_i_make_string (len, &buf, 0);
1481 memcpy (buf, str, len);
1482 return res;
1483 }
1484
1485 u32len = 0;
1486 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1487 (enum iconv_ilseq_handler)
1488 handler,
1489 str, len,
1490 NULL,
1491 NULL, &u32len);
1492
1493 if (SCM_UNLIKELY (u32 == NULL))
1494 {
1495 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1496 handler. */
1497 SCM bv;
1498 signed char *buf;
1499
1500 buf = scm_gc_malloc_pointerless (len, "bytevector");
1501 memcpy (buf, str, len);
1502 bv = scm_c_take_gc_bytevector (buf, len);
1503
1504 scm_decoding_error (__func__, errno,
1505 "input locale conversion error", bv);
1506 }
1507
1508 i = 0;
1509 while (i < u32len)
1510 if (u32[i++] > 0xFF)
1511 {
1512 wide = 1;
1513 break;
1514 }
1515
1516 if (!wide)
1517 {
1518 char *dst;
1519 res = scm_i_make_string (u32len, &dst, 0);
1520 for (i = 0; i < u32len; i ++)
1521 dst[i] = (unsigned char) u32[i];
1522 dst[u32len] = '\0';
1523 }
1524 else
1525 {
1526 scm_t_wchar *wdst;
1527 res = scm_i_make_wide_string (u32len, &wdst, 0);
1528 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1529 wdst[u32len] = 0;
1530 }
1531
1532 free (u32);
1533 return res;
1534 }
1535
1536 SCM
1537 scm_from_locale_string (const char *str)
1538 {
1539 return scm_from_locale_stringn (str, -1);
1540 }
1541
1542 SCM
1543 scm_from_locale_stringn (const char *str, size_t len)
1544 {
1545 return scm_from_stringn (str, len, locale_charset (),
1546 scm_i_get_conversion_strategy (SCM_BOOL_F));
1547 }
1548
1549 SCM
1550 scm_from_latin1_string (const char *str)
1551 {
1552 return scm_from_latin1_stringn (str, -1);
1553 }
1554
1555 SCM
1556 scm_from_latin1_stringn (const char *str, size_t len)
1557 {
1558 char *buf;
1559 SCM result;
1560
1561 if (len == (size_t) -1)
1562 len = strlen (str);
1563
1564 /* Make a narrow string and copy STR as is. */
1565 result = scm_i_make_string (len, &buf, 0);
1566 memcpy (buf, str, len);
1567
1568 return result;
1569 }
1570
1571 SCM
1572 scm_from_utf8_string (const char *str)
1573 {
1574 return scm_from_utf8_stringn (str, -1);
1575 }
1576
1577 SCM
1578 scm_from_utf8_stringn (const char *str, size_t len)
1579 {
1580 return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1581 }
1582
1583 SCM
1584 scm_from_utf32_string (const scm_t_wchar *str)
1585 {
1586 return scm_from_utf32_stringn (str, -1);
1587 }
1588
1589 SCM
1590 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1591 {
1592 SCM result;
1593 scm_t_wchar *buf;
1594
1595 if (len == (size_t) -1)
1596 len = u32_strlen ((uint32_t *) str);
1597
1598 result = scm_i_make_wide_string (len, &buf, 0);
1599 memcpy (buf, str, len * sizeof (scm_t_wchar));
1600 scm_i_try_narrow_string (result);
1601
1602 return result;
1603 }
1604
1605 /* Create a new scheme string from the C string STR. The memory of
1606 STR may be used directly as storage for the new string. */
1607 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1608 would be to register a finalizer to eventually free(3) STR, which isn't
1609 worth it. Should we just deprecate the `scm_take_' functions? */
1610 SCM
1611 scm_take_locale_stringn (char *str, size_t len)
1612 {
1613 SCM res;
1614
1615 res = scm_from_locale_stringn (str, len);
1616 free (str);
1617
1618 return res;
1619 }
1620
1621 SCM
1622 scm_take_locale_string (char *str)
1623 {
1624 return scm_take_locale_stringn (str, -1);
1625 }
1626
1627 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1628 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1629 Set *LENP to the size of the resulting string.
1630
1631 FIXME: This is a hack we should get rid of. See
1632 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1633 for details. */
1634 static void
1635 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1636 {
1637 char *before, *after;
1638 size_t i, j;
1639
1640 before = buf;
1641 after = buf;
1642 i = 0;
1643 j = 0;
1644 while (i < *lenp)
1645 {
1646 if ((i <= *lenp - 6)
1647 && before[i] == '\\'
1648 && before[i + 1] == 'u'
1649 && before[i + 2] == '0' && before[i + 3] == '0')
1650 {
1651 /* Convert \u00NN to \xNN */
1652 after[j] = '\\';
1653 after[j + 1] = 'x';
1654 after[j + 2] = tolower ((int) before[i + 4]);
1655 after[j + 3] = tolower ((int) before[i + 5]);
1656 i += 6;
1657 j += 4;
1658 }
1659 else if ((i <= *lenp - 10)
1660 && before[i] == '\\'
1661 && before[i + 1] == 'U'
1662 && before[i + 2] == '0' && before[i + 3] == '0')
1663 {
1664 /* Convert \U00NNNNNN to \UNNNNNN */
1665 after[j] = '\\';
1666 after[j + 1] = 'U';
1667 after[j + 2] = tolower ((int) before[i + 4]);
1668 after[j + 3] = tolower ((int) before[i + 5]);
1669 after[j + 4] = tolower ((int) before[i + 6]);
1670 after[j + 5] = tolower ((int) before[i + 7]);
1671 after[j + 6] = tolower ((int) before[i + 8]);
1672 after[j + 7] = tolower ((int) before[i + 9]);
1673 i += 10;
1674 j += 8;
1675 }
1676 else
1677 {
1678 after[j] = before[i];
1679 i++;
1680 j++;
1681 }
1682 }
1683 *lenp = j;
1684 }
1685
1686 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1687 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1688 of the resulting string. BUF must be large enough to handle the
1689 worst case when `\uXXXX' escapes (6 characters) are replaced by
1690 `\xXXXX;' (7 characters). */
1691 static void
1692 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1693 {
1694 char *before, *after;
1695 size_t i, j;
1696 /* The worst case is if the input string contains all 4-digit hex escapes.
1697 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1698 size_t max_out_len = (*lenp * 7) / 6 + 1;
1699 size_t nzeros, ndigits;
1700
1701 before = buf;
1702 after = alloca (max_out_len);
1703 i = 0;
1704 j = 0;
1705 while (i < *lenp)
1706 {
1707 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1708 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1709 {
1710 if (before[i + 1] == 'u')
1711 ndigits = 4;
1712 else if (before[i + 1] == 'U')
1713 ndigits = 8;
1714 else
1715 abort ();
1716
1717 /* Add the R6RS hex escape initial sequence. */
1718 after[j] = '\\';
1719 after[j + 1] = 'x';
1720
1721 /* Move string positions to the start of the hex numbers. */
1722 i += 2;
1723 j += 2;
1724
1725 /* Find the number of initial zeros in this hex number. */
1726 nzeros = 0;
1727 while (before[i + nzeros] == '0' && nzeros < ndigits)
1728 nzeros++;
1729
1730 /* Copy the number, skipping initial zeros, and then move the string
1731 positions. */
1732 if (nzeros == ndigits)
1733 {
1734 after[j] = '0';
1735 i += ndigits;
1736 j += 1;
1737 }
1738 else
1739 {
1740 int pos;
1741 for (pos = 0; pos < ndigits - nzeros; pos++)
1742 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1743 i += ndigits;
1744 j += (ndigits - nzeros);
1745 }
1746
1747 /* Add terminating semicolon. */
1748 after[j] = ';';
1749 j++;
1750 }
1751 else
1752 {
1753 after[j] = before[i];
1754 i++;
1755 j++;
1756 }
1757 }
1758 *lenp = j;
1759 memcpy (before, after, j);
1760 }
1761
1762 char *
1763 scm_to_locale_string (SCM str)
1764 {
1765 return scm_to_locale_stringn (str, NULL);
1766 }
1767
1768 char *
1769 scm_to_locale_stringn (SCM str, size_t *lenp)
1770 {
1771 return scm_to_stringn (str, lenp,
1772 locale_charset (),
1773 scm_i_get_conversion_strategy (SCM_BOOL_F));
1774 }
1775
1776 char *
1777 scm_to_latin1_string (SCM str)
1778 {
1779 return scm_to_latin1_stringn (str, NULL);
1780 }
1781
1782 char *
1783 scm_to_latin1_stringn (SCM str, size_t *lenp)
1784 #define FUNC_NAME "scm_to_latin1_stringn"
1785 {
1786 char *result;
1787
1788 SCM_VALIDATE_STRING (1, str);
1789
1790 if (scm_i_is_narrow_string (str))
1791 {
1792 size_t len = scm_i_string_length (str);
1793
1794 if (lenp)
1795 *lenp = len;
1796
1797 result = scm_strndup (scm_i_string_data (str), len);
1798 }
1799 else
1800 result = scm_to_stringn (str, lenp, NULL,
1801 SCM_FAILED_CONVERSION_ERROR);
1802
1803 return result;
1804 }
1805 #undef FUNC_NAME
1806
1807 char *
1808 scm_to_utf8_string (SCM str)
1809 {
1810 return scm_to_utf8_stringn (str, NULL);
1811 }
1812
1813 char *
1814 scm_to_utf8_stringn (SCM str, size_t *lenp)
1815 {
1816 return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1817 }
1818
1819 scm_t_wchar *
1820 scm_to_utf32_string (SCM str)
1821 {
1822 return scm_to_utf32_stringn (str, NULL);
1823 }
1824
1825 scm_t_wchar *
1826 scm_to_utf32_stringn (SCM str, size_t *lenp)
1827 #define FUNC_NAME "scm_to_utf32_stringn"
1828 {
1829 scm_t_wchar *result;
1830
1831 SCM_VALIDATE_STRING (1, str);
1832
1833 if (scm_i_is_narrow_string (str))
1834 result = (scm_t_wchar *)
1835 scm_to_stringn (str, lenp, "UTF-32",
1836 SCM_FAILED_CONVERSION_ERROR);
1837 else
1838 {
1839 size_t len;
1840
1841 len = scm_i_string_length (str);
1842 if (lenp)
1843 *lenp = len;
1844
1845 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1846 memcpy (result, scm_i_string_wide_chars (str),
1847 len * sizeof (scm_t_wchar));
1848 result[len] = 0;
1849 }
1850
1851 return result;
1852 }
1853 #undef FUNC_NAME
1854
1855 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1856 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1857 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1858 defined by HANDLER. */
1859 char *
1860 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1861 scm_t_string_failed_conversion_handler handler)
1862 {
1863 char *buf;
1864 size_t ilen, len, i;
1865 int ret;
1866 const char *enc;
1867
1868 if (!scm_is_string (str))
1869 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1870 ilen = scm_i_string_length (str);
1871
1872 if (ilen == 0)
1873 {
1874 buf = scm_malloc (1);
1875 buf[0] = '\0';
1876 if (lenp)
1877 *lenp = 0;
1878 return buf;
1879 }
1880
1881 if (lenp == NULL)
1882 for (i = 0; i < ilen; i++)
1883 if (scm_i_string_ref (str, i) == '\0')
1884 scm_misc_error (NULL,
1885 "string contains #\\nul character: ~S",
1886 scm_list_1 (str));
1887
1888 if (scm_i_is_narrow_string (str) && (encoding == NULL))
1889 {
1890 /* If using native Latin-1 encoding, just copy the string
1891 contents. */
1892 if (lenp)
1893 {
1894 buf = scm_malloc (ilen);
1895 memcpy (buf, scm_i_string_chars (str), ilen);
1896 *lenp = ilen;
1897 return buf;
1898 }
1899 else
1900 {
1901 buf = scm_malloc (ilen + 1);
1902 memcpy (buf, scm_i_string_chars (str), ilen);
1903 buf[ilen] = '\0';
1904 return buf;
1905 }
1906 }
1907
1908
1909 buf = NULL;
1910 len = 0;
1911 enc = encoding;
1912 if (enc == NULL)
1913 enc = "ISO-8859-1";
1914 if (scm_i_is_narrow_string (str))
1915 {
1916 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1917 "ISO-8859-1", enc,
1918 (enum iconv_ilseq_handler) handler, NULL,
1919 &buf, &len);
1920
1921 if (ret != 0)
1922 scm_encoding_error (__func__, errno,
1923 "cannot convert narrow string to output locale",
1924 SCM_BOOL_F,
1925 /* FIXME: Faulty character unknown. */
1926 SCM_BOOL_F);
1927 }
1928 else
1929 {
1930 buf = u32_conv_to_encoding (enc,
1931 (enum iconv_ilseq_handler) handler,
1932 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1933 ilen,
1934 NULL,
1935 NULL, &len);
1936 if (buf == NULL)
1937 scm_encoding_error (__func__, errno,
1938 "cannot convert wide string to output locale",
1939 SCM_BOOL_F,
1940 /* FIXME: Faulty character unknown. */
1941 SCM_BOOL_F);
1942 }
1943 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1944 {
1945 if (SCM_R6RS_ESCAPES_P)
1946 {
1947 /* The worst case is if the input string contains all 4-digit
1948 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
1949 (seven characters). Make BUF large enough to hold
1950 that. */
1951 buf = scm_realloc (buf, (len * 7) / 6 + 1);
1952 unistring_escapes_to_r6rs_escapes (buf, &len);
1953 }
1954 else
1955 unistring_escapes_to_guile_escapes (buf, &len);
1956
1957 buf = scm_realloc (buf, len);
1958 }
1959 if (lenp)
1960 *lenp = len;
1961 else
1962 {
1963 buf = scm_realloc (buf, len + 1);
1964 buf[len] = '\0';
1965 }
1966
1967 scm_remember_upto_here_1 (str);
1968 return buf;
1969 }
1970
1971 size_t
1972 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1973 {
1974 size_t len;
1975 char *result = NULL;
1976 if (!scm_is_string (str))
1977 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1978 result = scm_to_locale_stringn (str, &len);
1979
1980 memcpy (buf, result, (len > max_len) ? max_len : len);
1981 free (result);
1982
1983 scm_remember_upto_here_1 (str);
1984 return len;
1985 }
1986
1987 \f
1988 /* Unicode string normalization. */
1989
1990 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1991 libguile/i18n.c. It would be useful to have this factored out into a more
1992 convenient location, but its use of alloca makes that tricky to do. */
1993
1994 static SCM
1995 normalize_str (SCM string, uninorm_t form)
1996 {
1997 SCM ret;
1998 scm_t_uint32 *w_str;
1999 scm_t_wchar *cbuf;
2000 size_t rlen, len = scm_i_string_length (string);
2001
2002 if (scm_i_is_narrow_string (string))
2003 {
2004 size_t i;
2005 const char *buf = scm_i_string_chars (string);
2006
2007 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2008
2009 for (i = 0; i < len; i ++)
2010 w_str[i] = (unsigned char) buf[i];
2011 w_str[len] = 0;
2012 }
2013 else
2014 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2015
2016 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2017
2018 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2019 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2020 free (w_str);
2021
2022 scm_i_try_narrow_string (ret);
2023
2024 return ret;
2025 }
2026
2027 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2028 (SCM string),
2029 "Returns the NFC normalized form of @var{string}.")
2030 #define FUNC_NAME s_scm_string_normalize_nfc
2031 {
2032 SCM_VALIDATE_STRING (1, string);
2033 return normalize_str (string, UNINORM_NFC);
2034 }
2035 #undef FUNC_NAME
2036
2037 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2038 (SCM string),
2039 "Returns the NFD normalized form of @var{string}.")
2040 #define FUNC_NAME s_scm_string_normalize_nfd
2041 {
2042 SCM_VALIDATE_STRING (1, string);
2043 return normalize_str (string, UNINORM_NFD);
2044 }
2045 #undef FUNC_NAME
2046
2047 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2048 (SCM string),
2049 "Returns the NFKC normalized form of @var{string}.")
2050 #define FUNC_NAME s_scm_string_normalize_nfkc
2051 {
2052 SCM_VALIDATE_STRING (1, string);
2053 return normalize_str (string, UNINORM_NFKC);
2054 }
2055 #undef FUNC_NAME
2056
2057 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2058 (SCM string),
2059 "Returns the NFKD normalized form of @var{string}.")
2060 #define FUNC_NAME s_scm_string_normalize_nfkd
2061 {
2062 SCM_VALIDATE_STRING (1, string);
2063 return normalize_str (string, UNINORM_NFKD);
2064 }
2065 #undef FUNC_NAME
2066
2067 /* converts C scm_array of strings to SCM scm_list of strings.
2068 If argc < 0, a null terminated scm_array is assumed.
2069 The current locale encoding is assumed */
2070 SCM
2071 scm_makfromstrs (int argc, char **argv)
2072 {
2073 int i = argc;
2074 SCM lst = SCM_EOL;
2075 if (0 > i)
2076 for (i = 0; argv[i]; i++);
2077 while (i--)
2078 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2079 return lst;
2080 }
2081
2082 /* Return a newly allocated array of char pointers to each of the strings
2083 in args, with a terminating NULL pointer. The strings are encoded using
2084 the current locale. */
2085
2086 char **
2087 scm_i_allocate_string_pointers (SCM list)
2088 #define FUNC_NAME "scm_i_allocate_string_pointers"
2089 {
2090 char **result;
2091 int list_len = scm_ilength (list);
2092 int i;
2093
2094 if (list_len < 0)
2095 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2096
2097 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2098 "string pointers");
2099 result[list_len] = NULL;
2100
2101 /* The list might have been modified in another thread, so
2102 we check LIST before each access.
2103 */
2104 for (i = 0; i < list_len && scm_is_pair (list); i++)
2105 {
2106 SCM str = SCM_CAR (list);
2107 size_t len; /* String length in bytes */
2108 char *c_str = scm_to_locale_stringn (str, &len);
2109
2110 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2111 scm_malloc to allocate the returned string, which must be
2112 explicitly deallocated. This forces us to copy the string a
2113 second time into a new buffer. Ideally there would be variants
2114 of scm_to_*_stringn that can return garbage-collected buffers. */
2115
2116 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2117 memcpy (result[i], c_str, len);
2118 result[i][len] = '\0';
2119 free (c_str);
2120
2121 list = SCM_CDR (list);
2122 }
2123
2124 return result;
2125 }
2126 #undef FUNC_NAME
2127
2128 void
2129 scm_i_get_substring_spec (size_t len,
2130 SCM start, size_t *cstart,
2131 SCM end, size_t *cend)
2132 {
2133 if (SCM_UNBNDP (start))
2134 *cstart = 0;
2135 else
2136 *cstart = scm_to_unsigned_integer (start, 0, len);
2137
2138 if (SCM_UNBNDP (end))
2139 *cend = len;
2140 else
2141 *cend = scm_to_unsigned_integer (end, *cstart, len);
2142 }
2143
2144 #if SCM_ENABLE_DEPRECATED
2145
2146 /* When these definitions are removed, it becomes reasonable to use
2147 read-only strings for string literals. For that, change the reader
2148 to create string literals with scm_c_substring_read_only instead of
2149 with scm_c_substring_copy.
2150 */
2151
2152 int
2153 scm_i_deprecated_stringp (SCM str)
2154 {
2155 scm_c_issue_deprecation_warning
2156 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2157
2158 return scm_is_string (str);
2159 }
2160
2161 char *
2162 scm_i_deprecated_string_chars (SCM str)
2163 {
2164 char *chars;
2165
2166 scm_c_issue_deprecation_warning
2167 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2168
2169 /* We don't accept shared substrings here since they are not
2170 null-terminated.
2171 */
2172 if (IS_SH_STRING (str))
2173 scm_misc_error (NULL,
2174 "SCM_STRING_CHARS does not work with shared substrings",
2175 SCM_EOL);
2176
2177 /* We explicitly test for read-only strings to produce a better
2178 error message.
2179 */
2180
2181 if (IS_RO_STRING (str))
2182 scm_misc_error (NULL,
2183 "SCM_STRING_CHARS does not work with read-only strings",
2184 SCM_EOL);
2185
2186 /* The following is still wrong, of course...
2187 */
2188 str = scm_i_string_start_writing (str);
2189 chars = scm_i_string_writable_chars (str);
2190 scm_i_string_stop_writing ();
2191 return chars;
2192 }
2193
2194 size_t
2195 scm_i_deprecated_string_length (SCM str)
2196 {
2197 scm_c_issue_deprecation_warning
2198 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2199 return scm_c_string_length (str);
2200 }
2201
2202 #endif
2203
2204 static SCM
2205 string_handle_ref (scm_t_array_handle *h, size_t index)
2206 {
2207 return scm_c_string_ref (h->array, index);
2208 }
2209
2210 static void
2211 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2212 {
2213 scm_c_string_set_x (h->array, index, val);
2214 }
2215
2216 static void
2217 string_get_handle (SCM v, scm_t_array_handle *h)
2218 {
2219 h->array = v;
2220 h->ndims = 1;
2221 h->dims = &h->dim0;
2222 h->dim0.lbnd = 0;
2223 h->dim0.ubnd = scm_c_string_length (v) - 1;
2224 h->dim0.inc = 1;
2225 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2226 h->elements = h->writable_elements = NULL;
2227 }
2228
2229 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2230 string_handle_ref, string_handle_set,
2231 string_get_handle)
2232 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2233
2234 void
2235 scm_init_strings ()
2236 {
2237 scm_nullstr = scm_i_make_string (0, NULL, 0);
2238
2239 #include "libguile/strings.x"
2240 }
2241
2242
2243 /*
2244 Local Variables:
2245 c-file-style: "gnu"
2246 End:
2247 */