1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
32 #include <c-strcase.h>
34 #include "striconveh.h"
36 #include "libguile/_scm.h"
37 #include "libguile/chars.h"
38 #include "libguile/root.h"
39 #include "libguile/strings.h"
40 #include "libguile/ports.h"
41 #include "libguile/ports-internal.h"
42 #include "libguile/error.h"
43 #include "libguile/generalized-vectors.h"
44 #include "libguile/deprecation.h"
45 #include "libguile/validate.h"
46 #include "libguile/private-options.h"
56 * XXX - keeping an accurate refcount during GC seems to be quite
57 * tricky, so we just keep score of whether a stringbuf might be
58 * shared, not whether it definitely is.
60 * The scheme I (mvo) tried to keep an accurate reference count would
61 * recount all strings that point to a stringbuf during the mark-phase
62 * of the GC. This was done since one cannot access the stringbuf of
63 * a string when that string is freed (in order to decrease the
64 * reference count). The memory of the stringbuf might have been
65 * reused already for something completely different.
67 * This recounted worked for a small number of threads beating on
68 * cow-strings, but it failed randomly with more than 10 threads, say.
69 * I couldn't figure out what went wrong, so I used the conservative
70 * approach implemented below.
72 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
73 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
77 /* The size in words of the stringbuf header (type tag + size). */
78 #define STRINGBUF_HEADER_SIZE 2U
80 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
82 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
83 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
85 #define STRINGBUF_TAG scm_tc7_stringbuf
86 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
87 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
89 #define STRINGBUF_CONTENTS(buf) ((void *) \
90 SCM_CELL_OBJECT_LOC (buf, \
91 STRINGBUF_HEADER_SIZE))
92 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
93 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
95 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
97 #define SET_STRINGBUF_SHARED(buf) \
100 /* Don't modify BUF if it's already marked as shared since it might be \
101 a read-only, statically allocated stringbuf. */ \
102 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
103 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
107 #ifdef SCM_STRING_LENGTH_HISTOGRAM
108 static size_t lenhist
[1001];
111 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
114 make_stringbuf (size_t len
)
116 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
117 scm_i_symbol_chars, all stringbufs are null-terminated. Once
118 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
119 has been changed for scm_i_symbol_chars, this null-termination
125 #ifdef SCM_STRING_LENGTH_HISTOGRAM
132 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ len
+ 1,
135 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
);
136 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
138 STRINGBUF_CHARS (buf
)[len
] = 0;
143 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
146 make_wide_stringbuf (size_t len
)
151 #ifdef SCM_STRING_LENGTH_HISTOGRAM
158 raw_len
= (len
+ 1) * sizeof (scm_t_wchar
);
159 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ raw_len
,
162 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
| STRINGBUF_F_WIDE
);
163 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
165 STRINGBUF_WIDE_CHARS (buf
)[len
] = 0;
170 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
171 characters from BUF. */
173 wide_stringbuf (SCM buf
)
177 if (STRINGBUF_WIDE (buf
))
184 len
= STRINGBUF_LENGTH (buf
);
186 new_buf
= make_wide_stringbuf (len
);
188 mem
= STRINGBUF_WIDE_CHARS (new_buf
);
189 for (i
= 0; i
< len
; i
++)
190 mem
[i
] = (scm_t_wchar
) STRINGBUF_CHARS (buf
)[i
];
197 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
198 characters from BUF, if possible. */
200 narrow_stringbuf (SCM buf
)
204 if (!STRINGBUF_WIDE (buf
))
212 len
= STRINGBUF_LENGTH (buf
);
213 wmem
= STRINGBUF_WIDE_CHARS (buf
);
215 for (i
= 0; i
< len
; i
++)
217 /* BUF cannot be narrowed. */
220 new_buf
= make_stringbuf (len
);
222 mem
= STRINGBUF_CHARS (new_buf
);
223 for (i
= 0; i
< len
; i
++)
224 mem
[i
] = (unsigned char) wmem
[i
];
231 scm_i_pthread_mutex_t stringbuf_write_mutex
= SCM_I_PTHREAD_MUTEX_INITIALIZER
;
234 /* Copy-on-write strings.
237 #define STRING_TAG scm_tc7_string
239 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
240 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
241 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
243 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
244 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
246 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
248 /* Read-only strings.
251 #define RO_STRING_TAG scm_tc7_ro_string
252 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
254 /* Mutation-sharing substrings
257 #define SH_STRING_TAG (scm_tc7_string + 0x100)
259 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
260 /* START and LENGTH as for STRINGs. */
262 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
266 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
267 characters. CHARSP, if not NULL, will be set to location of the
268 char array. If READ_ONLY_P, the returned string is read-only;
269 otherwise it is writable. */
271 scm_i_make_string (size_t len
, char **charsp
, int read_only_p
)
273 static SCM null_stringbuf
= SCM_BOOL_F
;
279 if (SCM_UNLIKELY (scm_is_false (null_stringbuf
)))
281 null_stringbuf
= make_stringbuf (0);
282 SET_STRINGBUF_SHARED (null_stringbuf
);
284 buf
= null_stringbuf
;
287 buf
= make_stringbuf (len
);
290 *charsp
= (char *) STRINGBUF_CHARS (buf
);
291 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
293 (scm_t_bits
) 0, (scm_t_bits
) len
);
297 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
298 characters. CHARSP, if not NULL, will be set to location of the
299 character array. If READ_ONLY_P, the returned string is read-only;
300 otherwise it is writable. */
302 scm_i_make_wide_string (size_t len
, scm_t_wchar
**charsp
, int read_only_p
)
304 SCM buf
= make_wide_stringbuf (len
);
307 *charsp
= STRINGBUF_WIDE_CHARS (buf
);
308 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
310 (scm_t_bits
) 0, (scm_t_bits
) len
);
315 validate_substring_args (SCM str
, size_t start
, size_t end
)
317 if (!IS_STRING (str
))
318 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
319 if (start
> STRING_LENGTH (str
))
320 scm_out_of_range (NULL
, scm_from_size_t (start
));
321 if (end
> STRING_LENGTH (str
) || end
< start
)
322 scm_out_of_range (NULL
, scm_from_size_t (end
));
326 get_str_buf_start (SCM
*str
, SCM
*buf
, size_t *start
)
328 *start
= STRING_START (*str
);
329 if (IS_SH_STRING (*str
))
331 *str
= SH_STRING_STRING (*str
);
332 *start
+= STRING_START (*str
);
334 *buf
= STRING_STRINGBUF (*str
);
338 scm_i_substring (SCM str
, size_t start
, size_t end
)
341 return scm_i_make_string (0, NULL
, 0);
346 get_str_buf_start (&str
, &buf
, &str_start
);
347 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
348 SET_STRINGBUF_SHARED (buf
);
349 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
350 return scm_double_cell (STRING_TAG
, SCM_UNPACK(buf
),
351 (scm_t_bits
)str_start
+ start
,
352 (scm_t_bits
) end
- start
);
357 scm_i_substring_read_only (SCM str
, size_t start
, size_t end
)
360 return scm_i_make_string (0, NULL
, 1);
365 get_str_buf_start (&str
, &buf
, &str_start
);
366 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
367 SET_STRINGBUF_SHARED (buf
);
368 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
369 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK(buf
),
370 (scm_t_bits
)str_start
+ start
,
371 (scm_t_bits
) end
- start
);
376 scm_i_substring_copy (SCM str
, size_t start
, size_t end
)
379 return scm_i_make_string (0, NULL
, 0);
382 size_t len
= end
- start
;
383 SCM buf
, my_buf
, substr
;
386 get_str_buf_start (&str
, &buf
, &str_start
);
387 if (scm_i_is_narrow_string (str
))
389 my_buf
= make_stringbuf (len
);
390 memcpy (STRINGBUF_CHARS (my_buf
),
391 STRINGBUF_CHARS (buf
) + str_start
+ start
, len
);
395 my_buf
= make_wide_stringbuf (len
);
396 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (my_buf
),
397 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
) + str_start
401 scm_remember_upto_here_1 (buf
);
402 substr
= scm_double_cell (STRING_TAG
, SCM_UNPACK (my_buf
),
403 (scm_t_bits
) 0, (scm_t_bits
) len
);
405 scm_i_try_narrow_string (substr
);
411 scm_i_substring_shared (SCM str
, size_t start
, size_t end
)
413 if (start
== 0 && end
== STRING_LENGTH (str
))
415 else if (start
== end
)
416 return scm_i_make_string (0, NULL
, 0);
419 size_t len
= end
- start
;
420 if (IS_SH_STRING (str
))
422 start
+= STRING_START (str
);
423 str
= SH_STRING_STRING (str
);
425 return scm_double_cell (SH_STRING_TAG
, SCM_UNPACK(str
),
426 (scm_t_bits
)start
, (scm_t_bits
) len
);
431 scm_c_substring (SCM str
, size_t start
, size_t end
)
433 validate_substring_args (str
, start
, end
);
434 return scm_i_substring (str
, start
, end
);
438 scm_c_substring_read_only (SCM str
, size_t start
, size_t end
)
440 validate_substring_args (str
, start
, end
);
441 return scm_i_substring_read_only (str
, start
, end
);
445 scm_c_substring_copy (SCM str
, size_t start
, size_t end
)
447 validate_substring_args (str
, start
, end
);
448 return scm_i_substring_copy (str
, start
, end
);
452 scm_c_substring_shared (SCM str
, size_t start
, size_t end
)
454 validate_substring_args (str
, start
, end
);
455 return scm_i_substring_shared (str
, start
, end
);
459 /* Internal accessors
462 /* Returns the number of characters in STR. This may be different
463 than the memory size of the string storage. */
465 scm_i_string_length (SCM str
)
467 return STRING_LENGTH (str
);
470 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
471 encoding. False if it is 'wide', having a 32-bit UCS-4
474 scm_i_is_narrow_string (SCM str
)
476 if (IS_SH_STRING (str
))
477 str
= SH_STRING_STRING (str
);
479 return !STRINGBUF_WIDE (STRING_STRINGBUF (str
));
482 /* Try to coerce a string to be narrow. It if is narrow already, do
483 nothing. If it is wide, shrink it to narrow if none of its
484 characters are above 0xFF. Return true if the string is narrow or
485 was made to be narrow. */
487 scm_i_try_narrow_string (SCM str
)
489 if (IS_SH_STRING (str
))
490 str
= SH_STRING_STRING (str
);
492 SET_STRING_STRINGBUF (str
, narrow_stringbuf (STRING_STRINGBUF (str
)));
494 return scm_i_is_narrow_string (str
);
497 /* Return a pointer to the raw data of the string, which can be either Latin-1
498 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
500 scm_i_string_data (SCM str
)
506 get_str_buf_start (&str
, &buf
, &start
);
508 data
= STRINGBUF_CONTENTS (buf
);
509 data
+= start
* (scm_i_is_narrow_string (str
) ? 1 : 4);
514 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
517 scm_i_string_chars (SCM str
)
521 get_str_buf_start (&str
, &buf
, &start
);
522 if (scm_i_is_narrow_string (str
))
523 return (const char *) STRINGBUF_CHARS (buf
) + start
;
525 scm_misc_error (NULL
, "Invalid read access of chars of wide string: ~s",
530 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
533 scm_i_string_wide_chars (SCM str
)
538 get_str_buf_start (&str
, &buf
, &start
);
539 if (!scm_i_is_narrow_string (str
))
540 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
) + start
;
542 scm_misc_error (NULL
, "Invalid read access of chars of narrow string: ~s",
546 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
547 a new string buffer, so that it can be modified without modifying
548 other strings. Also, lock the string mutex. Later, one must call
549 scm_i_string_stop_writing to unlock the mutex. */
551 scm_i_string_start_writing (SCM orig_str
)
553 SCM buf
, str
= orig_str
;
556 get_str_buf_start (&str
, &buf
, &start
);
557 if (IS_RO_STRING (str
))
558 scm_misc_error (NULL
, "string is read-only: ~s", scm_list_1 (orig_str
));
560 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
561 if (STRINGBUF_SHARED (buf
))
563 /* Clone the stringbuf. */
564 size_t len
= STRING_LENGTH (str
);
567 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
569 if (scm_i_is_narrow_string (str
))
571 new_buf
= make_stringbuf (len
);
572 memcpy (STRINGBUF_CHARS (new_buf
),
573 STRINGBUF_CHARS (buf
) + STRING_START (str
), len
);
578 new_buf
= make_wide_stringbuf (len
);
579 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
580 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
)
581 + STRING_START (str
)), len
);
584 SET_STRING_STRINGBUF (str
, new_buf
);
585 start
-= STRING_START (str
);
587 /* FIXME: The following operations are not atomic, so other threads
588 looking at STR may see an inconsistent state. Nevertheless it can't
589 hurt much since (i) accessing STR while it is being mutated can't
590 yield a crash, and (ii) concurrent accesses to STR should be
591 protected by a mutex at the application level. The latter may not
592 apply when STR != ORIG_STR, though. */
593 SET_STRING_START (str
, 0);
594 SET_STRING_STRINGBUF (str
, new_buf
);
598 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
603 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
605 scm_i_string_writable_chars (SCM str
)
610 get_str_buf_start (&str
, &buf
, &start
);
611 if (scm_i_is_narrow_string (str
))
612 return (char *) STRINGBUF_CHARS (buf
) + start
;
614 scm_misc_error (NULL
, "Invalid write access of chars of wide string: ~s",
619 /* Return a pointer to the UCS-4 codepoints of a string. */
621 scm_i_string_writable_wide_chars (SCM str
)
626 get_str_buf_start (&str
, &buf
, &start
);
627 if (!scm_i_is_narrow_string (str
))
628 return STRINGBUF_WIDE_CHARS (buf
) + start
;
630 scm_misc_error (NULL
, "Invalid write access of chars of narrow string: ~s",
634 /* Unlock the string mutex that was locked when
635 scm_i_string_start_writing was called. */
637 scm_i_string_stop_writing (void)
639 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
642 /* Return the Xth character of STR as a UCS-4 codepoint. */
644 scm_i_string_ref (SCM str
, size_t x
)
646 if (scm_i_is_narrow_string (str
))
647 return (scm_t_wchar
) (unsigned char) (scm_i_string_chars (str
)[x
]);
649 return scm_i_string_wide_chars (str
)[x
];
652 /* Returns index+1 of the first char in STR that matches C, or
653 0 if the char is not found. */
655 scm_i_string_contains_char (SCM str
, char ch
)
658 size_t len
= scm_i_string_length (str
);
661 if (scm_i_is_narrow_string (str
))
665 if (scm_i_string_chars (str
)[i
] == ch
)
674 if (scm_i_string_wide_chars (str
)[i
]
675 == (unsigned char) ch
)
684 scm_i_string_strcmp (SCM sstr
, size_t start_x
, const char *cstr
)
686 if (scm_i_is_narrow_string (sstr
))
688 const char *a
= scm_i_string_chars (sstr
) + start_x
;
689 const char *b
= cstr
;
690 return strncmp (a
, b
, strlen(b
));
695 const scm_t_wchar
*a
= scm_i_string_wide_chars (sstr
) + start_x
;
696 const char *b
= cstr
;
697 for (i
= 0; i
< strlen (b
); i
++)
699 if (a
[i
] != (unsigned char) b
[i
])
706 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
708 scm_i_string_set_x (SCM str
, size_t p
, scm_t_wchar chr
)
710 if (IS_SH_STRING (str
))
712 p
+= STRING_START (str
);
713 str
= SH_STRING_STRING (str
);
716 if (chr
> 0xFF && scm_i_is_narrow_string (str
))
717 SET_STRING_STRINGBUF (str
, wide_stringbuf (STRING_STRINGBUF (str
)));
719 if (scm_i_is_narrow_string (str
))
721 char *dst
= scm_i_string_writable_chars (str
);
726 scm_t_wchar
*dst
= scm_i_string_writable_wide_chars (str
);
734 Basic symbol creation and accessing is done here, the rest is in
735 symbols.[hc]. This has been done to keep stringbufs and the
736 internals of strings and string-like objects confined to this file.
739 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
742 scm_i_make_symbol (SCM name
, scm_t_bits flags
,
743 unsigned long hash
, SCM props
)
746 size_t start
= STRING_START (name
);
747 size_t length
= STRING_LENGTH (name
);
749 if (IS_SH_STRING (name
))
751 name
= SH_STRING_STRING (name
);
752 start
+= STRING_START (name
);
754 buf
= STRING_STRINGBUF (name
);
756 if (start
== 0 && length
== STRINGBUF_LENGTH (buf
))
759 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
760 SET_STRINGBUF_SHARED (buf
);
761 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
766 if (scm_i_is_narrow_string (name
))
768 SCM new_buf
= make_stringbuf (length
);
769 memcpy (STRINGBUF_CHARS (new_buf
),
770 STRINGBUF_CHARS (buf
) + start
, length
);
775 SCM new_buf
= make_wide_stringbuf (length
);
776 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
777 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
) + start
,
782 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
783 (scm_t_bits
) hash
, SCM_UNPACK (props
));
787 scm_i_c_make_symbol (const char *name
, size_t len
,
788 scm_t_bits flags
, unsigned long hash
, SCM props
)
790 SCM buf
= make_stringbuf (len
);
791 memcpy (STRINGBUF_CHARS (buf
), name
, len
);
793 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
794 (scm_t_bits
) hash
, SCM_UNPACK (props
));
797 /* Returns the number of characters in SYM. This may be different
798 from the memory size of SYM. */
800 scm_i_symbol_length (SCM sym
)
802 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
806 scm_c_symbol_length (SCM sym
)
807 #define FUNC_NAME "scm_c_symbol_length"
809 SCM_VALIDATE_SYMBOL (1, sym
);
811 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
815 /* True if the name of SYM is stored as a Latin-1 encoded string.
816 False if it is stored as a 32-bit UCS-4-encoded string. */
818 scm_i_is_narrow_symbol (SCM sym
)
822 buf
= SYMBOL_STRINGBUF (sym
);
823 return !STRINGBUF_WIDE (buf
);
826 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
827 contains the name of SYM. */
829 scm_i_symbol_chars (SCM sym
)
833 buf
= SYMBOL_STRINGBUF (sym
);
834 if (!STRINGBUF_WIDE (buf
))
835 return (const char *) STRINGBUF_CHARS (buf
);
837 scm_misc_error (NULL
, "Invalid access of chars of a wide symbol ~S",
841 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
844 scm_i_symbol_wide_chars (SCM sym
)
848 buf
= SYMBOL_STRINGBUF (sym
);
849 if (STRINGBUF_WIDE (buf
))
850 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
);
852 scm_misc_error (NULL
, "Invalid access of chars of a narrow symbol ~S",
857 scm_i_symbol_substring (SCM sym
, size_t start
, size_t end
)
859 SCM buf
= SYMBOL_STRINGBUF (sym
);
860 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
861 SET_STRINGBUF_SHARED (buf
);
862 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
863 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK (buf
),
864 (scm_t_bits
)start
, (scm_t_bits
) end
- start
);
867 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
869 scm_i_symbol_ref (SCM sym
, size_t x
)
871 if (scm_i_is_narrow_symbol (sym
))
872 return (scm_t_wchar
) (unsigned char) (scm_i_symbol_chars (sym
)[x
]);
874 return scm_i_symbol_wide_chars (sym
)[x
];
880 SCM_DEFINE (scm_sys_string_dump
, "%string-dump", 1, 0, 0, (SCM str
),
881 "Returns an association list containing debugging information\n"
882 "for @var{str}. The association list has the following entries."
885 "The string itself.\n"
887 "The start index of the string into its stringbuf\n"
889 "The length of the string\n"
891 "If this string is a substring, it returns its parent string.\n"
892 "Otherwise, it returns @code{#f}\n"
894 "@code{#t} if the string is read-only\n"
895 "@item stringbuf-chars\n"
896 "A new string containing this string's stringbuf's characters\n"
897 "@item stringbuf-length\n"
898 "The number of characters in this stringbuf\n"
899 "@item stringbuf-shared\n"
900 "@code{#t} if this stringbuf is shared\n"
901 "@item stringbuf-wide\n"
902 "@code{#t} if this stringbuf's characters are stored in a\n"
903 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
906 #define FUNC_NAME s_scm_sys_string_dump
908 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
;
910 SCM_VALIDATE_STRING (1, str
);
913 e1
= scm_cons (scm_from_latin1_symbol ("string"),
915 e2
= scm_cons (scm_from_latin1_symbol ("start"),
916 scm_from_size_t (STRING_START (str
)));
917 e3
= scm_cons (scm_from_latin1_symbol ("length"),
918 scm_from_size_t (STRING_LENGTH (str
)));
920 if (IS_SH_STRING (str
))
922 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
923 SH_STRING_STRING (str
));
924 buf
= STRING_STRINGBUF (SH_STRING_STRING (str
));
928 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
930 buf
= STRING_STRINGBUF (str
);
933 if (IS_RO_STRING (str
))
934 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
937 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
941 if (!STRINGBUF_WIDE (buf
))
943 size_t len
= STRINGBUF_LENGTH (buf
);
945 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
946 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
947 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
952 size_t len
= STRINGBUF_LENGTH (buf
);
954 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
955 u32_cpy ((scm_t_uint32
*) cbuf
,
956 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
957 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
960 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
961 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
962 if (STRINGBUF_SHARED (buf
))
963 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
966 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
968 if (STRINGBUF_WIDE (buf
))
969 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
972 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
975 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
, SCM_UNDEFINED
);
979 SCM_DEFINE (scm_sys_symbol_dump
, "%symbol-dump", 1, 0, 0, (SCM sym
),
980 "Returns an association list containing debugging information\n"
981 "for @var{sym}. The association list has the following entries."
984 "The symbol itself\n"
988 "@code{#t} if it is an interned symbol\n"
989 "@item stringbuf-chars\n"
990 "A new string containing this symbols's stringbuf's characters\n"
991 "@item stringbuf-length\n"
992 "The number of characters in this stringbuf\n"
993 "@item stringbuf-shared\n"
994 "@code{#t} if this stringbuf is shared\n"
995 "@item stringbuf-wide\n"
996 "@code{#t} if this stringbuf's characters are stored in a\n"
997 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1000 #define FUNC_NAME s_scm_sys_symbol_dump
1002 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
;
1004 SCM_VALIDATE_SYMBOL (1, sym
);
1005 e1
= scm_cons (scm_from_latin1_symbol ("symbol"),
1007 e2
= scm_cons (scm_from_latin1_symbol ("hash"),
1008 scm_from_ulong (scm_i_symbol_hash (sym
)));
1009 e3
= scm_cons (scm_from_latin1_symbol ("interned"),
1010 scm_symbol_interned_p (sym
));
1011 buf
= SYMBOL_STRINGBUF (sym
);
1013 /* Stringbuf info */
1014 if (!STRINGBUF_WIDE (buf
))
1016 size_t len
= STRINGBUF_LENGTH (buf
);
1018 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
1019 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
1020 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1025 size_t len
= STRINGBUF_LENGTH (buf
);
1027 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
1028 u32_cpy ((scm_t_uint32
*) cbuf
,
1029 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
1030 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1033 e5
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1034 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
1035 if (STRINGBUF_SHARED (buf
))
1036 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1039 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1041 if (STRINGBUF_WIDE (buf
))
1042 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1045 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1047 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, SCM_UNDEFINED
);
1052 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1054 SCM_DEFINE (scm_sys_stringbuf_hist
, "%stringbuf-hist", 0, 0, 0, (void), "")
1055 #define FUNC_NAME s_scm_sys_stringbuf_hist
1058 for (i
= 0; i
< 1000; i
++)
1060 fprintf (stderr
, " %3d: %u\n", i
, lenhist
[i
]);
1061 fprintf (stderr
, ">999: %u\n", lenhist
[1000]);
1062 return SCM_UNSPECIFIED
;
1070 SCM_DEFINE (scm_string_p
, "string?", 1, 0, 0,
1072 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1073 #define FUNC_NAME s_scm_string_p
1075 return scm_from_bool (IS_STRING (obj
));
1080 SCM_REGISTER_PROC (s_scm_list_to_string
, "list->string", 1, 0, 0, scm_string
);
1082 SCM_DEFINE (scm_string
, "string", 0, 0, 1,
1084 "@deffnx {Scheme Procedure} list->string chrs\n"
1085 "Return a newly allocated string composed of the arguments,\n"
1087 #define FUNC_NAME s_scm_string
1089 SCM result
= SCM_BOOL_F
;
1096 /* Verify that this is a list of chars. */
1097 i
= scm_ilength (chrs
);
1098 SCM_ASSERT (i
>= 0, chrs
, SCM_ARG1
, FUNC_NAME
);
1103 while (len
> 0 && scm_is_pair (rest
))
1105 SCM elt
= SCM_CAR (rest
);
1106 SCM_VALIDATE_CHAR (SCM_ARGn
, elt
);
1107 if (SCM_CHAR (elt
) > 0xFF)
1109 rest
= SCM_CDR (rest
);
1111 scm_remember_upto_here_1 (elt
);
1114 /* Construct a string containing this list of chars. */
1122 result
= scm_i_make_string (len
, NULL
, 0);
1123 result
= scm_i_string_start_writing (result
);
1124 buf
= scm_i_string_writable_chars (result
);
1125 while (len
> 0 && scm_is_pair (rest
))
1127 SCM elt
= SCM_CAR (rest
);
1128 buf
[p
] = (unsigned char) SCM_CHAR (elt
);
1130 rest
= SCM_CDR (rest
);
1132 scm_remember_upto_here_1 (elt
);
1139 result
= scm_i_make_wide_string (len
, NULL
, 0);
1140 result
= scm_i_string_start_writing (result
);
1141 buf
= scm_i_string_writable_wide_chars (result
);
1142 while (len
> 0 && scm_is_pair (rest
))
1144 SCM elt
= SCM_CAR (rest
);
1145 buf
[p
] = SCM_CHAR (elt
);
1147 rest
= SCM_CDR (rest
);
1149 scm_remember_upto_here_1 (elt
);
1152 scm_i_string_stop_writing ();
1155 scm_misc_error (NULL
, "list changed while constructing string", SCM_EOL
);
1156 if (!scm_is_null (rest
))
1157 scm_wrong_type_arg_msg (NULL
, 0, chrs
, "proper list");
1163 SCM_DEFINE (scm_make_string
, "make-string", 1, 1, 0,
1165 "Return a newly allocated string of\n"
1166 "length @var{k}. If @var{chr} is given, then all elements of\n"
1167 "the string are initialized to @var{chr}, otherwise the contents\n"
1168 "of the string are all set to @code{#\nul}.")
1169 #define FUNC_NAME s_scm_make_string
1171 return scm_c_make_string (scm_to_size_t (k
), chr
);
1176 scm_c_make_string (size_t len
, SCM chr
)
1177 #define FUNC_NAME NULL
1180 char *contents
= NULL
;
1181 SCM res
= scm_i_make_string (len
, &contents
, 0);
1183 /* If no char is given, initialize string contents to NULL. */
1184 if (SCM_UNBNDP (chr
))
1185 memset (contents
, 0, len
);
1188 SCM_VALIDATE_CHAR (0, chr
);
1189 res
= scm_i_string_start_writing (res
);
1190 for (p
= 0; p
< len
; p
++)
1191 scm_i_string_set_x (res
, p
, SCM_CHAR (chr
));
1192 scm_i_string_stop_writing ();
1199 SCM_DEFINE (scm_string_length
, "string-length", 1, 0, 0,
1201 "Return the number of characters in @var{string}.")
1202 #define FUNC_NAME s_scm_string_length
1204 SCM_VALIDATE_STRING (1, string
);
1205 return scm_from_size_t (STRING_LENGTH (string
));
1209 SCM_DEFINE (scm_string_bytes_per_char
, "string-bytes-per-char", 1, 0, 0,
1211 "Return the bytes used to represent a character in @var{string}."
1212 "This will return 1 or 4.")
1213 #define FUNC_NAME s_scm_string_bytes_per_char
1215 SCM_VALIDATE_STRING (1, string
);
1216 if (!scm_i_is_narrow_string (string
))
1217 return scm_from_int (4);
1219 return scm_from_int (1);
1224 scm_c_string_length (SCM string
)
1226 if (!IS_STRING (string
))
1227 scm_wrong_type_arg_msg (NULL
, 0, string
, "string");
1228 return STRING_LENGTH (string
);
1231 SCM_DEFINE (scm_string_ref
, "string-ref", 2, 0, 0,
1233 "Return character @var{k} of @var{str} using zero-origin\n"
1234 "indexing. @var{k} must be a valid index of @var{str}.")
1235 #define FUNC_NAME s_scm_string_ref
1240 SCM_VALIDATE_STRING (1, str
);
1242 len
= scm_i_string_length (str
);
1243 if (SCM_LIKELY (len
> 0))
1244 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1246 scm_out_of_range (NULL
, k
);
1248 if (scm_i_is_narrow_string (str
))
1249 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[idx
]);
1251 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[idx
]);
1256 scm_c_string_ref (SCM str
, size_t p
)
1258 if (p
>= scm_i_string_length (str
))
1259 scm_out_of_range (NULL
, scm_from_size_t (p
));
1260 if (scm_i_is_narrow_string (str
))
1261 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[p
]);
1263 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[p
]);
1267 SCM_DEFINE (scm_string_set_x
, "string-set!", 3, 0, 0,
1268 (SCM str
, SCM k
, SCM chr
),
1269 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1270 "an unspecified value. @var{k} must be a valid index of\n"
1272 #define FUNC_NAME s_scm_string_set_x
1277 SCM_VALIDATE_STRING (1, str
);
1279 len
= scm_i_string_length (str
);
1280 if (SCM_LIKELY (len
> 0))
1281 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1283 scm_out_of_range (NULL
, k
);
1285 SCM_VALIDATE_CHAR (3, chr
);
1286 str
= scm_i_string_start_writing (str
);
1287 scm_i_string_set_x (str
, idx
, SCM_CHAR (chr
));
1288 scm_i_string_stop_writing ();
1290 return SCM_UNSPECIFIED
;
1295 scm_c_string_set_x (SCM str
, size_t p
, SCM chr
)
1297 if (p
>= scm_i_string_length (str
))
1298 scm_out_of_range (NULL
, scm_from_size_t (p
));
1299 str
= scm_i_string_start_writing (str
);
1300 scm_i_string_set_x (str
, p
, SCM_CHAR (chr
));
1301 scm_i_string_stop_writing ();
1304 SCM_DEFINE (scm_substring
, "substring", 2, 1, 0,
1305 (SCM str
, SCM start
, SCM end
),
1306 "Return a newly allocated string formed from the characters\n"
1307 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1308 "ending with index @var{end} (exclusive).\n"
1309 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1310 "exact integers satisfying:\n\n"
1311 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1312 #define FUNC_NAME s_scm_substring
1314 size_t len
, from
, to
;
1316 SCM_VALIDATE_STRING (1, str
);
1317 len
= scm_i_string_length (str
);
1318 from
= scm_to_unsigned_integer (start
, 0, len
);
1319 if (SCM_UNBNDP (end
))
1322 to
= scm_to_unsigned_integer (end
, from
, len
);
1323 return scm_i_substring (str
, from
, to
);
1327 SCM_DEFINE (scm_substring_read_only
, "substring/read-only", 2, 1, 0,
1328 (SCM str
, SCM start
, SCM end
),
1329 "Return a newly allocated string formed from the characters\n"
1330 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1331 "ending with index @var{end} (exclusive).\n"
1332 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1333 "exact integers satisfying:\n"
1335 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1337 "The returned string is read-only.\n")
1338 #define FUNC_NAME s_scm_substring_read_only
1340 size_t len
, from
, to
;
1342 SCM_VALIDATE_STRING (1, str
);
1343 len
= scm_i_string_length (str
);
1344 from
= scm_to_unsigned_integer (start
, 0, len
);
1345 if (SCM_UNBNDP (end
))
1348 to
= scm_to_unsigned_integer (end
, from
, len
);
1349 return scm_i_substring_read_only (str
, from
, to
);
1353 SCM_DEFINE (scm_substring_copy
, "substring/copy", 2, 1, 0,
1354 (SCM str
, SCM start
, SCM end
),
1355 "Return a newly allocated string formed from the characters\n"
1356 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1357 "ending with index @var{end} (exclusive).\n"
1358 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1359 "exact integers satisfying:\n\n"
1360 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1361 #define FUNC_NAME s_scm_substring_copy
1363 /* For the Scheme version, START is mandatory, but for the C
1364 version, it is optional. See scm_string_copy in srfi-13.c for a
1370 SCM_VALIDATE_STRING (1, str
);
1371 scm_i_get_substring_spec (scm_i_string_length (str
),
1372 start
, &from
, end
, &to
);
1373 return scm_i_substring_copy (str
, from
, to
);
1377 SCM_DEFINE (scm_substring_shared
, "substring/shared", 2, 1, 0,
1378 (SCM str
, SCM start
, SCM end
),
1379 "Return string that indirectly refers to the characters\n"
1380 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1381 "ending with index @var{end} (exclusive).\n"
1382 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1383 "exact integers satisfying:\n\n"
1384 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1385 #define FUNC_NAME s_scm_substring_shared
1387 size_t len
, from
, to
;
1389 SCM_VALIDATE_STRING (1, str
);
1390 len
= scm_i_string_length (str
);
1391 from
= scm_to_unsigned_integer (start
, 0, len
);
1392 if (SCM_UNBNDP (end
))
1395 to
= scm_to_unsigned_integer (end
, from
, len
);
1396 return scm_i_substring_shared (str
, from
, to
);
1400 SCM_DEFINE (scm_string_append
, "string-append", 0, 0, 1,
1402 "Return a newly allocated string whose characters form the\n"
1403 "concatenation of the given strings, @var{args}.")
1404 #define FUNC_NAME s_scm_string_append
1418 SCM_VALIDATE_REST_ARGUMENT (args
);
1419 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1422 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1423 len
= scm_i_string_length (s
);
1424 if (((size_t) -1) - total
< len
)
1425 scm_num_overflow (s_scm_string_append
);
1427 if (!scm_i_is_narrow_string (s
))
1432 res
= scm_i_make_string (total
, &data
.narrow
, 0);
1434 res
= scm_i_make_wide_string (total
, &data
.wide
, 0);
1436 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1440 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1441 len
= scm_i_string_length (s
);
1443 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL
);
1446 memcpy (data
.narrow
, scm_i_string_chars (s
), len
);
1451 if (scm_i_is_narrow_string (s
))
1453 const char *src
= scm_i_string_chars (s
);
1454 for (i
= 0; i
< len
; i
++)
1455 data
.wide
[i
] = (unsigned char) src
[i
];
1458 u32_cpy ((scm_t_uint32
*) data
.wide
,
1459 (scm_t_uint32
*) scm_i_string_wide_chars (s
), len
);
1463 scm_remember_upto_here_1 (s
);
1466 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL
);
1473 /* Charset conversion error handling. */
1475 SCM_SYMBOL (scm_encoding_error_key
, "encoding-error");
1476 SCM_SYMBOL (scm_decoding_error_key
, "decoding-error");
1478 /* Raise an exception informing that character CHR could not be written
1479 to PORT in its current encoding. */
1481 scm_encoding_error (const char *subr
, int err
, const char *message
,
1484 scm_throw (scm_encoding_error_key
,
1485 scm_list_n (scm_from_latin1_string (subr
),
1486 scm_from_latin1_string (message
),
1492 /* Raise an exception informing of an encoding error on PORT. This
1493 means that a character could not be written in PORT's encoding. */
1495 scm_decoding_error (const char *subr
, int err
, const char *message
, SCM port
)
1497 scm_throw (scm_decoding_error_key
,
1498 scm_list_n (scm_from_latin1_string (subr
),
1499 scm_from_latin1_string (message
),
1506 /* String conversion to/from C. */
1509 decoding_error (const char *func_name
, int errno_save
,
1510 const char *str
, size_t len
)
1512 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1517 buf
= scm_gc_malloc_pointerless (len
, "bytevector");
1518 memcpy (buf
, str
, len
);
1519 bv
= scm_c_take_gc_bytevector (buf
, len
, SCM_BOOL_F
);
1521 scm_decoding_error (func_name
, errno_save
,
1522 "input locale conversion error", bv
);
1526 scm_from_stringn (const char *str
, size_t len
, const char *encoding
,
1527 scm_t_string_failed_conversion_handler handler
)
1534 /* The order of these checks is important. */
1535 if (!str
&& len
!= 0)
1536 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL
);
1537 if (len
== (size_t) -1)
1540 if (c_strcasecmp (encoding
, "ISO-8859-1") == 0 || len
== 0)
1541 return scm_from_latin1_stringn (str
, len
);
1542 else if (c_strcasecmp (encoding
, "UTF-8") == 0
1543 && handler
== SCM_FAILED_CONVERSION_ERROR
)
1544 return scm_from_utf8_stringn (str
, len
);
1547 u32
= (scm_t_wchar
*) u32_conv_from_encoding (encoding
,
1548 (enum iconv_ilseq_handler
)
1554 if (SCM_UNLIKELY (u32
== NULL
))
1555 decoding_error (__func__
, errno
, str
, len
);
1559 if (u32
[i
++] > 0xFF)
1568 res
= scm_i_make_string (u32len
, &dst
, 0);
1569 for (i
= 0; i
< u32len
; i
++)
1570 dst
[i
] = (unsigned char) u32
[i
];
1576 res
= scm_i_make_wide_string (u32len
, &wdst
, 0);
1577 u32_cpy ((scm_t_uint32
*) wdst
, (scm_t_uint32
*) u32
, u32len
);
1586 scm_from_locale_string (const char *str
)
1588 return scm_from_locale_stringn (str
, -1);
1592 scm_from_locale_stringn (const char *str
, size_t len
)
1594 return scm_from_stringn (str
, len
, locale_charset (),
1595 scm_i_default_port_conversion_handler ());
1599 scm_from_latin1_string (const char *str
)
1601 return scm_from_latin1_stringn (str
, -1);
1605 scm_from_latin1_stringn (const char *str
, size_t len
)
1610 if (len
== (size_t) -1)
1613 /* Make a narrow string and copy STR as is. */
1614 result
= scm_i_make_string (len
, &buf
, 0);
1615 memcpy (buf
, str
, len
);
1621 scm_from_utf8_string (const char *str
)
1623 return scm_from_utf8_stringn (str
, -1);
1627 scm_from_utf8_stringn (const char *str
, size_t len
)
1630 const scm_t_uint8
*ustr
= (const scm_t_uint8
*) str
;
1631 int ascii
= 1, narrow
= 1;
1634 if (len
== (size_t) -1)
1654 nbytes
= u8_mbtouc (&c
, ustr
+ i
, len
- i
);
1658 decoding_error (__func__
, errno
, str
, len
);
1671 res
= scm_i_make_string (char_len
, &dst
, 0);
1672 memcpy (dst
, str
, len
);
1680 res
= scm_i_make_string (char_len
, &dst
, 0);
1682 for (i
= 0, j
= 0; i
< len
; j
++)
1684 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1685 dst
[j
] = (signed char) c
;
1694 res
= scm_i_make_wide_string (char_len
, &dst
, 0);
1696 for (i
= 0, j
= 0; i
< len
; j
++)
1698 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1707 scm_from_utf32_string (const scm_t_wchar
*str
)
1709 return scm_from_utf32_stringn (str
, -1);
1713 scm_from_utf32_stringn (const scm_t_wchar
*str
, size_t len
)
1718 if (len
== (size_t) -1)
1719 len
= u32_strlen ((uint32_t *) str
);
1721 result
= scm_i_make_wide_string (len
, &buf
, 0);
1722 memcpy (buf
, str
, len
* sizeof (scm_t_wchar
));
1723 scm_i_try_narrow_string (result
);
1729 scm_from_port_string (const char *str
, SCM port
)
1731 return scm_from_port_stringn (str
, -1, port
);
1735 scm_from_port_stringn (const char *str
, size_t len
, SCM port
)
1737 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
1738 scm_t_port_internal
*pti
= SCM_PORT_GET_INTERNAL (port
);
1740 if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_LATIN1
)
1741 return scm_from_latin1_stringn (str
, len
);
1742 else if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_UTF8
1743 && pt
->ilseq_handler
== SCM_FAILED_CONVERSION_ERROR
)
1744 return scm_from_utf8_stringn (str
, len
);
1746 return scm_from_stringn (str
, len
, pt
->encoding
, pt
->ilseq_handler
);
1749 /* Create a new scheme string from the C string STR. The memory of
1750 STR may be used directly as storage for the new string. */
1751 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1752 would be to register a finalizer to eventually free(3) STR, which isn't
1753 worth it. Should we just deprecate the `scm_take_' functions? */
1755 scm_take_locale_stringn (char *str
, size_t len
)
1759 res
= scm_from_locale_stringn (str
, len
);
1766 scm_take_locale_string (char *str
)
1768 return scm_take_locale_stringn (str
, -1);
1771 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1772 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1773 Set *LENP to the size of the resulting string.
1775 FIXME: This is a hack we should get rid of. See
1776 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1779 unistring_escapes_to_guile_escapes (char *buf
, size_t *lenp
)
1781 char *before
, *after
;
1790 if ((i
<= *lenp
- 6)
1791 && before
[i
] == '\\'
1792 && before
[i
+ 1] == 'u'
1793 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1795 /* Convert \u00NN to \xNN */
1798 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1799 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1803 else if ((i
<= *lenp
- 10)
1804 && before
[i
] == '\\'
1805 && before
[i
+ 1] == 'U'
1806 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1808 /* Convert \U00NNNNNN to \UNNNNNN */
1811 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1812 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1813 after
[j
+ 4] = tolower ((int) before
[i
+ 6]);
1814 after
[j
+ 5] = tolower ((int) before
[i
+ 7]);
1815 after
[j
+ 6] = tolower ((int) before
[i
+ 8]);
1816 after
[j
+ 7] = tolower ((int) before
[i
+ 9]);
1822 after
[j
] = before
[i
];
1830 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1831 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1832 of the resulting string. BUF must be large enough to handle the
1833 worst case when `\uXXXX' escapes (6 characters) are replaced by
1834 `\xXXXX;' (7 characters). */
1836 unistring_escapes_to_r6rs_escapes (char *buf
, size_t *lenp
)
1838 char *before
, *after
;
1840 /* The worst case is if the input string contains all 4-digit hex escapes.
1841 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1842 size_t max_out_len
= (*lenp
* 7) / 6 + 1;
1843 size_t nzeros
, ndigits
;
1846 after
= alloca (max_out_len
);
1851 if (((i
<= *lenp
- 6) && before
[i
] == '\\' && before
[i
+ 1] == 'u')
1852 || ((i
<= *lenp
- 10) && before
[i
] == '\\' && before
[i
+ 1] == 'U'))
1854 if (before
[i
+ 1] == 'u')
1856 else if (before
[i
+ 1] == 'U')
1861 /* Add the R6RS hex escape initial sequence. */
1865 /* Move string positions to the start of the hex numbers. */
1869 /* Find the number of initial zeros in this hex number. */
1871 while (before
[i
+ nzeros
] == '0' && nzeros
< ndigits
)
1874 /* Copy the number, skipping initial zeros, and then move the string
1876 if (nzeros
== ndigits
)
1885 for (pos
= 0; pos
< ndigits
- nzeros
; pos
++)
1886 after
[j
+ pos
] = tolower ((int) before
[i
+ nzeros
+ pos
]);
1888 j
+= (ndigits
- nzeros
);
1891 /* Add terminating semicolon. */
1897 after
[j
] = before
[i
];
1903 memcpy (before
, after
, j
);
1907 scm_to_locale_string (SCM str
)
1909 return scm_to_locale_stringn (str
, NULL
);
1913 scm_to_locale_stringn (SCM str
, size_t *lenp
)
1915 return scm_to_stringn (str
, lenp
,
1917 scm_i_default_port_conversion_handler ());
1921 scm_to_latin1_string (SCM str
)
1923 return scm_to_latin1_stringn (str
, NULL
);
1927 scm_to_latin1_stringn (SCM str
, size_t *lenp
)
1928 #define FUNC_NAME "scm_to_latin1_stringn"
1932 SCM_VALIDATE_STRING (1, str
);
1934 if (scm_i_is_narrow_string (str
))
1936 size_t len
= scm_i_string_length (str
);
1941 result
= scm_strndup (scm_i_string_data (str
), len
);
1944 result
= scm_to_stringn (str
, lenp
, NULL
,
1945 SCM_FAILED_CONVERSION_ERROR
);
1952 scm_to_utf8_string (SCM str
)
1954 return scm_to_utf8_stringn (str
, NULL
);
1958 latin1_u8_strlen (const scm_t_uint8
*str
, size_t len
)
1961 for (i
= 0, ret
= 0; i
< len
; i
++)
1962 ret
+= (str
[i
] < 128) ? 1 : 2;
1967 latin1_to_u8 (const scm_t_uint8
*str
, size_t latin_len
,
1968 scm_t_uint8
*u8_result
, size_t *u8_lenp
)
1971 size_t u8_len
= latin1_u8_strlen (str
, latin_len
);
1973 if (!(u8_result
&& u8_lenp
&& *u8_lenp
> u8_len
))
1974 u8_result
= scm_malloc (u8_len
+ 1);
1978 for (i
= 0, n
= 0; i
< latin_len
; i
++)
1979 n
+= u8_uctomb (u8_result
+ n
, str
[i
], u8_len
- n
);
1989 (Note that this includes code points that are not allowed by Unicode,
1990 but since this function has no way to report an error, and its
1991 purpose is to determine the size of destination buffers for
1992 libunicode conversion functions, we err on the safe side and handle
1993 everything that libunicode might conceivably handle, now or in the
1996 Char. number range | UTF-8 octet sequence
1997 (hexadecimal) | (binary)
1998 --------------------+------------------------------------------------------
1999 0000 0000-0000 007F | 0xxxxxxx
2000 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2001 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
2002 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2003 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2004 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2008 u32_u8_length_in_bytes (const scm_t_uint32
*str
, size_t len
)
2012 for (i
= 0, ret
= 0; i
< len
; i
++)
2014 scm_t_uint32 c
= str
[i
];
2018 else if (c
<= 0x7ff)
2020 else if (c
<= 0xffff)
2022 else if (c
<= 0x1fffff)
2024 else if (c
<= 0x3ffffff)
2034 scm_to_utf8_stringn (SCM str
, size_t *lenp
)
2035 #define FUNC_NAME "scm_to_utf8_stringn"
2037 SCM_VALIDATE_STRING (1, str
);
2039 if (scm_i_is_narrow_string (str
))
2040 return (char *) latin1_to_u8 ((scm_t_uint8
*) scm_i_string_chars (str
),
2041 scm_i_string_length (str
),
2045 scm_t_uint32
*chars
= (scm_t_uint32
*) scm_i_string_wide_chars (str
);
2046 scm_t_uint8
*buf
, *ret
;
2047 size_t num_chars
= scm_i_string_length (str
);
2048 size_t num_bytes_predicted
, num_bytes_actual
;
2050 num_bytes_predicted
= u32_u8_length_in_bytes (chars
, num_chars
);
2054 *lenp
= num_bytes_predicted
;
2055 buf
= scm_malloc (num_bytes_predicted
);
2059 buf
= scm_malloc (num_bytes_predicted
+ 1);
2060 buf
[num_bytes_predicted
] = 0;
2063 num_bytes_actual
= num_bytes_predicted
;
2064 ret
= u32_to_u8 (chars
, num_chars
, buf
, &num_bytes_actual
);
2066 if (SCM_LIKELY (ret
== buf
&& num_bytes_actual
== num_bytes_predicted
))
2067 return (char *) ret
;
2069 /* An error: a bad codepoint. */
2071 int saved_errno
= errno
;
2077 scm_decoding_error ("scm_to_utf8_stringn", errno
,
2078 "invalid codepoint in string", str
);
2088 scm_to_utf32_string (SCM str
)
2090 return scm_to_utf32_stringn (str
, NULL
);
2094 scm_to_utf32_stringn (SCM str
, size_t *lenp
)
2095 #define FUNC_NAME "scm_to_utf32_stringn"
2097 scm_t_wchar
*result
;
2099 SCM_VALIDATE_STRING (1, str
);
2101 if (scm_i_is_narrow_string (str
))
2103 scm_t_uint8
*codepoints
;
2106 codepoints
= (scm_t_uint8
*) scm_i_string_chars (str
);
2107 len
= scm_i_string_length (str
);
2111 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
2112 for (i
= 0; i
< len
; i
++)
2113 result
[i
] = codepoints
[i
];
2120 len
= scm_i_string_length (str
);
2124 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
2125 memcpy (result
, scm_i_string_wide_chars (str
),
2126 len
* sizeof (scm_t_wchar
));
2135 scm_to_port_string (SCM str
, SCM port
)
2137 return scm_to_port_stringn (str
, NULL
, port
);
2141 scm_to_port_stringn (SCM str
, size_t *lenp
, SCM port
)
2143 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
2144 scm_t_port_internal
*pti
= SCM_PORT_GET_INTERNAL (port
);
2146 if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_LATIN1
2147 && pt
->ilseq_handler
== SCM_FAILED_CONVERSION_ERROR
)
2148 return scm_to_latin1_stringn (str
, lenp
);
2149 else if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_UTF8
)
2150 return scm_to_utf8_stringn (str
, lenp
);
2152 return scm_to_stringn (str
, lenp
, pt
->encoding
, pt
->ilseq_handler
);
2155 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2156 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2157 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2158 defined by HANDLER. */
2160 scm_to_stringn (SCM str
, size_t *lenp
, const char *encoding
,
2161 scm_t_string_failed_conversion_handler handler
)
2164 size_t ilen
, len
, i
;
2168 if (!scm_is_string (str
))
2169 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
2170 ilen
= scm_i_string_length (str
);
2174 buf
= scm_malloc (1);
2182 for (i
= 0; i
< ilen
; i
++)
2183 if (scm_i_string_ref (str
, i
) == '\0')
2184 scm_misc_error (NULL
,
2185 "string contains #\\nul character: ~S",
2188 if (scm_i_is_narrow_string (str
)
2189 && c_strcasecmp (encoding
, "ISO-8859-1") == 0)
2191 /* If using native Latin-1 encoding, just copy the string
2195 buf
= scm_malloc (ilen
);
2196 memcpy (buf
, scm_i_string_chars (str
), ilen
);
2202 buf
= scm_malloc (ilen
+ 1);
2203 memcpy (buf
, scm_i_string_chars (str
), ilen
);
2215 if (scm_i_is_narrow_string (str
))
2217 ret
= mem_iconveh (scm_i_string_chars (str
), ilen
,
2219 (enum iconv_ilseq_handler
) handler
, NULL
,
2223 scm_encoding_error (__func__
, errno
,
2224 "cannot convert narrow string to output locale",
2226 /* FIXME: Faulty character unknown. */
2231 buf
= u32_conv_to_encoding (enc
,
2232 (enum iconv_ilseq_handler
) handler
,
2233 (scm_t_uint32
*) scm_i_string_wide_chars (str
),
2238 scm_encoding_error (__func__
, errno
,
2239 "cannot convert wide string to output locale",
2241 /* FIXME: Faulty character unknown. */
2244 if (handler
== SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE
)
2246 if (SCM_R6RS_ESCAPES_P
)
2248 /* The worst case is if the input string contains all 4-digit
2249 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2250 (seven characters). Make BUF large enough to hold
2252 buf
= scm_realloc (buf
, (len
* 7) / 6 + 1);
2253 unistring_escapes_to_r6rs_escapes (buf
, &len
);
2256 unistring_escapes_to_guile_escapes (buf
, &len
);
2258 buf
= scm_realloc (buf
, len
);
2264 buf
= scm_realloc (buf
, len
+ 1);
2268 scm_remember_upto_here_1 (str
);
2273 scm_to_locale_stringbuf (SCM str
, char *buf
, size_t max_len
)
2276 char *result
= NULL
;
2277 if (!scm_is_string (str
))
2278 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
2279 result
= scm_to_locale_stringn (str
, &len
);
2281 memcpy (buf
, result
, (len
> max_len
) ? max_len
: len
);
2284 scm_remember_upto_here_1 (str
);
2289 /* Unicode string normalization. */
2291 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2292 libguile/i18n.c. It would be useful to have this factored out into a more
2293 convenient location, but its use of alloca makes that tricky to do. */
2296 normalize_str (SCM string
, uninorm_t form
)
2299 scm_t_uint32
*w_str
;
2301 size_t rlen
, len
= scm_i_string_length (string
);
2303 if (scm_i_is_narrow_string (string
))
2306 const char *buf
= scm_i_string_chars (string
);
2308 w_str
= alloca (sizeof (scm_t_wchar
) * (len
+ 1));
2310 for (i
= 0; i
< len
; i
++)
2311 w_str
[i
] = (unsigned char) buf
[i
];
2315 w_str
= (scm_t_uint32
*) scm_i_string_wide_chars (string
);
2317 w_str
= u32_normalize (form
, w_str
, len
, NULL
, &rlen
);
2319 ret
= scm_i_make_wide_string (rlen
, &cbuf
, 0);
2320 u32_cpy ((scm_t_uint32
*) cbuf
, w_str
, rlen
);
2323 scm_i_try_narrow_string (ret
);
2328 SCM_DEFINE (scm_string_normalize_nfc
, "string-normalize-nfc", 1, 0, 0,
2330 "Returns the NFC normalized form of @var{string}.")
2331 #define FUNC_NAME s_scm_string_normalize_nfc
2333 SCM_VALIDATE_STRING (1, string
);
2334 return normalize_str (string
, UNINORM_NFC
);
2338 SCM_DEFINE (scm_string_normalize_nfd
, "string-normalize-nfd", 1, 0, 0,
2340 "Returns the NFD normalized form of @var{string}.")
2341 #define FUNC_NAME s_scm_string_normalize_nfd
2343 SCM_VALIDATE_STRING (1, string
);
2344 return normalize_str (string
, UNINORM_NFD
);
2348 SCM_DEFINE (scm_string_normalize_nfkc
, "string-normalize-nfkc", 1, 0, 0,
2350 "Returns the NFKC normalized form of @var{string}.")
2351 #define FUNC_NAME s_scm_string_normalize_nfkc
2353 SCM_VALIDATE_STRING (1, string
);
2354 return normalize_str (string
, UNINORM_NFKC
);
2358 SCM_DEFINE (scm_string_normalize_nfkd
, "string-normalize-nfkd", 1, 0, 0,
2360 "Returns the NFKD normalized form of @var{string}.")
2361 #define FUNC_NAME s_scm_string_normalize_nfkd
2363 SCM_VALIDATE_STRING (1, string
);
2364 return normalize_str (string
, UNINORM_NFKD
);
2368 /* converts C scm_array of strings to SCM scm_list of strings.
2369 If argc < 0, a null terminated scm_array is assumed.
2370 The current locale encoding is assumed */
2372 scm_makfromstrs (int argc
, char **argv
)
2377 for (i
= 0; argv
[i
]; i
++);
2379 lst
= scm_cons (scm_from_locale_string (argv
[i
]), lst
);
2383 /* Return a newly allocated array of char pointers to each of the strings
2384 in args, with a terminating NULL pointer. The strings are encoded using
2385 the current locale. */
2388 scm_i_allocate_string_pointers (SCM list
)
2389 #define FUNC_NAME "scm_i_allocate_string_pointers"
2392 int list_len
= scm_ilength (list
);
2396 scm_wrong_type_arg_msg (NULL
, 0, list
, "proper list");
2398 result
= scm_gc_malloc ((list_len
+ 1) * sizeof (char *),
2400 result
[list_len
] = NULL
;
2402 /* The list might have been modified in another thread, so
2403 we check LIST before each access.
2405 for (i
= 0; i
< list_len
&& scm_is_pair (list
); i
++)
2407 SCM str
= SCM_CAR (list
);
2408 size_t len
; /* String length in bytes */
2409 char *c_str
= scm_to_locale_stringn (str
, &len
);
2411 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2412 scm_malloc to allocate the returned string, which must be
2413 explicitly deallocated. This forces us to copy the string a
2414 second time into a new buffer. Ideally there would be variants
2415 of scm_to_*_stringn that can return garbage-collected buffers. */
2417 result
[i
] = scm_gc_malloc_pointerless (len
+ 1, "string");
2418 memcpy (result
[i
], c_str
, len
);
2419 result
[i
][len
] = '\0';
2422 list
= SCM_CDR (list
);
2430 scm_i_get_substring_spec (size_t len
,
2431 SCM start
, size_t *cstart
,
2432 SCM end
, size_t *cend
)
2434 if (SCM_UNBNDP (start
))
2437 *cstart
= scm_to_unsigned_integer (start
, 0, len
);
2439 if (SCM_UNBNDP (end
))
2442 *cend
= scm_to_unsigned_integer (end
, *cstart
, len
);
2446 string_handle_ref (scm_t_array_handle
*h
, size_t index
)
2448 return scm_c_string_ref (h
->array
, index
);
2452 string_handle_set (scm_t_array_handle
*h
, size_t index
, SCM val
)
2454 scm_c_string_set_x (h
->array
, index
, val
);
2458 string_get_handle (SCM v
, scm_t_array_handle
*h
)
2464 h
->dim0
.ubnd
= scm_c_string_length (v
) - 1;
2466 h
->element_type
= SCM_ARRAY_ELEMENT_TYPE_CHAR
;
2467 h
->elements
= h
->writable_elements
= NULL
;
2470 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string
, 0x7f,
2471 string_handle_ref
, string_handle_set
,
2473 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR
, scm_make_string
)
2478 scm_nullstr
= scm_i_make_string (0, NULL
, 0);
2480 #include "libguile/strings.x"