1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "striconveh.h"
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
94 #define SET_STRINGBUF_SHARED(buf) \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist
[1001];
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
111 make_stringbuf (size_t len
)
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
129 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ len
+ 1,
132 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
);
133 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
135 STRINGBUF_CHARS (buf
)[len
] = 0;
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
143 make_wide_stringbuf (size_t len
)
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
155 raw_len
= (len
+ 1) * sizeof (scm_t_wchar
);
156 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ raw_len
,
159 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
| STRINGBUF_F_WIDE
);
160 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
162 STRINGBUF_WIDE_CHARS (buf
)[len
] = 0;
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
170 wide_stringbuf (SCM buf
)
174 if (STRINGBUF_WIDE (buf
))
181 len
= STRINGBUF_LENGTH (buf
);
183 new_buf
= make_wide_stringbuf (len
);
185 mem
= STRINGBUF_WIDE_CHARS (new_buf
);
186 for (i
= 0; i
< len
; i
++)
187 mem
[i
] = (scm_t_wchar
) STRINGBUF_CHARS (buf
)[i
];
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
197 narrow_stringbuf (SCM buf
)
201 if (!STRINGBUF_WIDE (buf
))
209 len
= STRINGBUF_LENGTH (buf
);
210 wmem
= STRINGBUF_WIDE_CHARS (buf
);
212 for (i
= 0; i
< len
; i
++)
214 /* BUF cannot be narrowed. */
217 new_buf
= make_stringbuf (len
);
219 mem
= STRINGBUF_CHARS (new_buf
);
220 for (i
= 0; i
< len
; i
++)
221 mem
[i
] = (unsigned char) wmem
[i
];
228 scm_i_pthread_mutex_t stringbuf_write_mutex
= SCM_I_PTHREAD_MUTEX_INITIALIZER
;
231 /* Copy-on-write strings.
234 #define STRING_TAG scm_tc7_string
236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
243 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
245 /* Read-only strings.
248 #define RO_STRING_TAG scm_tc7_ro_string
249 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
251 /* Mutation-sharing substrings
254 #define SH_STRING_TAG (scm_tc7_string + 0x100)
256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257 /* START and LENGTH as for STRINGs. */
259 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
268 scm_i_make_string (size_t len
, char **charsp
, int read_only_p
)
270 static SCM null_stringbuf
= SCM_BOOL_F
;
276 if (SCM_UNLIKELY (scm_is_false (null_stringbuf
)))
278 null_stringbuf
= make_stringbuf (0);
279 SET_STRINGBUF_SHARED (null_stringbuf
);
281 buf
= null_stringbuf
;
284 buf
= make_stringbuf (len
);
287 *charsp
= (char *) STRINGBUF_CHARS (buf
);
288 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
290 (scm_t_bits
) 0, (scm_t_bits
) len
);
294 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
295 characters. CHARSP, if not NULL, will be set to location of the
296 character array. If READ_ONLY_P, the returned string is read-only;
297 otherwise it is writable. */
299 scm_i_make_wide_string (size_t len
, scm_t_wchar
**charsp
, int read_only_p
)
301 SCM buf
= make_wide_stringbuf (len
);
304 *charsp
= STRINGBUF_WIDE_CHARS (buf
);
305 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
307 (scm_t_bits
) 0, (scm_t_bits
) len
);
312 validate_substring_args (SCM str
, size_t start
, size_t end
)
314 if (!IS_STRING (str
))
315 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
316 if (start
> STRING_LENGTH (str
))
317 scm_out_of_range (NULL
, scm_from_size_t (start
));
318 if (end
> STRING_LENGTH (str
) || end
< start
)
319 scm_out_of_range (NULL
, scm_from_size_t (end
));
323 get_str_buf_start (SCM
*str
, SCM
*buf
, size_t *start
)
325 *start
= STRING_START (*str
);
326 if (IS_SH_STRING (*str
))
328 *str
= SH_STRING_STRING (*str
);
329 *start
+= STRING_START (*str
);
331 *buf
= STRING_STRINGBUF (*str
);
335 scm_i_substring (SCM str
, size_t start
, size_t end
)
338 return scm_i_make_string (0, NULL
, 0);
343 get_str_buf_start (&str
, &buf
, &str_start
);
344 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
345 SET_STRINGBUF_SHARED (buf
);
346 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
347 return scm_double_cell (STRING_TAG
, SCM_UNPACK(buf
),
348 (scm_t_bits
)str_start
+ start
,
349 (scm_t_bits
) end
- start
);
354 scm_i_substring_read_only (SCM str
, size_t start
, size_t end
)
357 return scm_i_make_string (0, NULL
, 1);
362 get_str_buf_start (&str
, &buf
, &str_start
);
363 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
364 SET_STRINGBUF_SHARED (buf
);
365 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
366 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK(buf
),
367 (scm_t_bits
)str_start
+ start
,
368 (scm_t_bits
) end
- start
);
373 scm_i_substring_copy (SCM str
, size_t start
, size_t end
)
376 return scm_i_make_string (0, NULL
, 0);
379 size_t len
= end
- start
;
380 SCM buf
, my_buf
, substr
;
383 get_str_buf_start (&str
, &buf
, &str_start
);
384 if (scm_i_is_narrow_string (str
))
386 my_buf
= make_stringbuf (len
);
387 memcpy (STRINGBUF_CHARS (my_buf
),
388 STRINGBUF_CHARS (buf
) + str_start
+ start
, len
);
392 my_buf
= make_wide_stringbuf (len
);
393 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (my_buf
),
394 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
) + str_start
398 scm_remember_upto_here_1 (buf
);
399 substr
= scm_double_cell (STRING_TAG
, SCM_UNPACK (my_buf
),
400 (scm_t_bits
) 0, (scm_t_bits
) len
);
402 scm_i_try_narrow_string (substr
);
408 scm_i_substring_shared (SCM str
, size_t start
, size_t end
)
410 if (start
== 0 && end
== STRING_LENGTH (str
))
412 else if (start
== end
)
413 return scm_i_make_string (0, NULL
, 0);
416 size_t len
= end
- start
;
417 if (IS_SH_STRING (str
))
419 start
+= STRING_START (str
);
420 str
= SH_STRING_STRING (str
);
422 return scm_double_cell (SH_STRING_TAG
, SCM_UNPACK(str
),
423 (scm_t_bits
)start
, (scm_t_bits
) len
);
428 scm_c_substring (SCM str
, size_t start
, size_t end
)
430 validate_substring_args (str
, start
, end
);
431 return scm_i_substring (str
, start
, end
);
435 scm_c_substring_read_only (SCM str
, size_t start
, size_t end
)
437 validate_substring_args (str
, start
, end
);
438 return scm_i_substring_read_only (str
, start
, end
);
442 scm_c_substring_copy (SCM str
, size_t start
, size_t end
)
444 validate_substring_args (str
, start
, end
);
445 return scm_i_substring_copy (str
, start
, end
);
449 scm_c_substring_shared (SCM str
, size_t start
, size_t end
)
451 validate_substring_args (str
, start
, end
);
452 return scm_i_substring_shared (str
, start
, end
);
456 /* Internal accessors
459 /* Returns the number of characters in STR. This may be different
460 than the memory size of the string storage. */
462 scm_i_string_length (SCM str
)
464 return STRING_LENGTH (str
);
467 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
468 encoding. False if it is 'wide', having a 32-bit UCS-4
471 scm_i_is_narrow_string (SCM str
)
473 if (IS_SH_STRING (str
))
474 str
= SH_STRING_STRING (str
);
476 return !STRINGBUF_WIDE (STRING_STRINGBUF (str
));
479 /* Try to coerce a string to be narrow. It if is narrow already, do
480 nothing. If it is wide, shrink it to narrow if none of its
481 characters are above 0xFF. Return true if the string is narrow or
482 was made to be narrow. */
484 scm_i_try_narrow_string (SCM str
)
486 if (IS_SH_STRING (str
))
487 str
= SH_STRING_STRING (str
);
489 SET_STRING_STRINGBUF (str
, narrow_stringbuf (STRING_STRINGBUF (str
)));
491 return scm_i_is_narrow_string (str
);
494 /* Return a pointer to the raw data of the string, which can be either Latin-1
495 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
497 scm_i_string_data (SCM str
)
503 get_str_buf_start (&str
, &buf
, &start
);
505 data
= STRINGBUF_CONTENTS (buf
);
506 data
+= start
* (scm_i_is_narrow_string (str
) ? 1 : 4);
511 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
514 scm_i_string_chars (SCM str
)
518 get_str_buf_start (&str
, &buf
, &start
);
519 if (scm_i_is_narrow_string (str
))
520 return (const char *) STRINGBUF_CHARS (buf
) + start
;
522 scm_misc_error (NULL
, "Invalid read access of chars of wide string: ~s",
527 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
530 scm_i_string_wide_chars (SCM str
)
535 get_str_buf_start (&str
, &buf
, &start
);
536 if (!scm_i_is_narrow_string (str
))
537 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
) + start
;
539 scm_misc_error (NULL
, "Invalid read access of chars of narrow string: ~s",
543 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
544 a new string buffer, so that it can be modified without modifying
545 other strings. Also, lock the string mutex. Later, one must call
546 scm_i_string_stop_writing to unlock the mutex. */
548 scm_i_string_start_writing (SCM orig_str
)
550 SCM buf
, str
= orig_str
;
553 get_str_buf_start (&str
, &buf
, &start
);
554 if (IS_RO_STRING (str
))
555 scm_misc_error (NULL
, "string is read-only: ~s", scm_list_1 (orig_str
));
557 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
558 if (STRINGBUF_SHARED (buf
))
560 /* Clone the stringbuf. */
561 size_t len
= STRING_LENGTH (str
);
564 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
566 if (scm_i_is_narrow_string (str
))
568 new_buf
= make_stringbuf (len
);
569 memcpy (STRINGBUF_CHARS (new_buf
),
570 STRINGBUF_CHARS (buf
) + STRING_START (str
), len
);
575 new_buf
= make_wide_stringbuf (len
);
576 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
577 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
)
578 + STRING_START (str
)), len
);
581 SET_STRING_STRINGBUF (str
, new_buf
);
582 start
-= STRING_START (str
);
584 /* FIXME: The following operations are not atomic, so other threads
585 looking at STR may see an inconsistent state. Nevertheless it can't
586 hurt much since (i) accessing STR while it is being mutated can't
587 yield a crash, and (ii) concurrent accesses to STR should be
588 protected by a mutex at the application level. The latter may not
589 apply when STR != ORIG_STR, though. */
590 SET_STRING_START (str
, 0);
591 SET_STRING_STRINGBUF (str
, new_buf
);
595 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
600 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
602 scm_i_string_writable_chars (SCM str
)
607 get_str_buf_start (&str
, &buf
, &start
);
608 if (scm_i_is_narrow_string (str
))
609 return (char *) STRINGBUF_CHARS (buf
) + start
;
611 scm_misc_error (NULL
, "Invalid write access of chars of wide string: ~s",
616 /* Return a pointer to the UCS-4 codepoints of a string. */
618 scm_i_string_writable_wide_chars (SCM str
)
623 get_str_buf_start (&str
, &buf
, &start
);
624 if (!scm_i_is_narrow_string (str
))
625 return STRINGBUF_WIDE_CHARS (buf
) + start
;
627 scm_misc_error (NULL
, "Invalid write access of chars of narrow string: ~s",
631 /* Unlock the string mutex that was locked when
632 scm_i_string_start_writing was called. */
634 scm_i_string_stop_writing (void)
636 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
639 /* Return the Xth character of STR as a UCS-4 codepoint. */
641 scm_i_string_ref (SCM str
, size_t x
)
643 if (scm_i_is_narrow_string (str
))
644 return (scm_t_wchar
) (unsigned char) (scm_i_string_chars (str
)[x
]);
646 return scm_i_string_wide_chars (str
)[x
];
649 /* Returns index+1 of the first char in STR that matches C, or
650 0 if the char is not found. */
652 scm_i_string_contains_char (SCM str
, char ch
)
655 size_t len
= scm_i_string_length (str
);
658 if (scm_i_is_narrow_string (str
))
662 if (scm_i_string_chars (str
)[i
] == ch
)
671 if (scm_i_string_wide_chars (str
)[i
]
672 == (unsigned char) ch
)
681 scm_i_string_strcmp (SCM sstr
, size_t start_x
, const char *cstr
)
683 if (scm_i_is_narrow_string (sstr
))
685 const char *a
= scm_i_string_chars (sstr
) + start_x
;
686 const char *b
= cstr
;
687 return strncmp (a
, b
, strlen(b
));
692 const scm_t_wchar
*a
= scm_i_string_wide_chars (sstr
) + start_x
;
693 const char *b
= cstr
;
694 for (i
= 0; i
< strlen (b
); i
++)
696 if (a
[i
] != (unsigned char) b
[i
])
703 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
705 scm_i_string_set_x (SCM str
, size_t p
, scm_t_wchar chr
)
707 if (IS_SH_STRING (str
))
709 p
+= STRING_START (str
);
710 str
= SH_STRING_STRING (str
);
713 if (chr
> 0xFF && scm_i_is_narrow_string (str
))
714 SET_STRING_STRINGBUF (str
, wide_stringbuf (STRING_STRINGBUF (str
)));
716 if (scm_i_is_narrow_string (str
))
718 char *dst
= scm_i_string_writable_chars (str
);
723 scm_t_wchar
*dst
= scm_i_string_writable_wide_chars (str
);
731 Basic symbol creation and accessing is done here, the rest is in
732 symbols.[hc]. This has been done to keep stringbufs and the
733 internals of strings and string-like objects confined to this file.
736 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
739 scm_i_make_symbol (SCM name
, scm_t_bits flags
,
740 unsigned long hash
, SCM props
)
743 size_t start
= STRING_START (name
);
744 size_t length
= STRING_LENGTH (name
);
746 if (IS_SH_STRING (name
))
748 name
= SH_STRING_STRING (name
);
749 start
+= STRING_START (name
);
751 buf
= SYMBOL_STRINGBUF (name
);
753 if (start
== 0 && length
== STRINGBUF_LENGTH (buf
))
756 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
757 SET_STRINGBUF_SHARED (buf
);
758 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
763 if (scm_i_is_narrow_string (name
))
765 SCM new_buf
= make_stringbuf (length
);
766 memcpy (STRINGBUF_CHARS (new_buf
),
767 STRINGBUF_CHARS (buf
) + start
, length
);
772 SCM new_buf
= make_wide_stringbuf (length
);
773 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
774 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
) + start
,
779 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
780 (scm_t_bits
) hash
, SCM_UNPACK (props
));
784 scm_i_c_make_symbol (const char *name
, size_t len
,
785 scm_t_bits flags
, unsigned long hash
, SCM props
)
787 SCM buf
= make_stringbuf (len
);
788 memcpy (STRINGBUF_CHARS (buf
), name
, len
);
790 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
791 (scm_t_bits
) hash
, SCM_UNPACK (props
));
794 /* Returns the number of characters in SYM. This may be different
795 from the memory size of SYM. */
797 scm_i_symbol_length (SCM sym
)
799 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
803 scm_c_symbol_length (SCM sym
)
804 #define FUNC_NAME "scm_c_symbol_length"
806 SCM_VALIDATE_SYMBOL (1, sym
);
808 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
812 /* True if the name of SYM is stored as a Latin-1 encoded string.
813 False if it is stored as a 32-bit UCS-4-encoded string. */
815 scm_i_is_narrow_symbol (SCM sym
)
819 buf
= SYMBOL_STRINGBUF (sym
);
820 return !STRINGBUF_WIDE (buf
);
823 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
824 contains the name of SYM. */
826 scm_i_symbol_chars (SCM sym
)
830 buf
= SYMBOL_STRINGBUF (sym
);
831 if (!STRINGBUF_WIDE (buf
))
832 return (const char *) STRINGBUF_CHARS (buf
);
834 scm_misc_error (NULL
, "Invalid access of chars of a wide symbol ~S",
838 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
841 scm_i_symbol_wide_chars (SCM sym
)
845 buf
= SYMBOL_STRINGBUF (sym
);
846 if (STRINGBUF_WIDE (buf
))
847 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
);
849 scm_misc_error (NULL
, "Invalid access of chars of a narrow symbol ~S",
854 scm_i_symbol_substring (SCM sym
, size_t start
, size_t end
)
856 SCM buf
= SYMBOL_STRINGBUF (sym
);
857 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
858 SET_STRINGBUF_SHARED (buf
);
859 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
860 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK (buf
),
861 (scm_t_bits
)start
, (scm_t_bits
) end
- start
);
864 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
866 scm_i_symbol_ref (SCM sym
, size_t x
)
868 if (scm_i_is_narrow_symbol (sym
))
869 return (scm_t_wchar
) (unsigned char) (scm_i_symbol_chars (sym
)[x
]);
871 return scm_i_symbol_wide_chars (sym
)[x
];
877 SCM_DEFINE (scm_sys_string_dump
, "%string-dump", 1, 0, 0, (SCM str
),
878 "Returns an association list containing debugging information\n"
879 "for @var{str}. The association list has the following entries."
882 "The string itself.\n"
884 "The start index of the string into its stringbuf\n"
886 "The length of the string\n"
888 "If this string is a substring, it returns its parent string.\n"
889 "Otherwise, it returns @code{#f}\n"
891 "@code{#t} if the string is read-only\n"
892 "@item stringbuf-chars\n"
893 "A new string containing this string's stringbuf's characters\n"
894 "@item stringbuf-length\n"
895 "The number of characters in this stringbuf\n"
896 "@item stringbuf-shared\n"
897 "@code{#t} if this stringbuf is shared\n"
898 "@item stringbuf-wide\n"
899 "@code{#t} if this stringbuf's characters are stored in a\n"
900 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
903 #define FUNC_NAME s_scm_sys_string_dump
905 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
;
907 SCM_VALIDATE_STRING (1, str
);
910 e1
= scm_cons (scm_from_latin1_symbol ("string"),
912 e2
= scm_cons (scm_from_latin1_symbol ("start"),
913 scm_from_size_t (STRING_START (str
)));
914 e3
= scm_cons (scm_from_latin1_symbol ("length"),
915 scm_from_size_t (STRING_LENGTH (str
)));
917 if (IS_SH_STRING (str
))
919 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
920 SH_STRING_STRING (str
));
921 buf
= STRING_STRINGBUF (SH_STRING_STRING (str
));
925 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
927 buf
= STRING_STRINGBUF (str
);
930 if (IS_RO_STRING (str
))
931 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
934 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
938 if (!STRINGBUF_WIDE (buf
))
940 size_t len
= STRINGBUF_LENGTH (buf
);
942 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
943 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
944 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
949 size_t len
= STRINGBUF_LENGTH (buf
);
951 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
952 u32_cpy ((scm_t_uint32
*) cbuf
,
953 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
954 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
957 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
958 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
959 if (STRINGBUF_SHARED (buf
))
960 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
963 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
965 if (STRINGBUF_WIDE (buf
))
966 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
969 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
972 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
, SCM_UNDEFINED
);
976 SCM_DEFINE (scm_sys_symbol_dump
, "%symbol-dump", 1, 0, 0, (SCM sym
),
977 "Returns an association list containing debugging information\n"
978 "for @var{sym}. The association list has the following entries."
981 "The symbol itself\n"
985 "@code{#t} if it is an interned symbol\n"
986 "@item stringbuf-chars\n"
987 "A new string containing this symbols's stringbuf's characters\n"
988 "@item stringbuf-length\n"
989 "The number of characters in this stringbuf\n"
990 "@item stringbuf-shared\n"
991 "@code{#t} if this stringbuf is shared\n"
992 "@item stringbuf-wide\n"
993 "@code{#t} if this stringbuf's characters are stored in a\n"
994 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
997 #define FUNC_NAME s_scm_sys_symbol_dump
999 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
;
1001 SCM_VALIDATE_SYMBOL (1, sym
);
1002 e1
= scm_cons (scm_from_latin1_symbol ("symbol"),
1004 e2
= scm_cons (scm_from_latin1_symbol ("hash"),
1005 scm_from_ulong (scm_i_symbol_hash (sym
)));
1006 e3
= scm_cons (scm_from_latin1_symbol ("interned"),
1007 scm_symbol_interned_p (sym
));
1008 buf
= SYMBOL_STRINGBUF (sym
);
1010 /* Stringbuf info */
1011 if (!STRINGBUF_WIDE (buf
))
1013 size_t len
= STRINGBUF_LENGTH (buf
);
1015 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
1016 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
1017 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1022 size_t len
= STRINGBUF_LENGTH (buf
);
1024 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
1025 u32_cpy ((scm_t_uint32
*) cbuf
,
1026 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
1027 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1030 e5
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1031 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
1032 if (STRINGBUF_SHARED (buf
))
1033 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1036 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1038 if (STRINGBUF_WIDE (buf
))
1039 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1042 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1044 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, SCM_UNDEFINED
);
1049 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1051 SCM_DEFINE (scm_sys_stringbuf_hist
, "%stringbuf-hist", 0, 0, 0, (void), "")
1052 #define FUNC_NAME s_scm_sys_stringbuf_hist
1055 for (i
= 0; i
< 1000; i
++)
1057 fprintf (stderr
, " %3d: %u\n", i
, lenhist
[i
]);
1058 fprintf (stderr
, ">999: %u\n", lenhist
[1000]);
1059 return SCM_UNSPECIFIED
;
1067 SCM_DEFINE (scm_string_p
, "string?", 1, 0, 0,
1069 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1070 #define FUNC_NAME s_scm_string_p
1072 return scm_from_bool (IS_STRING (obj
));
1077 SCM_REGISTER_PROC (s_scm_list_to_string
, "list->string", 1, 0, 0, scm_string
);
1079 SCM_DEFINE (scm_string
, "string", 0, 0, 1,
1081 "@deffnx {Scheme Procedure} list->string chrs\n"
1082 "Return a newly allocated string composed of the arguments,\n"
1084 #define FUNC_NAME s_scm_string
1086 SCM result
= SCM_BOOL_F
;
1093 /* Verify that this is a list of chars. */
1094 i
= scm_ilength (chrs
);
1095 SCM_ASSERT (i
>= 0, chrs
, SCM_ARG1
, FUNC_NAME
);
1100 while (len
> 0 && scm_is_pair (rest
))
1102 SCM elt
= SCM_CAR (rest
);
1103 SCM_VALIDATE_CHAR (SCM_ARGn
, elt
);
1104 if (SCM_CHAR (elt
) > 0xFF)
1106 rest
= SCM_CDR (rest
);
1108 scm_remember_upto_here_1 (elt
);
1111 /* Construct a string containing this list of chars. */
1119 result
= scm_i_make_string (len
, NULL
, 0);
1120 result
= scm_i_string_start_writing (result
);
1121 buf
= scm_i_string_writable_chars (result
);
1122 while (len
> 0 && scm_is_pair (rest
))
1124 SCM elt
= SCM_CAR (rest
);
1125 buf
[p
] = (unsigned char) SCM_CHAR (elt
);
1127 rest
= SCM_CDR (rest
);
1129 scm_remember_upto_here_1 (elt
);
1136 result
= scm_i_make_wide_string (len
, NULL
, 0);
1137 result
= scm_i_string_start_writing (result
);
1138 buf
= scm_i_string_writable_wide_chars (result
);
1139 while (len
> 0 && scm_is_pair (rest
))
1141 SCM elt
= SCM_CAR (rest
);
1142 buf
[p
] = SCM_CHAR (elt
);
1144 rest
= SCM_CDR (rest
);
1146 scm_remember_upto_here_1 (elt
);
1149 scm_i_string_stop_writing ();
1152 scm_misc_error (NULL
, "list changed while constructing string", SCM_EOL
);
1153 if (!scm_is_null (rest
))
1154 scm_wrong_type_arg_msg (NULL
, 0, chrs
, "proper list");
1160 SCM_DEFINE (scm_make_string
, "make-string", 1, 1, 0,
1162 "Return a newly allocated string of\n"
1163 "length @var{k}. If @var{chr} is given, then all elements of\n"
1164 "the string are initialized to @var{chr}, otherwise the contents\n"
1165 "of the string are all set to @code{#\nul}.")
1166 #define FUNC_NAME s_scm_make_string
1168 return scm_c_make_string (scm_to_size_t (k
), chr
);
1173 scm_c_make_string (size_t len
, SCM chr
)
1174 #define FUNC_NAME NULL
1177 char *contents
= NULL
;
1178 SCM res
= scm_i_make_string (len
, &contents
, 0);
1180 /* If no char is given, initialize string contents to NULL. */
1181 if (SCM_UNBNDP (chr
))
1182 memset (contents
, 0, len
);
1185 SCM_VALIDATE_CHAR (0, chr
);
1186 res
= scm_i_string_start_writing (res
);
1187 for (p
= 0; p
< len
; p
++)
1188 scm_i_string_set_x (res
, p
, SCM_CHAR (chr
));
1189 scm_i_string_stop_writing ();
1196 SCM_DEFINE (scm_string_length
, "string-length", 1, 0, 0,
1198 "Return the number of characters in @var{string}.")
1199 #define FUNC_NAME s_scm_string_length
1201 SCM_VALIDATE_STRING (1, string
);
1202 return scm_from_size_t (STRING_LENGTH (string
));
1206 SCM_DEFINE (scm_string_bytes_per_char
, "string-bytes-per-char", 1, 0, 0,
1208 "Return the bytes used to represent a character in @var{string}."
1209 "This will return 1 or 4.")
1210 #define FUNC_NAME s_scm_string_bytes_per_char
1212 SCM_VALIDATE_STRING (1, string
);
1213 if (!scm_i_is_narrow_string (string
))
1214 return scm_from_int (4);
1216 return scm_from_int (1);
1221 scm_c_string_length (SCM string
)
1223 if (!IS_STRING (string
))
1224 scm_wrong_type_arg_msg (NULL
, 0, string
, "string");
1225 return STRING_LENGTH (string
);
1228 SCM_DEFINE (scm_string_ref
, "string-ref", 2, 0, 0,
1230 "Return character @var{k} of @var{str} using zero-origin\n"
1231 "indexing. @var{k} must be a valid index of @var{str}.")
1232 #define FUNC_NAME s_scm_string_ref
1237 SCM_VALIDATE_STRING (1, str
);
1239 len
= scm_i_string_length (str
);
1240 if (SCM_LIKELY (len
> 0))
1241 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1243 scm_out_of_range (NULL
, k
);
1245 if (scm_i_is_narrow_string (str
))
1246 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[idx
]);
1248 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[idx
]);
1253 scm_c_string_ref (SCM str
, size_t p
)
1255 if (p
>= scm_i_string_length (str
))
1256 scm_out_of_range (NULL
, scm_from_size_t (p
));
1257 if (scm_i_is_narrow_string (str
))
1258 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[p
]);
1260 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[p
]);
1264 SCM_DEFINE (scm_string_set_x
, "string-set!", 3, 0, 0,
1265 (SCM str
, SCM k
, SCM chr
),
1266 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1267 "an unspecified value. @var{k} must be a valid index of\n"
1269 #define FUNC_NAME s_scm_string_set_x
1274 SCM_VALIDATE_STRING (1, str
);
1276 len
= scm_i_string_length (str
);
1277 if (SCM_LIKELY (len
> 0))
1278 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1280 scm_out_of_range (NULL
, k
);
1282 SCM_VALIDATE_CHAR (3, chr
);
1283 str
= scm_i_string_start_writing (str
);
1284 scm_i_string_set_x (str
, idx
, SCM_CHAR (chr
));
1285 scm_i_string_stop_writing ();
1287 return SCM_UNSPECIFIED
;
1292 scm_c_string_set_x (SCM str
, size_t p
, SCM chr
)
1294 if (p
>= scm_i_string_length (str
))
1295 scm_out_of_range (NULL
, scm_from_size_t (p
));
1296 str
= scm_i_string_start_writing (str
);
1297 scm_i_string_set_x (str
, p
, SCM_CHAR (chr
));
1298 scm_i_string_stop_writing ();
1301 SCM_DEFINE (scm_substring
, "substring", 2, 1, 0,
1302 (SCM str
, SCM start
, SCM end
),
1303 "Return a newly allocated string formed from the characters\n"
1304 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1305 "ending with index @var{end} (exclusive).\n"
1306 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1307 "exact integers satisfying:\n\n"
1308 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1309 #define FUNC_NAME s_scm_substring
1311 size_t len
, from
, to
;
1313 SCM_VALIDATE_STRING (1, str
);
1314 len
= scm_i_string_length (str
);
1315 from
= scm_to_unsigned_integer (start
, 0, len
);
1316 if (SCM_UNBNDP (end
))
1319 to
= scm_to_unsigned_integer (end
, from
, len
);
1320 return scm_i_substring (str
, from
, to
);
1324 SCM_DEFINE (scm_substring_read_only
, "substring/read-only", 2, 1, 0,
1325 (SCM str
, SCM start
, SCM end
),
1326 "Return a newly allocated string formed from the characters\n"
1327 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1328 "ending with index @var{end} (exclusive).\n"
1329 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1330 "exact integers satisfying:\n"
1332 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1334 "The returned string is read-only.\n")
1335 #define FUNC_NAME s_scm_substring_read_only
1337 size_t len
, from
, to
;
1339 SCM_VALIDATE_STRING (1, str
);
1340 len
= scm_i_string_length (str
);
1341 from
= scm_to_unsigned_integer (start
, 0, len
);
1342 if (SCM_UNBNDP (end
))
1345 to
= scm_to_unsigned_integer (end
, from
, len
);
1346 return scm_i_substring_read_only (str
, from
, to
);
1350 SCM_DEFINE (scm_substring_copy
, "substring/copy", 2, 1, 0,
1351 (SCM str
, SCM start
, SCM end
),
1352 "Return a newly allocated string formed from the characters\n"
1353 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1354 "ending with index @var{end} (exclusive).\n"
1355 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1356 "exact integers satisfying:\n\n"
1357 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1358 #define FUNC_NAME s_scm_substring_copy
1360 /* For the Scheme version, START is mandatory, but for the C
1361 version, it is optional. See scm_string_copy in srfi-13.c for a
1367 SCM_VALIDATE_STRING (1, str
);
1368 scm_i_get_substring_spec (scm_i_string_length (str
),
1369 start
, &from
, end
, &to
);
1370 return scm_i_substring_copy (str
, from
, to
);
1374 SCM_DEFINE (scm_substring_shared
, "substring/shared", 2, 1, 0,
1375 (SCM str
, SCM start
, SCM end
),
1376 "Return string that indirectly refers to the characters\n"
1377 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1378 "ending with index @var{end} (exclusive).\n"
1379 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1380 "exact integers satisfying:\n\n"
1381 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1382 #define FUNC_NAME s_scm_substring_shared
1384 size_t len
, from
, to
;
1386 SCM_VALIDATE_STRING (1, str
);
1387 len
= scm_i_string_length (str
);
1388 from
= scm_to_unsigned_integer (start
, 0, len
);
1389 if (SCM_UNBNDP (end
))
1392 to
= scm_to_unsigned_integer (end
, from
, len
);
1393 return scm_i_substring_shared (str
, from
, to
);
1397 SCM_DEFINE (scm_string_append
, "string-append", 0, 0, 1,
1399 "Return a newly allocated string whose characters form the\n"
1400 "concatenation of the given strings, @var{args}.")
1401 #define FUNC_NAME s_scm_string_append
1414 SCM_VALIDATE_REST_ARGUMENT (args
);
1415 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1418 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1419 len
+= scm_i_string_length (s
);
1420 if (!scm_i_is_narrow_string (s
))
1425 res
= scm_i_make_string (len
, &data
.narrow
, 0);
1427 res
= scm_i_make_wide_string (len
, &data
.wide
, 0);
1429 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1433 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1434 len
= scm_i_string_length (s
);
1437 memcpy (data
.narrow
, scm_i_string_chars (s
), len
);
1442 if (scm_i_is_narrow_string (s
))
1444 for (i
= 0; i
< scm_i_string_length (s
); i
++)
1445 data
.wide
[i
] = (unsigned char) scm_i_string_chars (s
)[i
];
1448 u32_cpy ((scm_t_uint32
*) data
.wide
,
1449 (scm_t_uint32
*) scm_i_string_wide_chars (s
), len
);
1452 scm_remember_upto_here_1 (s
);
1460 /* Charset conversion error handling. */
1462 SCM_SYMBOL (scm_encoding_error_key
, "encoding-error");
1463 SCM_SYMBOL (scm_decoding_error_key
, "decoding-error");
1465 /* Raise an exception informing that character CHR could not be written
1466 to PORT in its current encoding. */
1468 scm_encoding_error (const char *subr
, int err
, const char *message
,
1471 scm_throw (scm_encoding_error_key
,
1472 scm_list_n (scm_from_latin1_string (subr
),
1473 scm_from_latin1_string (message
),
1479 /* Raise an exception informing of an encoding error on PORT. This
1480 means that a character could not be written in PORT's encoding. */
1482 scm_decoding_error (const char *subr
, int err
, const char *message
, SCM port
)
1484 scm_throw (scm_decoding_error_key
,
1485 scm_list_n (scm_from_latin1_string (subr
),
1486 scm_from_latin1_string (message
),
1493 /* String conversion to/from C. */
1496 decoding_error (const char *func_name
, int errno_save
,
1497 const char *str
, size_t len
)
1499 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1504 buf
= scm_gc_malloc_pointerless (len
, "bytevector");
1505 memcpy (buf
, str
, len
);
1506 bv
= scm_c_take_gc_bytevector (buf
, len
, SCM_BOOL_F
);
1508 scm_decoding_error (func_name
, errno_save
,
1509 "input locale conversion error", bv
);
1513 scm_from_stringn (const char *str
, size_t len
, const char *encoding
,
1514 scm_t_string_failed_conversion_handler handler
)
1521 /* The order of these checks is important. */
1522 if (!str
&& len
!= 0)
1523 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL
);
1524 if (len
== (size_t) -1)
1527 if (encoding
== NULL
|| len
== 0)
1528 return scm_from_latin1_stringn (str
, len
);
1529 else if (strcmp (encoding
, "UTF-8") == 0)
1530 return scm_from_utf8_stringn (str
, len
);
1533 u32
= (scm_t_wchar
*) u32_conv_from_encoding (encoding
,
1534 (enum iconv_ilseq_handler
)
1540 if (SCM_UNLIKELY (u32
== NULL
))
1541 decoding_error (__func__
, errno
, str
, len
);
1545 if (u32
[i
++] > 0xFF)
1554 res
= scm_i_make_string (u32len
, &dst
, 0);
1555 for (i
= 0; i
< u32len
; i
++)
1556 dst
[i
] = (unsigned char) u32
[i
];
1562 res
= scm_i_make_wide_string (u32len
, &wdst
, 0);
1563 u32_cpy ((scm_t_uint32
*) wdst
, (scm_t_uint32
*) u32
, u32len
);
1572 scm_from_locale_string (const char *str
)
1574 return scm_from_locale_stringn (str
, -1);
1578 scm_from_locale_stringn (const char *str
, size_t len
)
1580 return scm_from_stringn (str
, len
, locale_charset (),
1581 scm_i_get_conversion_strategy (SCM_BOOL_F
));
1585 scm_from_latin1_string (const char *str
)
1587 return scm_from_latin1_stringn (str
, -1);
1591 scm_from_latin1_stringn (const char *str
, size_t len
)
1596 if (len
== (size_t) -1)
1599 /* Make a narrow string and copy STR as is. */
1600 result
= scm_i_make_string (len
, &buf
, 0);
1601 memcpy (buf
, str
, len
);
1607 scm_from_utf8_string (const char *str
)
1609 return scm_from_utf8_stringn (str
, -1);
1613 scm_from_utf8_stringn (const char *str
, size_t len
)
1616 const scm_t_uint8
*ustr
= (const scm_t_uint8
*) str
;
1617 int ascii
= 1, narrow
= 1;
1620 if (len
== (size_t) -1)
1640 nbytes
= u8_mbtouc (&c
, ustr
+ i
, len
- i
);
1644 decoding_error (__func__
, errno
, str
, len
);
1657 res
= scm_i_make_string (char_len
, &dst
, 0);
1658 memcpy (dst
, str
, len
);
1666 res
= scm_i_make_string (char_len
, &dst
, 0);
1668 for (i
= 0, j
= 0; i
< len
; j
++)
1670 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1671 dst
[j
] = (signed char) c
;
1680 res
= scm_i_make_wide_string (char_len
, &dst
, 0);
1682 for (i
= 0, j
= 0; i
< len
; j
++)
1684 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1693 scm_from_utf32_string (const scm_t_wchar
*str
)
1695 return scm_from_utf32_stringn (str
, -1);
1699 scm_from_utf32_stringn (const scm_t_wchar
*str
, size_t len
)
1704 if (len
== (size_t) -1)
1705 len
= u32_strlen ((uint32_t *) str
);
1707 result
= scm_i_make_wide_string (len
, &buf
, 0);
1708 memcpy (buf
, str
, len
* sizeof (scm_t_wchar
));
1709 scm_i_try_narrow_string (result
);
1714 /* Create a new scheme string from the C string STR. The memory of
1715 STR may be used directly as storage for the new string. */
1716 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1717 would be to register a finalizer to eventually free(3) STR, which isn't
1718 worth it. Should we just deprecate the `scm_take_' functions? */
1720 scm_take_locale_stringn (char *str
, size_t len
)
1724 res
= scm_from_locale_stringn (str
, len
);
1731 scm_take_locale_string (char *str
)
1733 return scm_take_locale_stringn (str
, -1);
1736 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1737 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1738 Set *LENP to the size of the resulting string.
1740 FIXME: This is a hack we should get rid of. See
1741 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1744 unistring_escapes_to_guile_escapes (char *buf
, size_t *lenp
)
1746 char *before
, *after
;
1755 if ((i
<= *lenp
- 6)
1756 && before
[i
] == '\\'
1757 && before
[i
+ 1] == 'u'
1758 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1760 /* Convert \u00NN to \xNN */
1763 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1764 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1768 else if ((i
<= *lenp
- 10)
1769 && before
[i
] == '\\'
1770 && before
[i
+ 1] == 'U'
1771 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1773 /* Convert \U00NNNNNN to \UNNNNNN */
1776 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1777 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1778 after
[j
+ 4] = tolower ((int) before
[i
+ 6]);
1779 after
[j
+ 5] = tolower ((int) before
[i
+ 7]);
1780 after
[j
+ 6] = tolower ((int) before
[i
+ 8]);
1781 after
[j
+ 7] = tolower ((int) before
[i
+ 9]);
1787 after
[j
] = before
[i
];
1795 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1796 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1797 of the resulting string. BUF must be large enough to handle the
1798 worst case when `\uXXXX' escapes (6 characters) are replaced by
1799 `\xXXXX;' (7 characters). */
1801 unistring_escapes_to_r6rs_escapes (char *buf
, size_t *lenp
)
1803 char *before
, *after
;
1805 /* The worst case is if the input string contains all 4-digit hex escapes.
1806 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1807 size_t max_out_len
= (*lenp
* 7) / 6 + 1;
1808 size_t nzeros
, ndigits
;
1811 after
= alloca (max_out_len
);
1816 if (((i
<= *lenp
- 6) && before
[i
] == '\\' && before
[i
+ 1] == 'u')
1817 || ((i
<= *lenp
- 10) && before
[i
] == '\\' && before
[i
+ 1] == 'U'))
1819 if (before
[i
+ 1] == 'u')
1821 else if (before
[i
+ 1] == 'U')
1826 /* Add the R6RS hex escape initial sequence. */
1830 /* Move string positions to the start of the hex numbers. */
1834 /* Find the number of initial zeros in this hex number. */
1836 while (before
[i
+ nzeros
] == '0' && nzeros
< ndigits
)
1839 /* Copy the number, skipping initial zeros, and then move the string
1841 if (nzeros
== ndigits
)
1850 for (pos
= 0; pos
< ndigits
- nzeros
; pos
++)
1851 after
[j
+ pos
] = tolower ((int) before
[i
+ nzeros
+ pos
]);
1853 j
+= (ndigits
- nzeros
);
1856 /* Add terminating semicolon. */
1862 after
[j
] = before
[i
];
1868 memcpy (before
, after
, j
);
1872 scm_to_locale_string (SCM str
)
1874 return scm_to_locale_stringn (str
, NULL
);
1878 scm_to_locale_stringn (SCM str
, size_t *lenp
)
1880 return scm_to_stringn (str
, lenp
,
1882 scm_i_get_conversion_strategy (SCM_BOOL_F
));
1886 scm_to_latin1_string (SCM str
)
1888 return scm_to_latin1_stringn (str
, NULL
);
1892 scm_to_latin1_stringn (SCM str
, size_t *lenp
)
1893 #define FUNC_NAME "scm_to_latin1_stringn"
1897 SCM_VALIDATE_STRING (1, str
);
1899 if (scm_i_is_narrow_string (str
))
1901 size_t len
= scm_i_string_length (str
);
1906 result
= scm_strndup (scm_i_string_data (str
), len
);
1909 result
= scm_to_stringn (str
, lenp
, NULL
,
1910 SCM_FAILED_CONVERSION_ERROR
);
1917 scm_to_utf8_string (SCM str
)
1919 return scm_to_utf8_stringn (str
, NULL
);
1923 latin1_u8_strlen (const scm_t_uint8
*str
, size_t len
)
1926 for (i
= 0, ret
= 0; i
< len
; i
++)
1927 ret
+= (str
[i
] < 128) ? 1 : 2;
1932 latin1_to_u8 (const scm_t_uint8
*str
, size_t latin_len
,
1933 scm_t_uint8
*u8_result
, size_t *u8_lenp
)
1936 size_t u8_len
= latin1_u8_strlen (str
, latin_len
);
1938 if (!(u8_result
&& u8_lenp
&& *u8_lenp
> u8_len
))
1939 u8_result
= scm_malloc (u8_len
+ 1);
1943 for (i
= 0, n
= 0; i
< latin_len
; i
++)
1944 n
+= u8_uctomb (u8_result
+ n
, str
[i
], u8_len
- n
);
1953 scm_to_utf8_stringn (SCM str
, size_t *lenp
)
1955 if (scm_i_is_narrow_string (str
))
1956 return (char *) latin1_to_u8 ((scm_t_uint8
*) scm_i_string_chars (str
),
1957 scm_i_string_length (str
),
1960 return (char *) u32_to_u8 ((scm_t_uint32
*)scm_i_string_wide_chars (str
),
1961 scm_i_string_length (str
),
1966 scm_to_utf32_string (SCM str
)
1968 return scm_to_utf32_stringn (str
, NULL
);
1972 scm_to_utf32_stringn (SCM str
, size_t *lenp
)
1973 #define FUNC_NAME "scm_to_utf32_stringn"
1975 scm_t_wchar
*result
;
1977 SCM_VALIDATE_STRING (1, str
);
1979 if (scm_i_is_narrow_string (str
))
1981 scm_t_uint8
*codepoints
;
1984 codepoints
= (scm_t_uint8
*) scm_i_string_chars (str
);
1985 len
= scm_i_string_length (str
);
1989 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
1990 for (i
= 0; i
< len
; i
++)
1991 result
[i
] = codepoints
[i
];
1998 len
= scm_i_string_length (str
);
2002 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
2003 memcpy (result
, scm_i_string_wide_chars (str
),
2004 len
* sizeof (scm_t_wchar
));
2012 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2013 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2014 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2015 defined by HANDLER. */
2017 scm_to_stringn (SCM str
, size_t *lenp
, const char *encoding
,
2018 scm_t_string_failed_conversion_handler handler
)
2021 size_t ilen
, len
, i
;
2025 if (!scm_is_string (str
))
2026 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
2027 ilen
= scm_i_string_length (str
);
2031 buf
= scm_malloc (1);
2039 for (i
= 0; i
< ilen
; i
++)
2040 if (scm_i_string_ref (str
, i
) == '\0')
2041 scm_misc_error (NULL
,
2042 "string contains #\\nul character: ~S",
2045 if (scm_i_is_narrow_string (str
) && (encoding
== NULL
))
2047 /* If using native Latin-1 encoding, just copy the string
2051 buf
= scm_malloc (ilen
);
2052 memcpy (buf
, scm_i_string_chars (str
), ilen
);
2058 buf
= scm_malloc (ilen
+ 1);
2059 memcpy (buf
, scm_i_string_chars (str
), ilen
);
2071 if (scm_i_is_narrow_string (str
))
2073 ret
= mem_iconveh (scm_i_string_chars (str
), ilen
,
2075 (enum iconv_ilseq_handler
) handler
, NULL
,
2079 scm_encoding_error (__func__
, errno
,
2080 "cannot convert narrow string to output locale",
2082 /* FIXME: Faulty character unknown. */
2087 buf
= u32_conv_to_encoding (enc
,
2088 (enum iconv_ilseq_handler
) handler
,
2089 (scm_t_uint32
*) scm_i_string_wide_chars (str
),
2094 scm_encoding_error (__func__
, errno
,
2095 "cannot convert wide string to output locale",
2097 /* FIXME: Faulty character unknown. */
2100 if (handler
== SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE
)
2102 if (SCM_R6RS_ESCAPES_P
)
2104 /* The worst case is if the input string contains all 4-digit
2105 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2106 (seven characters). Make BUF large enough to hold
2108 buf
= scm_realloc (buf
, (len
* 7) / 6 + 1);
2109 unistring_escapes_to_r6rs_escapes (buf
, &len
);
2112 unistring_escapes_to_guile_escapes (buf
, &len
);
2114 buf
= scm_realloc (buf
, len
);
2120 buf
= scm_realloc (buf
, len
+ 1);
2124 scm_remember_upto_here_1 (str
);
2129 scm_to_locale_stringbuf (SCM str
, char *buf
, size_t max_len
)
2132 char *result
= NULL
;
2133 if (!scm_is_string (str
))
2134 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
2135 result
= scm_to_locale_stringn (str
, &len
);
2137 memcpy (buf
, result
, (len
> max_len
) ? max_len
: len
);
2140 scm_remember_upto_here_1 (str
);
2145 /* Unicode string normalization. */
2147 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2148 libguile/i18n.c. It would be useful to have this factored out into a more
2149 convenient location, but its use of alloca makes that tricky to do. */
2152 normalize_str (SCM string
, uninorm_t form
)
2155 scm_t_uint32
*w_str
;
2157 size_t rlen
, len
= scm_i_string_length (string
);
2159 if (scm_i_is_narrow_string (string
))
2162 const char *buf
= scm_i_string_chars (string
);
2164 w_str
= alloca (sizeof (scm_t_wchar
) * (len
+ 1));
2166 for (i
= 0; i
< len
; i
++)
2167 w_str
[i
] = (unsigned char) buf
[i
];
2171 w_str
= (scm_t_uint32
*) scm_i_string_wide_chars (string
);
2173 w_str
= u32_normalize (form
, w_str
, len
, NULL
, &rlen
);
2175 ret
= scm_i_make_wide_string (rlen
, &cbuf
, 0);
2176 u32_cpy ((scm_t_uint32
*) cbuf
, w_str
, rlen
);
2179 scm_i_try_narrow_string (ret
);
2184 SCM_DEFINE (scm_string_normalize_nfc
, "string-normalize-nfc", 1, 0, 0,
2186 "Returns the NFC normalized form of @var{string}.")
2187 #define FUNC_NAME s_scm_string_normalize_nfc
2189 SCM_VALIDATE_STRING (1, string
);
2190 return normalize_str (string
, UNINORM_NFC
);
2194 SCM_DEFINE (scm_string_normalize_nfd
, "string-normalize-nfd", 1, 0, 0,
2196 "Returns the NFD normalized form of @var{string}.")
2197 #define FUNC_NAME s_scm_string_normalize_nfd
2199 SCM_VALIDATE_STRING (1, string
);
2200 return normalize_str (string
, UNINORM_NFD
);
2204 SCM_DEFINE (scm_string_normalize_nfkc
, "string-normalize-nfkc", 1, 0, 0,
2206 "Returns the NFKC normalized form of @var{string}.")
2207 #define FUNC_NAME s_scm_string_normalize_nfkc
2209 SCM_VALIDATE_STRING (1, string
);
2210 return normalize_str (string
, UNINORM_NFKC
);
2214 SCM_DEFINE (scm_string_normalize_nfkd
, "string-normalize-nfkd", 1, 0, 0,
2216 "Returns the NFKD normalized form of @var{string}.")
2217 #define FUNC_NAME s_scm_string_normalize_nfkd
2219 SCM_VALIDATE_STRING (1, string
);
2220 return normalize_str (string
, UNINORM_NFKD
);
2224 /* converts C scm_array of strings to SCM scm_list of strings.
2225 If argc < 0, a null terminated scm_array is assumed.
2226 The current locale encoding is assumed */
2228 scm_makfromstrs (int argc
, char **argv
)
2233 for (i
= 0; argv
[i
]; i
++);
2235 lst
= scm_cons (scm_from_locale_string (argv
[i
]), lst
);
2239 /* Return a newly allocated array of char pointers to each of the strings
2240 in args, with a terminating NULL pointer. The strings are encoded using
2241 the current locale. */
2244 scm_i_allocate_string_pointers (SCM list
)
2245 #define FUNC_NAME "scm_i_allocate_string_pointers"
2248 int list_len
= scm_ilength (list
);
2252 scm_wrong_type_arg_msg (NULL
, 0, list
, "proper list");
2254 result
= scm_gc_malloc ((list_len
+ 1) * sizeof (char *),
2256 result
[list_len
] = NULL
;
2258 /* The list might have been modified in another thread, so
2259 we check LIST before each access.
2261 for (i
= 0; i
< list_len
&& scm_is_pair (list
); i
++)
2263 SCM str
= SCM_CAR (list
);
2264 size_t len
; /* String length in bytes */
2265 char *c_str
= scm_to_locale_stringn (str
, &len
);
2267 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2268 scm_malloc to allocate the returned string, which must be
2269 explicitly deallocated. This forces us to copy the string a
2270 second time into a new buffer. Ideally there would be variants
2271 of scm_to_*_stringn that can return garbage-collected buffers. */
2273 result
[i
] = scm_gc_malloc_pointerless (len
+ 1, "string");
2274 memcpy (result
[i
], c_str
, len
);
2275 result
[i
][len
] = '\0';
2278 list
= SCM_CDR (list
);
2286 scm_i_get_substring_spec (size_t len
,
2287 SCM start
, size_t *cstart
,
2288 SCM end
, size_t *cend
)
2290 if (SCM_UNBNDP (start
))
2293 *cstart
= scm_to_unsigned_integer (start
, 0, len
);
2295 if (SCM_UNBNDP (end
))
2298 *cend
= scm_to_unsigned_integer (end
, *cstart
, len
);
2302 string_handle_ref (scm_t_array_handle
*h
, size_t index
)
2304 return scm_c_string_ref (h
->array
, index
);
2308 string_handle_set (scm_t_array_handle
*h
, size_t index
, SCM val
)
2310 scm_c_string_set_x (h
->array
, index
, val
);
2314 string_get_handle (SCM v
, scm_t_array_handle
*h
)
2320 h
->dim0
.ubnd
= scm_c_string_length (v
) - 1;
2322 h
->element_type
= SCM_ARRAY_ELEMENT_TYPE_CHAR
;
2323 h
->elements
= h
->writable_elements
= NULL
;
2326 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string
, 0x7f,
2327 string_handle_ref
, string_handle_set
,
2329 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR
, scm_make_string
)
2334 scm_nullstr
= scm_i_make_string (0, NULL
, 0);
2336 #include "libguile/strings.x"