1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
32 #include <c-strcase.h>
34 #include "striconveh.h"
36 #include "libguile/_scm.h"
37 #include "libguile/chars.h"
38 #include "libguile/root.h"
39 #include "libguile/strings.h"
40 #include "libguile/ports.h"
41 #include "libguile/ports-internal.h"
42 #include "libguile/error.h"
43 #include "libguile/generalized-vectors.h"
44 #include "libguile/deprecation.h"
45 #include "libguile/validate.h"
46 #include "libguile/private-options.h"
56 * XXX - keeping an accurate refcount during GC seems to be quite
57 * tricky, so we just keep score of whether a stringbuf might be
58 * shared, not whether it definitely is.
60 * The scheme I (mvo) tried to keep an accurate reference count would
61 * recount all strings that point to a stringbuf during the mark-phase
62 * of the GC. This was done since one cannot access the stringbuf of
63 * a string when that string is freed (in order to decrease the
64 * reference count). The memory of the stringbuf might have been
65 * reused already for something completely different.
67 * This recounted worked for a small number of threads beating on
68 * cow-strings, but it failed randomly with more than 10 threads, say.
69 * I couldn't figure out what went wrong, so I used the conservative
70 * approach implemented below.
72 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
73 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
77 /* The size in words of the stringbuf header (type tag + size). */
78 #define STRINGBUF_HEADER_SIZE 2U
80 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
82 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
83 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
85 #define STRINGBUF_TAG scm_tc7_stringbuf
86 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
87 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
89 #define STRINGBUF_CONTENTS(buf) ((void *) \
90 SCM_CELL_OBJECT_LOC (buf, \
91 STRINGBUF_HEADER_SIZE))
92 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
93 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
95 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
97 #define SET_STRINGBUF_SHARED(buf) \
100 /* Don't modify BUF if it's already marked as shared since it might be \
101 a read-only, statically allocated stringbuf. */ \
102 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
103 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
107 #ifdef SCM_STRING_LENGTH_HISTOGRAM
108 static size_t lenhist
[1001];
111 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
114 make_stringbuf (size_t len
)
116 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
117 scm_i_symbol_chars, all stringbufs are null-terminated. Once
118 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
119 has been changed for scm_i_symbol_chars, this null-termination
125 #ifdef SCM_STRING_LENGTH_HISTOGRAM
132 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ len
+ 1,
135 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
);
136 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
138 STRINGBUF_CHARS (buf
)[len
] = 0;
143 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
146 make_wide_stringbuf (size_t len
)
151 #ifdef SCM_STRING_LENGTH_HISTOGRAM
158 raw_len
= (len
+ 1) * sizeof (scm_t_wchar
);
159 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ raw_len
,
162 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
| STRINGBUF_F_WIDE
);
163 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
165 STRINGBUF_WIDE_CHARS (buf
)[len
] = 0;
170 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
171 characters from BUF. */
173 wide_stringbuf (SCM buf
)
177 if (STRINGBUF_WIDE (buf
))
184 len
= STRINGBUF_LENGTH (buf
);
186 new_buf
= make_wide_stringbuf (len
);
188 mem
= STRINGBUF_WIDE_CHARS (new_buf
);
189 for (i
= 0; i
< len
; i
++)
190 mem
[i
] = (scm_t_wchar
) STRINGBUF_CHARS (buf
)[i
];
197 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
198 characters from BUF, if possible. */
200 narrow_stringbuf (SCM buf
)
204 if (!STRINGBUF_WIDE (buf
))
212 len
= STRINGBUF_LENGTH (buf
);
213 wmem
= STRINGBUF_WIDE_CHARS (buf
);
215 for (i
= 0; i
< len
; i
++)
217 /* BUF cannot be narrowed. */
220 new_buf
= make_stringbuf (len
);
222 mem
= STRINGBUF_CHARS (new_buf
);
223 for (i
= 0; i
< len
; i
++)
224 mem
[i
] = (unsigned char) wmem
[i
];
231 scm_i_pthread_mutex_t stringbuf_write_mutex
= SCM_I_PTHREAD_MUTEX_INITIALIZER
;
234 /* Copy-on-write strings.
237 #define STRING_TAG scm_tc7_string
239 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
240 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
241 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
243 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
244 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
246 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
248 /* Read-only strings.
251 #define RO_STRING_TAG scm_tc7_ro_string
252 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
254 /* Mutation-sharing substrings
257 #define SH_STRING_TAG (scm_tc7_string + 0x100)
259 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
260 /* START and LENGTH as for STRINGs. */
262 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
265 scm_i_print_stringbuf (SCM exp
, SCM port
, scm_print_state
*pstate
)
269 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
270 SET_STRINGBUF_SHARED (exp
);
271 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
273 str
= scm_double_cell (RO_STRING_TAG
, SCM_UNPACK(exp
),
274 0, STRINGBUF_LENGTH (exp
));
276 scm_puts ("#<stringbuf ", port
);
277 scm_iprin1 (str
, port
, pstate
);
278 scm_puts (">", port
);
283 static SCM null_stringbuf
;
286 init_null_stringbuf (void)
288 null_stringbuf
= make_stringbuf (0);
289 SET_STRINGBUF_SHARED (null_stringbuf
);
292 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
293 characters. CHARSP, if not NULL, will be set to location of the
294 char array. If READ_ONLY_P, the returned string is read-only;
295 otherwise it is writable. */
297 scm_i_make_string (size_t len
, char **charsp
, int read_only_p
)
304 static scm_i_pthread_once_t once
= SCM_I_PTHREAD_ONCE_INIT
;
305 scm_i_pthread_once (&once
, init_null_stringbuf
);
306 buf
= null_stringbuf
;
309 buf
= make_stringbuf (len
);
312 *charsp
= (char *) STRINGBUF_CHARS (buf
);
313 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
315 (scm_t_bits
) 0, (scm_t_bits
) len
);
319 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
320 characters. CHARSP, if not NULL, will be set to location of the
321 character array. If READ_ONLY_P, the returned string is read-only;
322 otherwise it is writable. */
324 scm_i_make_wide_string (size_t len
, scm_t_wchar
**charsp
, int read_only_p
)
326 SCM buf
= make_wide_stringbuf (len
);
329 *charsp
= STRINGBUF_WIDE_CHARS (buf
);
330 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
332 (scm_t_bits
) 0, (scm_t_bits
) len
);
337 validate_substring_args (SCM str
, size_t start
, size_t end
)
339 if (!IS_STRING (str
))
340 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
341 if (start
> STRING_LENGTH (str
))
342 scm_out_of_range (NULL
, scm_from_size_t (start
));
343 if (end
> STRING_LENGTH (str
) || end
< start
)
344 scm_out_of_range (NULL
, scm_from_size_t (end
));
348 get_str_buf_start (SCM
*str
, SCM
*buf
, size_t *start
)
350 *start
= STRING_START (*str
);
351 if (IS_SH_STRING (*str
))
353 *str
= SH_STRING_STRING (*str
);
354 *start
+= STRING_START (*str
);
356 *buf
= STRING_STRINGBUF (*str
);
360 scm_i_substring (SCM str
, size_t start
, size_t end
)
363 return scm_i_make_string (0, NULL
, 0);
368 get_str_buf_start (&str
, &buf
, &str_start
);
369 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
370 SET_STRINGBUF_SHARED (buf
);
371 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
372 return scm_double_cell (STRING_TAG
, SCM_UNPACK(buf
),
373 (scm_t_bits
)str_start
+ start
,
374 (scm_t_bits
) end
- start
);
379 scm_i_substring_read_only (SCM str
, size_t start
, size_t end
)
382 return scm_i_make_string (0, NULL
, 1);
387 get_str_buf_start (&str
, &buf
, &str_start
);
388 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
389 SET_STRINGBUF_SHARED (buf
);
390 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
391 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK(buf
),
392 (scm_t_bits
)str_start
+ start
,
393 (scm_t_bits
) end
- start
);
398 scm_i_substring_copy (SCM str
, size_t start
, size_t end
)
401 return scm_i_make_string (0, NULL
, 0);
404 size_t len
= end
- start
;
405 SCM buf
, my_buf
, substr
;
408 get_str_buf_start (&str
, &buf
, &str_start
);
409 if (scm_i_is_narrow_string (str
))
411 my_buf
= make_stringbuf (len
);
412 memcpy (STRINGBUF_CHARS (my_buf
),
413 STRINGBUF_CHARS (buf
) + str_start
+ start
, len
);
417 my_buf
= make_wide_stringbuf (len
);
418 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (my_buf
),
419 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
) + str_start
423 scm_remember_upto_here_1 (buf
);
424 substr
= scm_double_cell (STRING_TAG
, SCM_UNPACK (my_buf
),
425 (scm_t_bits
) 0, (scm_t_bits
) len
);
427 scm_i_try_narrow_string (substr
);
433 scm_i_substring_shared (SCM str
, size_t start
, size_t end
)
435 if (start
== 0 && end
== STRING_LENGTH (str
))
437 else if (start
== end
)
438 return scm_i_make_string (0, NULL
, 0);
441 size_t len
= end
- start
;
442 if (IS_SH_STRING (str
))
444 start
+= STRING_START (str
);
445 str
= SH_STRING_STRING (str
);
447 return scm_double_cell (SH_STRING_TAG
, SCM_UNPACK(str
),
448 (scm_t_bits
)start
, (scm_t_bits
) len
);
453 scm_c_substring (SCM str
, size_t start
, size_t end
)
455 validate_substring_args (str
, start
, end
);
456 return scm_i_substring (str
, start
, end
);
460 scm_c_substring_read_only (SCM str
, size_t start
, size_t end
)
462 validate_substring_args (str
, start
, end
);
463 return scm_i_substring_read_only (str
, start
, end
);
467 scm_c_substring_copy (SCM str
, size_t start
, size_t end
)
469 validate_substring_args (str
, start
, end
);
470 return scm_i_substring_copy (str
, start
, end
);
474 scm_c_substring_shared (SCM str
, size_t start
, size_t end
)
476 validate_substring_args (str
, start
, end
);
477 return scm_i_substring_shared (str
, start
, end
);
481 /* Internal accessors
484 /* Returns the number of characters in STR. This may be different
485 than the memory size of the string storage. */
487 scm_i_string_length (SCM str
)
489 return STRING_LENGTH (str
);
492 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
493 encoding. False if it is 'wide', having a 32-bit UCS-4
496 scm_i_is_narrow_string (SCM str
)
498 if (IS_SH_STRING (str
))
499 str
= SH_STRING_STRING (str
);
501 return !STRINGBUF_WIDE (STRING_STRINGBUF (str
));
504 /* Try to coerce a string to be narrow. It if is narrow already, do
505 nothing. If it is wide, shrink it to narrow if none of its
506 characters are above 0xFF. Return true if the string is narrow or
507 was made to be narrow. */
509 scm_i_try_narrow_string (SCM str
)
511 if (IS_SH_STRING (str
))
512 str
= SH_STRING_STRING (str
);
514 SET_STRING_STRINGBUF (str
, narrow_stringbuf (STRING_STRINGBUF (str
)));
516 return scm_i_is_narrow_string (str
);
519 /* Return a pointer to the raw data of the string, which can be either Latin-1
520 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
522 scm_i_string_data (SCM str
)
528 get_str_buf_start (&str
, &buf
, &start
);
530 data
= STRINGBUF_CONTENTS (buf
);
531 data
+= start
* (scm_i_is_narrow_string (str
) ? 1 : 4);
536 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
539 scm_i_string_chars (SCM str
)
543 get_str_buf_start (&str
, &buf
, &start
);
544 if (scm_i_is_narrow_string (str
))
545 return (const char *) STRINGBUF_CHARS (buf
) + start
;
547 scm_misc_error (NULL
, "Invalid read access of chars of wide string: ~s",
552 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
555 scm_i_string_wide_chars (SCM str
)
560 get_str_buf_start (&str
, &buf
, &start
);
561 if (!scm_i_is_narrow_string (str
))
562 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
) + start
;
564 scm_misc_error (NULL
, "Invalid read access of chars of narrow string: ~s",
568 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
569 a new string buffer, so that it can be modified without modifying
570 other strings. Also, lock the string mutex. Later, one must call
571 scm_i_string_stop_writing to unlock the mutex. */
573 scm_i_string_start_writing (SCM orig_str
)
575 SCM buf
, str
= orig_str
;
578 get_str_buf_start (&str
, &buf
, &start
);
579 if (IS_RO_STRING (str
))
580 scm_misc_error (NULL
, "string is read-only: ~s", scm_list_1 (orig_str
));
582 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
583 if (STRINGBUF_SHARED (buf
))
585 /* Clone the stringbuf. */
586 size_t len
= STRING_LENGTH (str
);
589 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
591 if (scm_i_is_narrow_string (str
))
593 new_buf
= make_stringbuf (len
);
594 memcpy (STRINGBUF_CHARS (new_buf
),
595 STRINGBUF_CHARS (buf
) + STRING_START (str
), len
);
600 new_buf
= make_wide_stringbuf (len
);
601 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
602 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
)
603 + STRING_START (str
)), len
);
606 SET_STRING_STRINGBUF (str
, new_buf
);
607 start
-= STRING_START (str
);
609 /* FIXME: The following operations are not atomic, so other threads
610 looking at STR may see an inconsistent state. Nevertheless it can't
611 hurt much since (i) accessing STR while it is being mutated can't
612 yield a crash, and (ii) concurrent accesses to STR should be
613 protected by a mutex at the application level. The latter may not
614 apply when STR != ORIG_STR, though. */
615 SET_STRING_START (str
, 0);
616 SET_STRING_STRINGBUF (str
, new_buf
);
620 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
625 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
627 scm_i_string_writable_chars (SCM str
)
632 get_str_buf_start (&str
, &buf
, &start
);
633 if (scm_i_is_narrow_string (str
))
634 return (char *) STRINGBUF_CHARS (buf
) + start
;
636 scm_misc_error (NULL
, "Invalid write access of chars of wide string: ~s",
641 /* Return a pointer to the UCS-4 codepoints of a string. */
643 scm_i_string_writable_wide_chars (SCM str
)
648 get_str_buf_start (&str
, &buf
, &start
);
649 if (!scm_i_is_narrow_string (str
))
650 return STRINGBUF_WIDE_CHARS (buf
) + start
;
652 scm_misc_error (NULL
, "Invalid write access of chars of narrow string: ~s",
656 /* Unlock the string mutex that was locked when
657 scm_i_string_start_writing was called. */
659 scm_i_string_stop_writing (void)
661 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
664 /* Return the Xth character of STR as a UCS-4 codepoint. */
666 scm_i_string_ref (SCM str
, size_t x
)
668 if (scm_i_is_narrow_string (str
))
669 return (scm_t_wchar
) (unsigned char) (scm_i_string_chars (str
)[x
]);
671 return scm_i_string_wide_chars (str
)[x
];
674 /* Returns index+1 of the first char in STR that matches C, or
675 0 if the char is not found. */
677 scm_i_string_contains_char (SCM str
, char ch
)
680 size_t len
= scm_i_string_length (str
);
683 if (scm_i_is_narrow_string (str
))
687 if (scm_i_string_chars (str
)[i
] == ch
)
696 if (scm_i_string_wide_chars (str
)[i
]
697 == (unsigned char) ch
)
706 scm_i_string_strcmp (SCM sstr
, size_t start_x
, const char *cstr
)
708 if (scm_i_is_narrow_string (sstr
))
710 const char *a
= scm_i_string_chars (sstr
) + start_x
;
711 const char *b
= cstr
;
712 return strncmp (a
, b
, strlen(b
));
717 const scm_t_wchar
*a
= scm_i_string_wide_chars (sstr
) + start_x
;
718 const char *b
= cstr
;
719 for (i
= 0; i
< strlen (b
); i
++)
721 if (a
[i
] != (unsigned char) b
[i
])
728 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
730 scm_i_string_set_x (SCM str
, size_t p
, scm_t_wchar chr
)
732 if (IS_SH_STRING (str
))
734 p
+= STRING_START (str
);
735 str
= SH_STRING_STRING (str
);
738 if (chr
> 0xFF && scm_i_is_narrow_string (str
))
739 SET_STRING_STRINGBUF (str
, wide_stringbuf (STRING_STRINGBUF (str
)));
741 if (scm_i_is_narrow_string (str
))
743 char *dst
= scm_i_string_writable_chars (str
);
748 scm_t_wchar
*dst
= scm_i_string_writable_wide_chars (str
);
756 Basic symbol creation and accessing is done here, the rest is in
757 symbols.[hc]. This has been done to keep stringbufs and the
758 internals of strings and string-like objects confined to this file.
761 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
764 scm_i_make_symbol (SCM name
, scm_t_bits flags
,
765 unsigned long hash
, SCM props
)
768 size_t start
= STRING_START (name
);
769 size_t length
= STRING_LENGTH (name
);
771 if (IS_SH_STRING (name
))
773 name
= SH_STRING_STRING (name
);
774 start
+= STRING_START (name
);
776 buf
= STRING_STRINGBUF (name
);
778 if (start
== 0 && length
== STRINGBUF_LENGTH (buf
))
781 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
782 SET_STRINGBUF_SHARED (buf
);
783 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
788 if (scm_i_is_narrow_string (name
))
790 SCM new_buf
= make_stringbuf (length
);
791 memcpy (STRINGBUF_CHARS (new_buf
),
792 STRINGBUF_CHARS (buf
) + start
, length
);
797 SCM new_buf
= make_wide_stringbuf (length
);
798 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
799 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
) + start
,
804 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
805 (scm_t_bits
) hash
, SCM_UNPACK (props
));
809 scm_i_c_make_symbol (const char *name
, size_t len
,
810 scm_t_bits flags
, unsigned long hash
, SCM props
)
812 SCM buf
= make_stringbuf (len
);
813 memcpy (STRINGBUF_CHARS (buf
), name
, len
);
815 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
816 (scm_t_bits
) hash
, SCM_UNPACK (props
));
819 /* Returns the number of characters in SYM. This may be different
820 from the memory size of SYM. */
822 scm_i_symbol_length (SCM sym
)
824 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
828 scm_c_symbol_length (SCM sym
)
829 #define FUNC_NAME "scm_c_symbol_length"
831 SCM_VALIDATE_SYMBOL (1, sym
);
833 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
837 /* True if the name of SYM is stored as a Latin-1 encoded string.
838 False if it is stored as a 32-bit UCS-4-encoded string. */
840 scm_i_is_narrow_symbol (SCM sym
)
844 buf
= SYMBOL_STRINGBUF (sym
);
845 return !STRINGBUF_WIDE (buf
);
848 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
849 contains the name of SYM. */
851 scm_i_symbol_chars (SCM sym
)
855 buf
= SYMBOL_STRINGBUF (sym
);
856 if (!STRINGBUF_WIDE (buf
))
857 return (const char *) STRINGBUF_CHARS (buf
);
859 scm_misc_error (NULL
, "Invalid access of chars of a wide symbol ~S",
863 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
866 scm_i_symbol_wide_chars (SCM sym
)
870 buf
= SYMBOL_STRINGBUF (sym
);
871 if (STRINGBUF_WIDE (buf
))
872 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
);
874 scm_misc_error (NULL
, "Invalid access of chars of a narrow symbol ~S",
879 scm_i_symbol_substring (SCM sym
, size_t start
, size_t end
)
881 SCM buf
= SYMBOL_STRINGBUF (sym
);
882 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
883 SET_STRINGBUF_SHARED (buf
);
884 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
885 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK (buf
),
886 (scm_t_bits
)start
, (scm_t_bits
) end
- start
);
889 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
891 scm_i_symbol_ref (SCM sym
, size_t x
)
893 if (scm_i_is_narrow_symbol (sym
))
894 return (scm_t_wchar
) (unsigned char) (scm_i_symbol_chars (sym
)[x
]);
896 return scm_i_symbol_wide_chars (sym
)[x
];
902 SCM_DEFINE (scm_sys_string_dump
, "%string-dump", 1, 0, 0, (SCM str
),
903 "Returns an association list containing debugging information\n"
904 "for @var{str}. The association list has the following entries."
907 "The string itself.\n"
909 "The start index of the string into its stringbuf\n"
911 "The length of the string\n"
913 "If this string is a substring, it returns its parent string.\n"
914 "Otherwise, it returns @code{#f}\n"
916 "@code{#t} if the string is read-only\n"
917 "@item stringbuf-chars\n"
918 "A new string containing this string's stringbuf's characters\n"
919 "@item stringbuf-length\n"
920 "The number of characters in this stringbuf\n"
921 "@item stringbuf-shared\n"
922 "@code{#t} if this stringbuf is shared\n"
923 "@item stringbuf-wide\n"
924 "@code{#t} if this stringbuf's characters are stored in a\n"
925 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
928 #define FUNC_NAME s_scm_sys_string_dump
930 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
;
932 SCM_VALIDATE_STRING (1, str
);
935 e1
= scm_cons (scm_from_latin1_symbol ("string"),
937 e2
= scm_cons (scm_from_latin1_symbol ("start"),
938 scm_from_size_t (STRING_START (str
)));
939 e3
= scm_cons (scm_from_latin1_symbol ("length"),
940 scm_from_size_t (STRING_LENGTH (str
)));
942 if (IS_SH_STRING (str
))
944 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
945 SH_STRING_STRING (str
));
946 buf
= STRING_STRINGBUF (SH_STRING_STRING (str
));
950 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
952 buf
= STRING_STRINGBUF (str
);
955 if (IS_RO_STRING (str
))
956 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
959 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
963 if (!STRINGBUF_WIDE (buf
))
965 size_t len
= STRINGBUF_LENGTH (buf
);
967 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
968 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
969 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
974 size_t len
= STRINGBUF_LENGTH (buf
);
976 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
977 u32_cpy ((scm_t_uint32
*) cbuf
,
978 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
979 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
982 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
983 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
984 if (STRINGBUF_SHARED (buf
))
985 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
988 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
990 if (STRINGBUF_WIDE (buf
))
991 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
994 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
997 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
, SCM_UNDEFINED
);
1001 SCM_DEFINE (scm_sys_symbol_dump
, "%symbol-dump", 1, 0, 0, (SCM sym
),
1002 "Returns an association list containing debugging information\n"
1003 "for @var{sym}. The association list has the following entries."
1006 "The symbol itself\n"
1010 "@code{#t} if it is an interned symbol\n"
1011 "@item stringbuf-chars\n"
1012 "A new string containing this symbols's stringbuf's characters\n"
1013 "@item stringbuf-length\n"
1014 "The number of characters in this stringbuf\n"
1015 "@item stringbuf-shared\n"
1016 "@code{#t} if this stringbuf is shared\n"
1017 "@item stringbuf-wide\n"
1018 "@code{#t} if this stringbuf's characters are stored in a\n"
1019 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1022 #define FUNC_NAME s_scm_sys_symbol_dump
1024 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
;
1026 SCM_VALIDATE_SYMBOL (1, sym
);
1027 e1
= scm_cons (scm_from_latin1_symbol ("symbol"),
1029 e2
= scm_cons (scm_from_latin1_symbol ("hash"),
1030 scm_from_ulong (scm_i_symbol_hash (sym
)));
1031 e3
= scm_cons (scm_from_latin1_symbol ("interned"),
1032 scm_symbol_interned_p (sym
));
1033 buf
= SYMBOL_STRINGBUF (sym
);
1035 /* Stringbuf info */
1036 if (!STRINGBUF_WIDE (buf
))
1038 size_t len
= STRINGBUF_LENGTH (buf
);
1040 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
1041 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
1042 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1047 size_t len
= STRINGBUF_LENGTH (buf
);
1049 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
1050 u32_cpy ((scm_t_uint32
*) cbuf
,
1051 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
1052 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1055 e5
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1056 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
1057 if (STRINGBUF_SHARED (buf
))
1058 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1061 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1063 if (STRINGBUF_WIDE (buf
))
1064 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1067 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1069 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, SCM_UNDEFINED
);
1074 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1076 SCM_DEFINE (scm_sys_stringbuf_hist
, "%stringbuf-hist", 0, 0, 0, (void), "")
1077 #define FUNC_NAME s_scm_sys_stringbuf_hist
1080 for (i
= 0; i
< 1000; i
++)
1082 fprintf (stderr
, " %3d: %u\n", i
, lenhist
[i
]);
1083 fprintf (stderr
, ">999: %u\n", lenhist
[1000]);
1084 return SCM_UNSPECIFIED
;
1092 SCM_DEFINE (scm_string_p
, "string?", 1, 0, 0,
1094 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1095 #define FUNC_NAME s_scm_string_p
1097 return scm_from_bool (IS_STRING (obj
));
1102 SCM_REGISTER_PROC (s_scm_list_to_string
, "list->string", 1, 0, 0, scm_string
);
1104 SCM_DEFINE (scm_string
, "string", 0, 0, 1,
1106 "@deffnx {Scheme Procedure} list->string chrs\n"
1107 "Return a newly allocated string composed of the arguments,\n"
1109 #define FUNC_NAME s_scm_string
1111 SCM result
= SCM_BOOL_F
;
1118 /* Verify that this is a list of chars. */
1119 i
= scm_ilength (chrs
);
1120 SCM_ASSERT (i
>= 0, chrs
, SCM_ARG1
, FUNC_NAME
);
1125 while (len
> 0 && scm_is_pair (rest
))
1127 SCM elt
= SCM_CAR (rest
);
1128 SCM_VALIDATE_CHAR (SCM_ARGn
, elt
);
1129 if (SCM_CHAR (elt
) > 0xFF)
1131 rest
= SCM_CDR (rest
);
1133 scm_remember_upto_here_1 (elt
);
1136 /* Construct a string containing this list of chars. */
1144 result
= scm_i_make_string (len
, NULL
, 0);
1145 result
= scm_i_string_start_writing (result
);
1146 buf
= scm_i_string_writable_chars (result
);
1147 while (len
> 0 && scm_is_pair (rest
))
1149 SCM elt
= SCM_CAR (rest
);
1150 buf
[p
] = (unsigned char) SCM_CHAR (elt
);
1152 rest
= SCM_CDR (rest
);
1154 scm_remember_upto_here_1 (elt
);
1161 result
= scm_i_make_wide_string (len
, NULL
, 0);
1162 result
= scm_i_string_start_writing (result
);
1163 buf
= scm_i_string_writable_wide_chars (result
);
1164 while (len
> 0 && scm_is_pair (rest
))
1166 SCM elt
= SCM_CAR (rest
);
1167 buf
[p
] = SCM_CHAR (elt
);
1169 rest
= SCM_CDR (rest
);
1171 scm_remember_upto_here_1 (elt
);
1174 scm_i_string_stop_writing ();
1177 scm_misc_error (NULL
, "list changed while constructing string", SCM_EOL
);
1178 if (!scm_is_null (rest
))
1179 scm_wrong_type_arg_msg (NULL
, 0, chrs
, "proper list");
1185 SCM_DEFINE (scm_make_string
, "make-string", 1, 1, 0,
1187 "Return a newly allocated string of\n"
1188 "length @var{k}. If @var{chr} is given, then all elements of\n"
1189 "the string are initialized to @var{chr}, otherwise the contents\n"
1190 "of the string are all set to @code{#\nul}.")
1191 #define FUNC_NAME s_scm_make_string
1193 return scm_c_make_string (scm_to_size_t (k
), chr
);
1198 scm_c_make_string (size_t len
, SCM chr
)
1199 #define FUNC_NAME NULL
1202 char *contents
= NULL
;
1203 SCM res
= scm_i_make_string (len
, &contents
, 0);
1205 /* If no char is given, initialize string contents to NULL. */
1206 if (SCM_UNBNDP (chr
))
1207 memset (contents
, 0, len
);
1210 SCM_VALIDATE_CHAR (0, chr
);
1211 res
= scm_i_string_start_writing (res
);
1212 for (p
= 0; p
< len
; p
++)
1213 scm_i_string_set_x (res
, p
, SCM_CHAR (chr
));
1214 scm_i_string_stop_writing ();
1221 SCM_DEFINE (scm_string_length
, "string-length", 1, 0, 0,
1223 "Return the number of characters in @var{string}.")
1224 #define FUNC_NAME s_scm_string_length
1226 SCM_VALIDATE_STRING (1, string
);
1227 return scm_from_size_t (STRING_LENGTH (string
));
1231 SCM_DEFINE (scm_string_bytes_per_char
, "string-bytes-per-char", 1, 0, 0,
1233 "Return the bytes used to represent a character in @var{string}."
1234 "This will return 1 or 4.")
1235 #define FUNC_NAME s_scm_string_bytes_per_char
1237 SCM_VALIDATE_STRING (1, string
);
1238 if (!scm_i_is_narrow_string (string
))
1239 return scm_from_int (4);
1241 return scm_from_int (1);
1246 scm_c_string_length (SCM string
)
1248 if (!IS_STRING (string
))
1249 scm_wrong_type_arg_msg (NULL
, 0, string
, "string");
1250 return STRING_LENGTH (string
);
1253 SCM_DEFINE (scm_string_ref
, "string-ref", 2, 0, 0,
1255 "Return character @var{k} of @var{str} using zero-origin\n"
1256 "indexing. @var{k} must be a valid index of @var{str}.")
1257 #define FUNC_NAME s_scm_string_ref
1262 SCM_VALIDATE_STRING (1, str
);
1264 len
= scm_i_string_length (str
);
1265 if (SCM_LIKELY (len
> 0))
1266 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1268 scm_out_of_range (NULL
, k
);
1270 if (scm_i_is_narrow_string (str
))
1271 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[idx
]);
1273 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[idx
]);
1278 scm_c_string_ref (SCM str
, size_t p
)
1280 if (p
>= scm_i_string_length (str
))
1281 scm_out_of_range (NULL
, scm_from_size_t (p
));
1282 if (scm_i_is_narrow_string (str
))
1283 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[p
]);
1285 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[p
]);
1289 SCM_DEFINE (scm_string_set_x
, "string-set!", 3, 0, 0,
1290 (SCM str
, SCM k
, SCM chr
),
1291 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1292 "an unspecified value. @var{k} must be a valid index of\n"
1294 #define FUNC_NAME s_scm_string_set_x
1299 SCM_VALIDATE_STRING (1, str
);
1301 len
= scm_i_string_length (str
);
1302 if (SCM_LIKELY (len
> 0))
1303 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1305 scm_out_of_range (NULL
, k
);
1307 SCM_VALIDATE_CHAR (3, chr
);
1308 str
= scm_i_string_start_writing (str
);
1309 scm_i_string_set_x (str
, idx
, SCM_CHAR (chr
));
1310 scm_i_string_stop_writing ();
1312 return SCM_UNSPECIFIED
;
1317 scm_c_string_set_x (SCM str
, size_t p
, SCM chr
)
1319 if (p
>= scm_i_string_length (str
))
1320 scm_out_of_range (NULL
, scm_from_size_t (p
));
1321 str
= scm_i_string_start_writing (str
);
1322 scm_i_string_set_x (str
, p
, SCM_CHAR (chr
));
1323 scm_i_string_stop_writing ();
1326 SCM_DEFINE (scm_substring
, "substring", 2, 1, 0,
1327 (SCM str
, SCM start
, SCM end
),
1328 "Return a newly allocated string formed from the characters\n"
1329 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1330 "ending with index @var{end} (exclusive).\n"
1331 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1332 "exact integers satisfying:\n\n"
1333 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1334 #define FUNC_NAME s_scm_substring
1336 size_t len
, from
, to
;
1338 SCM_VALIDATE_STRING (1, str
);
1339 len
= scm_i_string_length (str
);
1340 from
= scm_to_unsigned_integer (start
, 0, len
);
1341 if (SCM_UNBNDP (end
))
1344 to
= scm_to_unsigned_integer (end
, from
, len
);
1345 return scm_i_substring (str
, from
, to
);
1349 SCM_DEFINE (scm_substring_read_only
, "substring/read-only", 2, 1, 0,
1350 (SCM str
, SCM start
, SCM end
),
1351 "Return a newly allocated string formed from the characters\n"
1352 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1353 "ending with index @var{end} (exclusive).\n"
1354 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1355 "exact integers satisfying:\n"
1357 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1359 "The returned string is read-only.\n")
1360 #define FUNC_NAME s_scm_substring_read_only
1362 size_t len
, from
, to
;
1364 SCM_VALIDATE_STRING (1, str
);
1365 len
= scm_i_string_length (str
);
1366 from
= scm_to_unsigned_integer (start
, 0, len
);
1367 if (SCM_UNBNDP (end
))
1370 to
= scm_to_unsigned_integer (end
, from
, len
);
1371 return scm_i_substring_read_only (str
, from
, to
);
1375 SCM_DEFINE (scm_substring_copy
, "substring/copy", 2, 1, 0,
1376 (SCM str
, SCM start
, SCM end
),
1377 "Return a newly allocated string formed from the characters\n"
1378 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1379 "ending with index @var{end} (exclusive).\n"
1380 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1381 "exact integers satisfying:\n\n"
1382 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1383 #define FUNC_NAME s_scm_substring_copy
1385 /* For the Scheme version, START is mandatory, but for the C
1386 version, it is optional. See scm_string_copy in srfi-13.c for a
1392 SCM_VALIDATE_STRING (1, str
);
1393 scm_i_get_substring_spec (scm_i_string_length (str
),
1394 start
, &from
, end
, &to
);
1395 return scm_i_substring_copy (str
, from
, to
);
1399 SCM_DEFINE (scm_substring_shared
, "substring/shared", 2, 1, 0,
1400 (SCM str
, SCM start
, SCM end
),
1401 "Return string that indirectly refers to the characters\n"
1402 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1403 "ending with index @var{end} (exclusive).\n"
1404 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1405 "exact integers satisfying:\n\n"
1406 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1407 #define FUNC_NAME s_scm_substring_shared
1409 size_t len
, from
, to
;
1411 SCM_VALIDATE_STRING (1, str
);
1412 len
= scm_i_string_length (str
);
1413 from
= scm_to_unsigned_integer (start
, 0, len
);
1414 if (SCM_UNBNDP (end
))
1417 to
= scm_to_unsigned_integer (end
, from
, len
);
1418 return scm_i_substring_shared (str
, from
, to
);
1422 SCM_DEFINE (scm_string_append
, "string-append", 0, 0, 1,
1424 "Return a newly allocated string whose characters form the\n"
1425 "concatenation of the given strings, @var{args}.")
1426 #define FUNC_NAME s_scm_string_append
1440 SCM_VALIDATE_REST_ARGUMENT (args
);
1441 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1444 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1445 len
= scm_i_string_length (s
);
1446 if (((size_t) -1) - total
< len
)
1447 scm_num_overflow (s_scm_string_append
);
1449 if (!scm_i_is_narrow_string (s
))
1454 res
= scm_i_make_string (total
, &data
.narrow
, 0);
1456 res
= scm_i_make_wide_string (total
, &data
.wide
, 0);
1458 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1462 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1463 len
= scm_i_string_length (s
);
1465 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL
);
1468 memcpy (data
.narrow
, scm_i_string_chars (s
), len
);
1473 if (scm_i_is_narrow_string (s
))
1475 const char *src
= scm_i_string_chars (s
);
1476 for (i
= 0; i
< len
; i
++)
1477 data
.wide
[i
] = (unsigned char) src
[i
];
1480 u32_cpy ((scm_t_uint32
*) data
.wide
,
1481 (scm_t_uint32
*) scm_i_string_wide_chars (s
), len
);
1485 scm_remember_upto_here_1 (s
);
1488 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL
);
1495 /* Charset conversion error handling. */
1497 SCM_SYMBOL (scm_encoding_error_key
, "encoding-error");
1498 SCM_SYMBOL (scm_decoding_error_key
, "decoding-error");
1500 /* Raise an exception informing that character CHR could not be written
1501 to PORT in its current encoding. */
1503 scm_encoding_error (const char *subr
, int err
, const char *message
,
1506 scm_throw (scm_encoding_error_key
,
1507 scm_list_n (scm_from_latin1_string (subr
),
1508 scm_from_latin1_string (message
),
1514 /* Raise an exception informing of an encoding error on PORT. This
1515 means that a character could not be written in PORT's encoding. */
1517 scm_decoding_error (const char *subr
, int err
, const char *message
, SCM port
)
1519 scm_throw (scm_decoding_error_key
,
1520 scm_list_n (scm_from_latin1_string (subr
),
1521 scm_from_latin1_string (message
),
1528 /* String conversion to/from C. */
1531 decoding_error (const char *func_name
, int errno_save
,
1532 const char *str
, size_t len
)
1534 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1539 buf
= scm_gc_malloc_pointerless (len
, "bytevector");
1540 memcpy (buf
, str
, len
);
1541 bv
= scm_c_take_gc_bytevector (buf
, len
, SCM_BOOL_F
);
1543 scm_decoding_error (func_name
, errno_save
,
1544 "input locale conversion error", bv
);
1548 scm_from_stringn (const char *str
, size_t len
, const char *encoding
,
1549 scm_t_string_failed_conversion_handler handler
)
1556 /* The order of these checks is important. */
1557 if (!str
&& len
!= 0)
1558 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL
);
1559 if (len
== (size_t) -1)
1562 if (c_strcasecmp (encoding
, "ISO-8859-1") == 0 || len
== 0)
1563 return scm_from_latin1_stringn (str
, len
);
1564 else if (c_strcasecmp (encoding
, "UTF-8") == 0
1565 && handler
== SCM_FAILED_CONVERSION_ERROR
)
1566 return scm_from_utf8_stringn (str
, len
);
1569 u32
= (scm_t_wchar
*) u32_conv_from_encoding (encoding
,
1570 (enum iconv_ilseq_handler
)
1576 if (SCM_UNLIKELY (u32
== NULL
))
1577 decoding_error (__func__
, errno
, str
, len
);
1581 if (u32
[i
++] > 0xFF)
1590 res
= scm_i_make_string (u32len
, &dst
, 0);
1591 for (i
= 0; i
< u32len
; i
++)
1592 dst
[i
] = (unsigned char) u32
[i
];
1598 res
= scm_i_make_wide_string (u32len
, &wdst
, 0);
1599 u32_cpy ((scm_t_uint32
*) wdst
, (scm_t_uint32
*) u32
, u32len
);
1608 scm_from_locale_string (const char *str
)
1610 return scm_from_locale_stringn (str
, -1);
1614 scm_from_locale_stringn (const char *str
, size_t len
)
1616 return scm_from_stringn (str
, len
, locale_charset (),
1617 scm_i_default_port_conversion_handler ());
1621 scm_from_latin1_string (const char *str
)
1623 return scm_from_latin1_stringn (str
, -1);
1627 scm_from_latin1_stringn (const char *str
, size_t len
)
1632 if (len
== (size_t) -1)
1635 /* Make a narrow string and copy STR as is. */
1636 result
= scm_i_make_string (len
, &buf
, 0);
1637 memcpy (buf
, str
, len
);
1643 scm_from_utf8_string (const char *str
)
1645 return scm_from_utf8_stringn (str
, -1);
1649 scm_from_utf8_stringn (const char *str
, size_t len
)
1652 const scm_t_uint8
*ustr
= (const scm_t_uint8
*) str
;
1653 int ascii
= 1, narrow
= 1;
1656 if (len
== (size_t) -1)
1676 nbytes
= u8_mbtouc (&c
, ustr
+ i
, len
- i
);
1680 decoding_error (__func__
, errno
, str
, len
);
1693 res
= scm_i_make_string (char_len
, &dst
, 0);
1694 memcpy (dst
, str
, len
);
1702 res
= scm_i_make_string (char_len
, &dst
, 0);
1704 for (i
= 0, j
= 0; i
< len
; j
++)
1706 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1707 dst
[j
] = (signed char) c
;
1716 res
= scm_i_make_wide_string (char_len
, &dst
, 0);
1718 for (i
= 0, j
= 0; i
< len
; j
++)
1720 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1729 scm_from_utf32_string (const scm_t_wchar
*str
)
1731 return scm_from_utf32_stringn (str
, -1);
1735 scm_from_utf32_stringn (const scm_t_wchar
*str
, size_t len
)
1740 if (len
== (size_t) -1)
1741 len
= u32_strlen ((uint32_t *) str
);
1743 result
= scm_i_make_wide_string (len
, &buf
, 0);
1744 memcpy (buf
, str
, len
* sizeof (scm_t_wchar
));
1745 scm_i_try_narrow_string (result
);
1751 scm_from_port_string (const char *str
, SCM port
)
1753 return scm_from_port_stringn (str
, -1, port
);
1757 scm_from_port_stringn (const char *str
, size_t len
, SCM port
)
1759 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
1760 scm_t_port_internal
*pti
= SCM_PORT_GET_INTERNAL (port
);
1762 if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_LATIN1
)
1763 return scm_from_latin1_stringn (str
, len
);
1764 else if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_UTF8
1765 && (pt
->ilseq_handler
== SCM_FAILED_CONVERSION_ERROR
1766 || (u8_check ((uint8_t *) str
, len
) == NULL
)))
1767 return scm_from_utf8_stringn (str
, len
);
1769 return scm_from_stringn (str
, len
, pt
->encoding
, pt
->ilseq_handler
);
1772 /* Create a new scheme string from the C string STR. The memory of
1773 STR may be used directly as storage for the new string. */
1774 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1775 would be to register a finalizer to eventually free(3) STR, which isn't
1776 worth it. Should we just deprecate the `scm_take_' functions? */
1778 scm_take_locale_stringn (char *str
, size_t len
)
1782 res
= scm_from_locale_stringn (str
, len
);
1789 scm_take_locale_string (char *str
)
1791 return scm_take_locale_stringn (str
, -1);
1794 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1795 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1796 Set *LENP to the size of the resulting string.
1798 FIXME: This is a hack we should get rid of. See
1799 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1802 unistring_escapes_to_guile_escapes (char *buf
, size_t *lenp
)
1804 char *before
, *after
;
1813 if ((i
<= *lenp
- 6)
1814 && before
[i
] == '\\'
1815 && before
[i
+ 1] == 'u'
1816 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1818 /* Convert \u00NN to \xNN */
1821 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1822 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1826 else if ((i
<= *lenp
- 10)
1827 && before
[i
] == '\\'
1828 && before
[i
+ 1] == 'U'
1829 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1831 /* Convert \U00NNNNNN to \UNNNNNN */
1834 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1835 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1836 after
[j
+ 4] = tolower ((int) before
[i
+ 6]);
1837 after
[j
+ 5] = tolower ((int) before
[i
+ 7]);
1838 after
[j
+ 6] = tolower ((int) before
[i
+ 8]);
1839 after
[j
+ 7] = tolower ((int) before
[i
+ 9]);
1845 after
[j
] = before
[i
];
1853 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1854 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1855 of the resulting string. BUF must be large enough to handle the
1856 worst case when `\uXXXX' escapes (6 characters) are replaced by
1857 `\xXXXX;' (7 characters). */
1859 unistring_escapes_to_r6rs_escapes (char *buf
, size_t *lenp
)
1861 char *before
, *after
;
1863 /* The worst case is if the input string contains all 4-digit hex escapes.
1864 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1865 size_t max_out_len
= (*lenp
* 7) / 6 + 1;
1866 size_t nzeros
, ndigits
;
1869 after
= alloca (max_out_len
);
1874 if (((i
<= *lenp
- 6) && before
[i
] == '\\' && before
[i
+ 1] == 'u')
1875 || ((i
<= *lenp
- 10) && before
[i
] == '\\' && before
[i
+ 1] == 'U'))
1877 if (before
[i
+ 1] == 'u')
1879 else if (before
[i
+ 1] == 'U')
1884 /* Add the R6RS hex escape initial sequence. */
1888 /* Move string positions to the start of the hex numbers. */
1892 /* Find the number of initial zeros in this hex number. */
1894 while (before
[i
+ nzeros
] == '0' && nzeros
< ndigits
)
1897 /* Copy the number, skipping initial zeros, and then move the string
1899 if (nzeros
== ndigits
)
1908 for (pos
= 0; pos
< ndigits
- nzeros
; pos
++)
1909 after
[j
+ pos
] = tolower ((int) before
[i
+ nzeros
+ pos
]);
1911 j
+= (ndigits
- nzeros
);
1914 /* Add terminating semicolon. */
1920 after
[j
] = before
[i
];
1926 memcpy (before
, after
, j
);
1930 scm_to_locale_string (SCM str
)
1932 return scm_to_locale_stringn (str
, NULL
);
1936 scm_to_locale_stringn (SCM str
, size_t *lenp
)
1938 return scm_to_stringn (str
, lenp
,
1940 scm_i_default_port_conversion_handler ());
1944 scm_to_latin1_string (SCM str
)
1946 return scm_to_latin1_stringn (str
, NULL
);
1950 scm_to_latin1_stringn (SCM str
, size_t *lenp
)
1951 #define FUNC_NAME "scm_to_latin1_stringn"
1955 SCM_VALIDATE_STRING (1, str
);
1957 if (scm_i_is_narrow_string (str
))
1959 size_t len
= scm_i_string_length (str
);
1964 result
= scm_strndup (scm_i_string_data (str
), len
);
1967 result
= scm_to_stringn (str
, lenp
, NULL
,
1968 SCM_FAILED_CONVERSION_ERROR
);
1975 scm_to_utf8_string (SCM str
)
1977 return scm_to_utf8_stringn (str
, NULL
);
1981 latin1_u8_strlen (const scm_t_uint8
*str
, size_t len
)
1984 for (i
= 0, ret
= 0; i
< len
; i
++)
1985 ret
+= (str
[i
] < 128) ? 1 : 2;
1990 latin1_to_u8 (const scm_t_uint8
*str
, size_t latin_len
,
1991 scm_t_uint8
*u8_result
, size_t *u8_lenp
)
1994 size_t u8_len
= latin1_u8_strlen (str
, latin_len
);
1996 if (!(u8_result
&& u8_lenp
&& *u8_lenp
> u8_len
))
1997 u8_result
= scm_malloc (u8_len
+ 1);
2001 for (i
= 0, n
= 0; i
< latin_len
; i
++)
2002 n
+= u8_uctomb (u8_result
+ n
, str
[i
], u8_len
- n
);
2012 (Note that this includes code points that are not allowed by Unicode,
2013 but since this function has no way to report an error, and its
2014 purpose is to determine the size of destination buffers for
2015 libunicode conversion functions, we err on the safe side and handle
2016 everything that libunicode might conceivably handle, now or in the
2019 Char. number range | UTF-8 octet sequence
2020 (hexadecimal) | (binary)
2021 --------------------+------------------------------------------------------
2022 0000 0000-0000 007F | 0xxxxxxx
2023 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2024 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
2025 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2026 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2027 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2031 u32_u8_length_in_bytes (const scm_t_uint32
*str
, size_t len
)
2035 for (i
= 0, ret
= 0; i
< len
; i
++)
2037 scm_t_uint32 c
= str
[i
];
2041 else if (c
<= 0x7ff)
2043 else if (c
<= 0xffff)
2045 else if (c
<= 0x1fffff)
2047 else if (c
<= 0x3ffffff)
2057 scm_to_utf8_stringn (SCM str
, size_t *lenp
)
2058 #define FUNC_NAME "scm_to_utf8_stringn"
2060 SCM_VALIDATE_STRING (1, str
);
2062 if (scm_i_is_narrow_string (str
))
2063 return (char *) latin1_to_u8 ((scm_t_uint8
*) scm_i_string_chars (str
),
2064 scm_i_string_length (str
),
2068 scm_t_uint32
*chars
= (scm_t_uint32
*) scm_i_string_wide_chars (str
);
2069 scm_t_uint8
*buf
, *ret
;
2070 size_t num_chars
= scm_i_string_length (str
);
2071 size_t num_bytes_predicted
, num_bytes_actual
;
2073 num_bytes_predicted
= u32_u8_length_in_bytes (chars
, num_chars
);
2077 *lenp
= num_bytes_predicted
;
2078 buf
= scm_malloc (num_bytes_predicted
);
2082 buf
= scm_malloc (num_bytes_predicted
+ 1);
2083 buf
[num_bytes_predicted
] = 0;
2086 num_bytes_actual
= num_bytes_predicted
;
2087 ret
= u32_to_u8 (chars
, num_chars
, buf
, &num_bytes_actual
);
2089 if (SCM_LIKELY (ret
== buf
&& num_bytes_actual
== num_bytes_predicted
))
2090 return (char *) ret
;
2092 /* An error: a bad codepoint. */
2094 int saved_errno
= errno
;
2100 scm_decoding_error ("scm_to_utf8_stringn", errno
,
2101 "invalid codepoint in string", str
);
2111 scm_to_utf32_string (SCM str
)
2113 return scm_to_utf32_stringn (str
, NULL
);
2117 scm_to_utf32_stringn (SCM str
, size_t *lenp
)
2118 #define FUNC_NAME "scm_to_utf32_stringn"
2120 scm_t_wchar
*result
;
2122 SCM_VALIDATE_STRING (1, str
);
2124 if (scm_i_is_narrow_string (str
))
2126 scm_t_uint8
*codepoints
;
2129 codepoints
= (scm_t_uint8
*) scm_i_string_chars (str
);
2130 len
= scm_i_string_length (str
);
2134 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
2135 for (i
= 0; i
< len
; i
++)
2136 result
[i
] = codepoints
[i
];
2143 len
= scm_i_string_length (str
);
2147 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
2148 memcpy (result
, scm_i_string_wide_chars (str
),
2149 len
* sizeof (scm_t_wchar
));
2158 scm_to_port_string (SCM str
, SCM port
)
2160 return scm_to_port_stringn (str
, NULL
, port
);
2164 scm_to_port_stringn (SCM str
, size_t *lenp
, SCM port
)
2166 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
2167 scm_t_port_internal
*pti
= SCM_PORT_GET_INTERNAL (port
);
2169 if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_LATIN1
2170 && pt
->ilseq_handler
== SCM_FAILED_CONVERSION_ERROR
)
2171 return scm_to_latin1_stringn (str
, lenp
);
2172 else if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_UTF8
)
2173 return scm_to_utf8_stringn (str
, lenp
);
2175 return scm_to_stringn (str
, lenp
, pt
->encoding
, pt
->ilseq_handler
);
2178 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2179 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2180 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2181 defined by HANDLER. */
2183 scm_to_stringn (SCM str
, size_t *lenp
, const char *encoding
,
2184 scm_t_string_failed_conversion_handler handler
)
2187 size_t ilen
, len
, i
;
2191 if (!scm_is_string (str
))
2192 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
2193 ilen
= scm_i_string_length (str
);
2197 buf
= scm_malloc (1);
2205 for (i
= 0; i
< ilen
; i
++)
2206 if (scm_i_string_ref (str
, i
) == '\0')
2207 scm_misc_error (NULL
,
2208 "string contains #\\nul character: ~S",
2211 if (scm_i_is_narrow_string (str
)
2212 && c_strcasecmp (encoding
, "ISO-8859-1") == 0)
2214 /* If using native Latin-1 encoding, just copy the string
2218 buf
= scm_malloc (ilen
);
2219 memcpy (buf
, scm_i_string_chars (str
), ilen
);
2225 buf
= scm_malloc (ilen
+ 1);
2226 memcpy (buf
, scm_i_string_chars (str
), ilen
);
2238 if (scm_i_is_narrow_string (str
))
2240 ret
= mem_iconveh (scm_i_string_chars (str
), ilen
,
2242 (enum iconv_ilseq_handler
) handler
, NULL
,
2246 scm_encoding_error (__func__
, errno
,
2247 "cannot convert narrow string to output locale",
2249 /* FIXME: Faulty character unknown. */
2254 buf
= u32_conv_to_encoding (enc
,
2255 (enum iconv_ilseq_handler
) handler
,
2256 (scm_t_uint32
*) scm_i_string_wide_chars (str
),
2261 scm_encoding_error (__func__
, errno
,
2262 "cannot convert wide string to output locale",
2264 /* FIXME: Faulty character unknown. */
2267 if (handler
== SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE
)
2269 if (SCM_R6RS_ESCAPES_P
)
2271 /* The worst case is if the input string contains all 4-digit
2272 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2273 (seven characters). Make BUF large enough to hold
2275 buf
= scm_realloc (buf
, (len
* 7) / 6 + 1);
2276 unistring_escapes_to_r6rs_escapes (buf
, &len
);
2279 unistring_escapes_to_guile_escapes (buf
, &len
);
2281 buf
= scm_realloc (buf
, len
);
2287 buf
= scm_realloc (buf
, len
+ 1);
2291 scm_remember_upto_here_1 (str
);
2296 scm_to_locale_stringbuf (SCM str
, char *buf
, size_t max_len
)
2299 char *result
= NULL
;
2300 if (!scm_is_string (str
))
2301 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
2302 result
= scm_to_locale_stringn (str
, &len
);
2304 memcpy (buf
, result
, (len
> max_len
) ? max_len
: len
);
2307 scm_remember_upto_here_1 (str
);
2312 /* Unicode string normalization. */
2314 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2315 libguile/i18n.c. It would be useful to have this factored out into a more
2316 convenient location, but its use of alloca makes that tricky to do. */
2319 normalize_str (SCM string
, uninorm_t form
)
2322 scm_t_uint32
*w_str
;
2324 size_t rlen
, len
= scm_i_string_length (string
);
2326 if (scm_i_is_narrow_string (string
))
2329 const char *buf
= scm_i_string_chars (string
);
2331 w_str
= alloca (sizeof (scm_t_wchar
) * (len
+ 1));
2333 for (i
= 0; i
< len
; i
++)
2334 w_str
[i
] = (unsigned char) buf
[i
];
2338 w_str
= (scm_t_uint32
*) scm_i_string_wide_chars (string
);
2340 w_str
= u32_normalize (form
, w_str
, len
, NULL
, &rlen
);
2342 ret
= scm_i_make_wide_string (rlen
, &cbuf
, 0);
2343 u32_cpy ((scm_t_uint32
*) cbuf
, w_str
, rlen
);
2346 scm_i_try_narrow_string (ret
);
2351 SCM_DEFINE (scm_string_normalize_nfc
, "string-normalize-nfc", 1, 0, 0,
2353 "Returns the NFC normalized form of @var{string}.")
2354 #define FUNC_NAME s_scm_string_normalize_nfc
2356 SCM_VALIDATE_STRING (1, string
);
2357 return normalize_str (string
, UNINORM_NFC
);
2361 SCM_DEFINE (scm_string_normalize_nfd
, "string-normalize-nfd", 1, 0, 0,
2363 "Returns the NFD normalized form of @var{string}.")
2364 #define FUNC_NAME s_scm_string_normalize_nfd
2366 SCM_VALIDATE_STRING (1, string
);
2367 return normalize_str (string
, UNINORM_NFD
);
2371 SCM_DEFINE (scm_string_normalize_nfkc
, "string-normalize-nfkc", 1, 0, 0,
2373 "Returns the NFKC normalized form of @var{string}.")
2374 #define FUNC_NAME s_scm_string_normalize_nfkc
2376 SCM_VALIDATE_STRING (1, string
);
2377 return normalize_str (string
, UNINORM_NFKC
);
2381 SCM_DEFINE (scm_string_normalize_nfkd
, "string-normalize-nfkd", 1, 0, 0,
2383 "Returns the NFKD normalized form of @var{string}.")
2384 #define FUNC_NAME s_scm_string_normalize_nfkd
2386 SCM_VALIDATE_STRING (1, string
);
2387 return normalize_str (string
, UNINORM_NFKD
);
2391 /* converts C scm_array of strings to SCM scm_list of strings.
2392 If argc < 0, a null terminated scm_array is assumed.
2393 The current locale encoding is assumed */
2395 scm_makfromstrs (int argc
, char **argv
)
2400 for (i
= 0; argv
[i
]; i
++);
2402 lst
= scm_cons (scm_from_locale_string (argv
[i
]), lst
);
2406 /* Return a newly allocated array of char pointers to each of the strings
2407 in args, with a terminating NULL pointer. The strings are encoded using
2408 the current locale. */
2411 scm_i_allocate_string_pointers (SCM list
)
2412 #define FUNC_NAME "scm_i_allocate_string_pointers"
2415 int list_len
= scm_ilength (list
);
2419 scm_wrong_type_arg_msg (NULL
, 0, list
, "proper list");
2421 result
= scm_gc_malloc ((list_len
+ 1) * sizeof (char *),
2423 result
[list_len
] = NULL
;
2425 /* The list might have been modified in another thread, so
2426 we check LIST before each access.
2428 for (i
= 0; i
< list_len
&& scm_is_pair (list
); i
++)
2430 SCM str
= SCM_CAR (list
);
2431 size_t len
; /* String length in bytes */
2432 char *c_str
= scm_to_locale_stringn (str
, &len
);
2434 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2435 scm_malloc to allocate the returned string, which must be
2436 explicitly deallocated. This forces us to copy the string a
2437 second time into a new buffer. Ideally there would be variants
2438 of scm_to_*_stringn that can return garbage-collected buffers. */
2440 result
[i
] = scm_gc_malloc_pointerless (len
+ 1, "string");
2441 memcpy (result
[i
], c_str
, len
);
2442 result
[i
][len
] = '\0';
2445 list
= SCM_CDR (list
);
2453 scm_i_get_substring_spec (size_t len
,
2454 SCM start
, size_t *cstart
,
2455 SCM end
, size_t *cend
)
2457 if (SCM_UNBNDP (start
))
2460 *cstart
= scm_to_unsigned_integer (start
, 0, len
);
2462 if (SCM_UNBNDP (end
))
2465 *cend
= scm_to_unsigned_integer (end
, *cstart
, len
);
2468 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR
, scm_make_string
)
2473 scm_nullstr
= scm_i_make_string (0, NULL
, 0);
2475 #include "libguile/strings.x"