1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
32 #include <c-strcase.h>
34 #include "striconveh.h"
36 #include "libguile/_scm.h"
37 #include "libguile/chars.h"
38 #include "libguile/root.h"
39 #include "libguile/strings.h"
40 #include "libguile/ports.h"
41 #include "libguile/ports-internal.h"
42 #include "libguile/error.h"
43 #include "libguile/generalized-vectors.h"
44 #include "libguile/deprecation.h"
45 #include "libguile/validate.h"
46 #include "libguile/private-options.h"
56 * XXX - keeping an accurate refcount during GC seems to be quite
57 * tricky, so we just keep score of whether a stringbuf might be
58 * shared, not whether it definitely is.
60 * The scheme I (mvo) tried to keep an accurate reference count would
61 * recount all strings that point to a stringbuf during the mark-phase
62 * of the GC. This was done since one cannot access the stringbuf of
63 * a string when that string is freed (in order to decrease the
64 * reference count). The memory of the stringbuf might have been
65 * reused already for something completely different.
67 * This recounted worked for a small number of threads beating on
68 * cow-strings, but it failed randomly with more than 10 threads, say.
69 * I couldn't figure out what went wrong, so I used the conservative
70 * approach implemented below.
72 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
73 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
77 /* The size in words of the stringbuf header (type tag + size). */
78 #define STRINGBUF_HEADER_SIZE 2U
80 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
82 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
83 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
85 #define STRINGBUF_TAG scm_tc7_stringbuf
86 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
87 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
89 #define STRINGBUF_CONTENTS(buf) ((void *) \
90 SCM_CELL_OBJECT_LOC (buf, \
91 STRINGBUF_HEADER_SIZE))
92 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
93 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
95 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
97 #define SET_STRINGBUF_SHARED(buf) \
100 /* Don't modify BUF if it's already marked as shared since it might be \
101 a read-only, statically allocated stringbuf. */ \
102 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
103 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
107 #ifdef SCM_STRING_LENGTH_HISTOGRAM
108 static size_t lenhist
[1001];
111 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
114 make_stringbuf (size_t len
)
116 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
117 scm_i_symbol_chars, all stringbufs are null-terminated. Once
118 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
119 has been changed for scm_i_symbol_chars, this null-termination
125 #ifdef SCM_STRING_LENGTH_HISTOGRAM
132 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ len
+ 1,
135 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
);
136 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
138 STRINGBUF_CHARS (buf
)[len
] = 0;
143 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
146 make_wide_stringbuf (size_t len
)
151 #ifdef SCM_STRING_LENGTH_HISTOGRAM
158 raw_len
= (len
+ 1) * sizeof (scm_t_wchar
);
159 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ raw_len
,
162 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
| STRINGBUF_F_WIDE
);
163 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
165 STRINGBUF_WIDE_CHARS (buf
)[len
] = 0;
170 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
171 characters from BUF. */
173 wide_stringbuf (SCM buf
)
177 if (STRINGBUF_WIDE (buf
))
184 len
= STRINGBUF_LENGTH (buf
);
186 new_buf
= make_wide_stringbuf (len
);
188 mem
= STRINGBUF_WIDE_CHARS (new_buf
);
189 for (i
= 0; i
< len
; i
++)
190 mem
[i
] = (scm_t_wchar
) STRINGBUF_CHARS (buf
)[i
];
197 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
198 characters from BUF, if possible. */
200 narrow_stringbuf (SCM buf
)
204 if (!STRINGBUF_WIDE (buf
))
212 len
= STRINGBUF_LENGTH (buf
);
213 wmem
= STRINGBUF_WIDE_CHARS (buf
);
215 for (i
= 0; i
< len
; i
++)
217 /* BUF cannot be narrowed. */
220 new_buf
= make_stringbuf (len
);
222 mem
= STRINGBUF_CHARS (new_buf
);
223 for (i
= 0; i
< len
; i
++)
224 mem
[i
] = (unsigned char) wmem
[i
];
231 scm_i_pthread_mutex_t stringbuf_write_mutex
= SCM_I_PTHREAD_MUTEX_INITIALIZER
;
234 /* Copy-on-write strings.
237 #define STRING_TAG scm_tc7_string
239 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
240 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
241 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
243 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
244 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
246 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
248 /* Read-only strings.
251 #define RO_STRING_TAG scm_tc7_ro_string
252 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
254 /* Mutation-sharing substrings
257 #define SH_STRING_TAG (scm_tc7_string + 0x100)
259 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
260 /* START and LENGTH as for STRINGs. */
262 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
265 scm_i_print_stringbuf (SCM exp
, SCM port
, scm_print_state
*pstate
)
269 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
270 SET_STRINGBUF_SHARED (exp
);
271 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
273 str
= scm_double_cell (RO_STRING_TAG
, SCM_UNPACK(exp
),
274 0, STRINGBUF_LENGTH (exp
));
276 scm_puts ("#<stringbuf ", port
);
277 scm_iprin1 (str
, port
, pstate
);
278 scm_puts (">", port
);
283 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
284 characters. CHARSP, if not NULL, will be set to location of the
285 char array. If READ_ONLY_P, the returned string is read-only;
286 otherwise it is writable. */
288 scm_i_make_string (size_t len
, char **charsp
, int read_only_p
)
290 static SCM null_stringbuf
= SCM_BOOL_F
;
296 if (SCM_UNLIKELY (scm_is_false (null_stringbuf
)))
298 null_stringbuf
= make_stringbuf (0);
299 SET_STRINGBUF_SHARED (null_stringbuf
);
301 buf
= null_stringbuf
;
304 buf
= make_stringbuf (len
);
307 *charsp
= (char *) STRINGBUF_CHARS (buf
);
308 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
310 (scm_t_bits
) 0, (scm_t_bits
) len
);
314 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
315 characters. CHARSP, if not NULL, will be set to location of the
316 character array. If READ_ONLY_P, the returned string is read-only;
317 otherwise it is writable. */
319 scm_i_make_wide_string (size_t len
, scm_t_wchar
**charsp
, int read_only_p
)
321 SCM buf
= make_wide_stringbuf (len
);
324 *charsp
= STRINGBUF_WIDE_CHARS (buf
);
325 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
327 (scm_t_bits
) 0, (scm_t_bits
) len
);
332 validate_substring_args (SCM str
, size_t start
, size_t end
)
334 if (!IS_STRING (str
))
335 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
336 if (start
> STRING_LENGTH (str
))
337 scm_out_of_range (NULL
, scm_from_size_t (start
));
338 if (end
> STRING_LENGTH (str
) || end
< start
)
339 scm_out_of_range (NULL
, scm_from_size_t (end
));
343 get_str_buf_start (SCM
*str
, SCM
*buf
, size_t *start
)
345 *start
= STRING_START (*str
);
346 if (IS_SH_STRING (*str
))
348 *str
= SH_STRING_STRING (*str
);
349 *start
+= STRING_START (*str
);
351 *buf
= STRING_STRINGBUF (*str
);
355 scm_i_substring (SCM str
, size_t start
, size_t end
)
358 return scm_i_make_string (0, NULL
, 0);
363 get_str_buf_start (&str
, &buf
, &str_start
);
364 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
365 SET_STRINGBUF_SHARED (buf
);
366 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
367 return scm_double_cell (STRING_TAG
, SCM_UNPACK(buf
),
368 (scm_t_bits
)str_start
+ start
,
369 (scm_t_bits
) end
- start
);
374 scm_i_substring_read_only (SCM str
, size_t start
, size_t end
)
377 return scm_i_make_string (0, NULL
, 1);
382 get_str_buf_start (&str
, &buf
, &str_start
);
383 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
384 SET_STRINGBUF_SHARED (buf
);
385 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
386 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK(buf
),
387 (scm_t_bits
)str_start
+ start
,
388 (scm_t_bits
) end
- start
);
393 scm_i_substring_copy (SCM str
, size_t start
, size_t end
)
396 return scm_i_make_string (0, NULL
, 0);
399 size_t len
= end
- start
;
400 SCM buf
, my_buf
, substr
;
403 get_str_buf_start (&str
, &buf
, &str_start
);
404 if (scm_i_is_narrow_string (str
))
406 my_buf
= make_stringbuf (len
);
407 memcpy (STRINGBUF_CHARS (my_buf
),
408 STRINGBUF_CHARS (buf
) + str_start
+ start
, len
);
412 my_buf
= make_wide_stringbuf (len
);
413 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (my_buf
),
414 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
) + str_start
418 scm_remember_upto_here_1 (buf
);
419 substr
= scm_double_cell (STRING_TAG
, SCM_UNPACK (my_buf
),
420 (scm_t_bits
) 0, (scm_t_bits
) len
);
422 scm_i_try_narrow_string (substr
);
428 scm_i_substring_shared (SCM str
, size_t start
, size_t end
)
430 if (start
== 0 && end
== STRING_LENGTH (str
))
432 else if (start
== end
)
433 return scm_i_make_string (0, NULL
, 0);
436 size_t len
= end
- start
;
437 if (IS_SH_STRING (str
))
439 start
+= STRING_START (str
);
440 str
= SH_STRING_STRING (str
);
442 return scm_double_cell (SH_STRING_TAG
, SCM_UNPACK(str
),
443 (scm_t_bits
)start
, (scm_t_bits
) len
);
448 scm_c_substring (SCM str
, size_t start
, size_t end
)
450 validate_substring_args (str
, start
, end
);
451 return scm_i_substring (str
, start
, end
);
455 scm_c_substring_read_only (SCM str
, size_t start
, size_t end
)
457 validate_substring_args (str
, start
, end
);
458 return scm_i_substring_read_only (str
, start
, end
);
462 scm_c_substring_copy (SCM str
, size_t start
, size_t end
)
464 validate_substring_args (str
, start
, end
);
465 return scm_i_substring_copy (str
, start
, end
);
469 scm_c_substring_shared (SCM str
, size_t start
, size_t end
)
471 validate_substring_args (str
, start
, end
);
472 return scm_i_substring_shared (str
, start
, end
);
476 /* Internal accessors
479 /* Returns the number of characters in STR. This may be different
480 than the memory size of the string storage. */
482 scm_i_string_length (SCM str
)
484 return STRING_LENGTH (str
);
487 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
488 encoding. False if it is 'wide', having a 32-bit UCS-4
491 scm_i_is_narrow_string (SCM str
)
493 if (IS_SH_STRING (str
))
494 str
= SH_STRING_STRING (str
);
496 return !STRINGBUF_WIDE (STRING_STRINGBUF (str
));
499 /* Try to coerce a string to be narrow. It if is narrow already, do
500 nothing. If it is wide, shrink it to narrow if none of its
501 characters are above 0xFF. Return true if the string is narrow or
502 was made to be narrow. */
504 scm_i_try_narrow_string (SCM str
)
506 if (IS_SH_STRING (str
))
507 str
= SH_STRING_STRING (str
);
509 SET_STRING_STRINGBUF (str
, narrow_stringbuf (STRING_STRINGBUF (str
)));
511 return scm_i_is_narrow_string (str
);
514 /* Return a pointer to the raw data of the string, which can be either Latin-1
515 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
517 scm_i_string_data (SCM str
)
523 get_str_buf_start (&str
, &buf
, &start
);
525 data
= STRINGBUF_CONTENTS (buf
);
526 data
+= start
* (scm_i_is_narrow_string (str
) ? 1 : 4);
531 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
534 scm_i_string_chars (SCM str
)
538 get_str_buf_start (&str
, &buf
, &start
);
539 if (scm_i_is_narrow_string (str
))
540 return (const char *) STRINGBUF_CHARS (buf
) + start
;
542 scm_misc_error (NULL
, "Invalid read access of chars of wide string: ~s",
547 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
550 scm_i_string_wide_chars (SCM str
)
555 get_str_buf_start (&str
, &buf
, &start
);
556 if (!scm_i_is_narrow_string (str
))
557 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
) + start
;
559 scm_misc_error (NULL
, "Invalid read access of chars of narrow string: ~s",
563 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
564 a new string buffer, so that it can be modified without modifying
565 other strings. Also, lock the string mutex. Later, one must call
566 scm_i_string_stop_writing to unlock the mutex. */
568 scm_i_string_start_writing (SCM orig_str
)
570 SCM buf
, str
= orig_str
;
573 get_str_buf_start (&str
, &buf
, &start
);
574 if (IS_RO_STRING (str
))
575 scm_misc_error (NULL
, "string is read-only: ~s", scm_list_1 (orig_str
));
577 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
578 if (STRINGBUF_SHARED (buf
))
580 /* Clone the stringbuf. */
581 size_t len
= STRING_LENGTH (str
);
584 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
586 if (scm_i_is_narrow_string (str
))
588 new_buf
= make_stringbuf (len
);
589 memcpy (STRINGBUF_CHARS (new_buf
),
590 STRINGBUF_CHARS (buf
) + STRING_START (str
), len
);
595 new_buf
= make_wide_stringbuf (len
);
596 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
597 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
)
598 + STRING_START (str
)), len
);
601 SET_STRING_STRINGBUF (str
, new_buf
);
602 start
-= STRING_START (str
);
604 /* FIXME: The following operations are not atomic, so other threads
605 looking at STR may see an inconsistent state. Nevertheless it can't
606 hurt much since (i) accessing STR while it is being mutated can't
607 yield a crash, and (ii) concurrent accesses to STR should be
608 protected by a mutex at the application level. The latter may not
609 apply when STR != ORIG_STR, though. */
610 SET_STRING_START (str
, 0);
611 SET_STRING_STRINGBUF (str
, new_buf
);
615 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
620 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
622 scm_i_string_writable_chars (SCM str
)
627 get_str_buf_start (&str
, &buf
, &start
);
628 if (scm_i_is_narrow_string (str
))
629 return (char *) STRINGBUF_CHARS (buf
) + start
;
631 scm_misc_error (NULL
, "Invalid write access of chars of wide string: ~s",
636 /* Return a pointer to the UCS-4 codepoints of a string. */
638 scm_i_string_writable_wide_chars (SCM str
)
643 get_str_buf_start (&str
, &buf
, &start
);
644 if (!scm_i_is_narrow_string (str
))
645 return STRINGBUF_WIDE_CHARS (buf
) + start
;
647 scm_misc_error (NULL
, "Invalid write access of chars of narrow string: ~s",
651 /* Unlock the string mutex that was locked when
652 scm_i_string_start_writing was called. */
654 scm_i_string_stop_writing (void)
656 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
659 /* Return the Xth character of STR as a UCS-4 codepoint. */
661 scm_i_string_ref (SCM str
, size_t x
)
663 if (scm_i_is_narrow_string (str
))
664 return (scm_t_wchar
) (unsigned char) (scm_i_string_chars (str
)[x
]);
666 return scm_i_string_wide_chars (str
)[x
];
669 /* Returns index+1 of the first char in STR that matches C, or
670 0 if the char is not found. */
672 scm_i_string_contains_char (SCM str
, char ch
)
675 size_t len
= scm_i_string_length (str
);
678 if (scm_i_is_narrow_string (str
))
682 if (scm_i_string_chars (str
)[i
] == ch
)
691 if (scm_i_string_wide_chars (str
)[i
]
692 == (unsigned char) ch
)
701 scm_i_string_strcmp (SCM sstr
, size_t start_x
, const char *cstr
)
703 if (scm_i_is_narrow_string (sstr
))
705 const char *a
= scm_i_string_chars (sstr
) + start_x
;
706 const char *b
= cstr
;
707 return strncmp (a
, b
, strlen(b
));
712 const scm_t_wchar
*a
= scm_i_string_wide_chars (sstr
) + start_x
;
713 const char *b
= cstr
;
714 for (i
= 0; i
< strlen (b
); i
++)
716 if (a
[i
] != (unsigned char) b
[i
])
723 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
725 scm_i_string_set_x (SCM str
, size_t p
, scm_t_wchar chr
)
727 if (IS_SH_STRING (str
))
729 p
+= STRING_START (str
);
730 str
= SH_STRING_STRING (str
);
733 if (chr
> 0xFF && scm_i_is_narrow_string (str
))
734 SET_STRING_STRINGBUF (str
, wide_stringbuf (STRING_STRINGBUF (str
)));
736 if (scm_i_is_narrow_string (str
))
738 char *dst
= scm_i_string_writable_chars (str
);
743 scm_t_wchar
*dst
= scm_i_string_writable_wide_chars (str
);
751 Basic symbol creation and accessing is done here, the rest is in
752 symbols.[hc]. This has been done to keep stringbufs and the
753 internals of strings and string-like objects confined to this file.
756 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
759 scm_i_make_symbol (SCM name
, scm_t_bits flags
,
760 unsigned long hash
, SCM props
)
763 size_t start
= STRING_START (name
);
764 size_t length
= STRING_LENGTH (name
);
766 if (IS_SH_STRING (name
))
768 name
= SH_STRING_STRING (name
);
769 start
+= STRING_START (name
);
771 buf
= STRING_STRINGBUF (name
);
773 if (start
== 0 && length
== STRINGBUF_LENGTH (buf
))
776 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
777 SET_STRINGBUF_SHARED (buf
);
778 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
783 if (scm_i_is_narrow_string (name
))
785 SCM new_buf
= make_stringbuf (length
);
786 memcpy (STRINGBUF_CHARS (new_buf
),
787 STRINGBUF_CHARS (buf
) + start
, length
);
792 SCM new_buf
= make_wide_stringbuf (length
);
793 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
794 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
) + start
,
799 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
800 (scm_t_bits
) hash
, SCM_UNPACK (props
));
804 scm_i_c_make_symbol (const char *name
, size_t len
,
805 scm_t_bits flags
, unsigned long hash
, SCM props
)
807 SCM buf
= make_stringbuf (len
);
808 memcpy (STRINGBUF_CHARS (buf
), name
, len
);
810 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
811 (scm_t_bits
) hash
, SCM_UNPACK (props
));
814 /* Returns the number of characters in SYM. This may be different
815 from the memory size of SYM. */
817 scm_i_symbol_length (SCM sym
)
819 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
823 scm_c_symbol_length (SCM sym
)
824 #define FUNC_NAME "scm_c_symbol_length"
826 SCM_VALIDATE_SYMBOL (1, sym
);
828 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
832 /* True if the name of SYM is stored as a Latin-1 encoded string.
833 False if it is stored as a 32-bit UCS-4-encoded string. */
835 scm_i_is_narrow_symbol (SCM sym
)
839 buf
= SYMBOL_STRINGBUF (sym
);
840 return !STRINGBUF_WIDE (buf
);
843 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
844 contains the name of SYM. */
846 scm_i_symbol_chars (SCM sym
)
850 buf
= SYMBOL_STRINGBUF (sym
);
851 if (!STRINGBUF_WIDE (buf
))
852 return (const char *) STRINGBUF_CHARS (buf
);
854 scm_misc_error (NULL
, "Invalid access of chars of a wide symbol ~S",
858 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
861 scm_i_symbol_wide_chars (SCM sym
)
865 buf
= SYMBOL_STRINGBUF (sym
);
866 if (STRINGBUF_WIDE (buf
))
867 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
);
869 scm_misc_error (NULL
, "Invalid access of chars of a narrow symbol ~S",
874 scm_i_symbol_substring (SCM sym
, size_t start
, size_t end
)
876 SCM buf
= SYMBOL_STRINGBUF (sym
);
877 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
878 SET_STRINGBUF_SHARED (buf
);
879 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
880 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK (buf
),
881 (scm_t_bits
)start
, (scm_t_bits
) end
- start
);
884 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
886 scm_i_symbol_ref (SCM sym
, size_t x
)
888 if (scm_i_is_narrow_symbol (sym
))
889 return (scm_t_wchar
) (unsigned char) (scm_i_symbol_chars (sym
)[x
]);
891 return scm_i_symbol_wide_chars (sym
)[x
];
897 SCM_DEFINE (scm_sys_string_dump
, "%string-dump", 1, 0, 0, (SCM str
),
898 "Returns an association list containing debugging information\n"
899 "for @var{str}. The association list has the following entries."
902 "The string itself.\n"
904 "The start index of the string into its stringbuf\n"
906 "The length of the string\n"
908 "If this string is a substring, it returns its parent string.\n"
909 "Otherwise, it returns @code{#f}\n"
911 "@code{#t} if the string is read-only\n"
912 "@item stringbuf-chars\n"
913 "A new string containing this string's stringbuf's characters\n"
914 "@item stringbuf-length\n"
915 "The number of characters in this stringbuf\n"
916 "@item stringbuf-shared\n"
917 "@code{#t} if this stringbuf is shared\n"
918 "@item stringbuf-wide\n"
919 "@code{#t} if this stringbuf's characters are stored in a\n"
920 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
923 #define FUNC_NAME s_scm_sys_string_dump
925 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
;
927 SCM_VALIDATE_STRING (1, str
);
930 e1
= scm_cons (scm_from_latin1_symbol ("string"),
932 e2
= scm_cons (scm_from_latin1_symbol ("start"),
933 scm_from_size_t (STRING_START (str
)));
934 e3
= scm_cons (scm_from_latin1_symbol ("length"),
935 scm_from_size_t (STRING_LENGTH (str
)));
937 if (IS_SH_STRING (str
))
939 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
940 SH_STRING_STRING (str
));
941 buf
= STRING_STRINGBUF (SH_STRING_STRING (str
));
945 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
947 buf
= STRING_STRINGBUF (str
);
950 if (IS_RO_STRING (str
))
951 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
954 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
958 if (!STRINGBUF_WIDE (buf
))
960 size_t len
= STRINGBUF_LENGTH (buf
);
962 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
963 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
964 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
969 size_t len
= STRINGBUF_LENGTH (buf
);
971 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
972 u32_cpy ((scm_t_uint32
*) cbuf
,
973 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
974 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
977 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
978 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
979 if (STRINGBUF_SHARED (buf
))
980 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
983 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
985 if (STRINGBUF_WIDE (buf
))
986 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
989 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
992 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
, SCM_UNDEFINED
);
996 SCM_DEFINE (scm_sys_symbol_dump
, "%symbol-dump", 1, 0, 0, (SCM sym
),
997 "Returns an association list containing debugging information\n"
998 "for @var{sym}. The association list has the following entries."
1001 "The symbol itself\n"
1005 "@code{#t} if it is an interned symbol\n"
1006 "@item stringbuf-chars\n"
1007 "A new string containing this symbols's stringbuf's characters\n"
1008 "@item stringbuf-length\n"
1009 "The number of characters in this stringbuf\n"
1010 "@item stringbuf-shared\n"
1011 "@code{#t} if this stringbuf is shared\n"
1012 "@item stringbuf-wide\n"
1013 "@code{#t} if this stringbuf's characters are stored in a\n"
1014 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1017 #define FUNC_NAME s_scm_sys_symbol_dump
1019 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
;
1021 SCM_VALIDATE_SYMBOL (1, sym
);
1022 e1
= scm_cons (scm_from_latin1_symbol ("symbol"),
1024 e2
= scm_cons (scm_from_latin1_symbol ("hash"),
1025 scm_from_ulong (scm_i_symbol_hash (sym
)));
1026 e3
= scm_cons (scm_from_latin1_symbol ("interned"),
1027 scm_symbol_interned_p (sym
));
1028 buf
= SYMBOL_STRINGBUF (sym
);
1030 /* Stringbuf info */
1031 if (!STRINGBUF_WIDE (buf
))
1033 size_t len
= STRINGBUF_LENGTH (buf
);
1035 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
1036 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
1037 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1042 size_t len
= STRINGBUF_LENGTH (buf
);
1044 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
1045 u32_cpy ((scm_t_uint32
*) cbuf
,
1046 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
1047 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1050 e5
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1051 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
1052 if (STRINGBUF_SHARED (buf
))
1053 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1056 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1058 if (STRINGBUF_WIDE (buf
))
1059 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1062 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1064 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, SCM_UNDEFINED
);
1069 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1071 SCM_DEFINE (scm_sys_stringbuf_hist
, "%stringbuf-hist", 0, 0, 0, (void), "")
1072 #define FUNC_NAME s_scm_sys_stringbuf_hist
1075 for (i
= 0; i
< 1000; i
++)
1077 fprintf (stderr
, " %3d: %u\n", i
, lenhist
[i
]);
1078 fprintf (stderr
, ">999: %u\n", lenhist
[1000]);
1079 return SCM_UNSPECIFIED
;
1087 SCM_DEFINE (scm_string_p
, "string?", 1, 0, 0,
1089 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1090 #define FUNC_NAME s_scm_string_p
1092 return scm_from_bool (IS_STRING (obj
));
1097 SCM_REGISTER_PROC (s_scm_list_to_string
, "list->string", 1, 0, 0, scm_string
);
1099 SCM_DEFINE (scm_string
, "string", 0, 0, 1,
1101 "@deffnx {Scheme Procedure} list->string chrs\n"
1102 "Return a newly allocated string composed of the arguments,\n"
1104 #define FUNC_NAME s_scm_string
1106 SCM result
= SCM_BOOL_F
;
1113 /* Verify that this is a list of chars. */
1114 i
= scm_ilength (chrs
);
1115 SCM_ASSERT (i
>= 0, chrs
, SCM_ARG1
, FUNC_NAME
);
1120 while (len
> 0 && scm_is_pair (rest
))
1122 SCM elt
= SCM_CAR (rest
);
1123 SCM_VALIDATE_CHAR (SCM_ARGn
, elt
);
1124 if (SCM_CHAR (elt
) > 0xFF)
1126 rest
= SCM_CDR (rest
);
1128 scm_remember_upto_here_1 (elt
);
1131 /* Construct a string containing this list of chars. */
1139 result
= scm_i_make_string (len
, NULL
, 0);
1140 result
= scm_i_string_start_writing (result
);
1141 buf
= scm_i_string_writable_chars (result
);
1142 while (len
> 0 && scm_is_pair (rest
))
1144 SCM elt
= SCM_CAR (rest
);
1145 buf
[p
] = (unsigned char) SCM_CHAR (elt
);
1147 rest
= SCM_CDR (rest
);
1149 scm_remember_upto_here_1 (elt
);
1156 result
= scm_i_make_wide_string (len
, NULL
, 0);
1157 result
= scm_i_string_start_writing (result
);
1158 buf
= scm_i_string_writable_wide_chars (result
);
1159 while (len
> 0 && scm_is_pair (rest
))
1161 SCM elt
= SCM_CAR (rest
);
1162 buf
[p
] = SCM_CHAR (elt
);
1164 rest
= SCM_CDR (rest
);
1166 scm_remember_upto_here_1 (elt
);
1169 scm_i_string_stop_writing ();
1172 scm_misc_error (NULL
, "list changed while constructing string", SCM_EOL
);
1173 if (!scm_is_null (rest
))
1174 scm_wrong_type_arg_msg (NULL
, 0, chrs
, "proper list");
1180 SCM_DEFINE (scm_make_string
, "make-string", 1, 1, 0,
1182 "Return a newly allocated string of\n"
1183 "length @var{k}. If @var{chr} is given, then all elements of\n"
1184 "the string are initialized to @var{chr}, otherwise the contents\n"
1185 "of the string are all set to @code{#\nul}.")
1186 #define FUNC_NAME s_scm_make_string
1188 return scm_c_make_string (scm_to_size_t (k
), chr
);
1193 scm_c_make_string (size_t len
, SCM chr
)
1194 #define FUNC_NAME NULL
1197 char *contents
= NULL
;
1198 SCM res
= scm_i_make_string (len
, &contents
, 0);
1200 /* If no char is given, initialize string contents to NULL. */
1201 if (SCM_UNBNDP (chr
))
1202 memset (contents
, 0, len
);
1205 SCM_VALIDATE_CHAR (0, chr
);
1206 res
= scm_i_string_start_writing (res
);
1207 for (p
= 0; p
< len
; p
++)
1208 scm_i_string_set_x (res
, p
, SCM_CHAR (chr
));
1209 scm_i_string_stop_writing ();
1216 SCM_DEFINE (scm_string_length
, "string-length", 1, 0, 0,
1218 "Return the number of characters in @var{string}.")
1219 #define FUNC_NAME s_scm_string_length
1221 SCM_VALIDATE_STRING (1, string
);
1222 return scm_from_size_t (STRING_LENGTH (string
));
1226 SCM_DEFINE (scm_string_bytes_per_char
, "string-bytes-per-char", 1, 0, 0,
1228 "Return the bytes used to represent a character in @var{string}."
1229 "This will return 1 or 4.")
1230 #define FUNC_NAME s_scm_string_bytes_per_char
1232 SCM_VALIDATE_STRING (1, string
);
1233 if (!scm_i_is_narrow_string (string
))
1234 return scm_from_int (4);
1236 return scm_from_int (1);
1241 scm_c_string_length (SCM string
)
1243 if (!IS_STRING (string
))
1244 scm_wrong_type_arg_msg (NULL
, 0, string
, "string");
1245 return STRING_LENGTH (string
);
1248 SCM_DEFINE (scm_string_ref
, "string-ref", 2, 0, 0,
1250 "Return character @var{k} of @var{str} using zero-origin\n"
1251 "indexing. @var{k} must be a valid index of @var{str}.")
1252 #define FUNC_NAME s_scm_string_ref
1257 SCM_VALIDATE_STRING (1, str
);
1259 len
= scm_i_string_length (str
);
1260 if (SCM_LIKELY (len
> 0))
1261 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1263 scm_out_of_range (NULL
, k
);
1265 if (scm_i_is_narrow_string (str
))
1266 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[idx
]);
1268 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[idx
]);
1273 scm_c_string_ref (SCM str
, size_t p
)
1275 if (p
>= scm_i_string_length (str
))
1276 scm_out_of_range (NULL
, scm_from_size_t (p
));
1277 if (scm_i_is_narrow_string (str
))
1278 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[p
]);
1280 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[p
]);
1284 SCM_DEFINE (scm_string_set_x
, "string-set!", 3, 0, 0,
1285 (SCM str
, SCM k
, SCM chr
),
1286 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1287 "an unspecified value. @var{k} must be a valid index of\n"
1289 #define FUNC_NAME s_scm_string_set_x
1294 SCM_VALIDATE_STRING (1, str
);
1296 len
= scm_i_string_length (str
);
1297 if (SCM_LIKELY (len
> 0))
1298 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1300 scm_out_of_range (NULL
, k
);
1302 SCM_VALIDATE_CHAR (3, chr
);
1303 str
= scm_i_string_start_writing (str
);
1304 scm_i_string_set_x (str
, idx
, SCM_CHAR (chr
));
1305 scm_i_string_stop_writing ();
1307 return SCM_UNSPECIFIED
;
1312 scm_c_string_set_x (SCM str
, size_t p
, SCM chr
)
1314 if (p
>= scm_i_string_length (str
))
1315 scm_out_of_range (NULL
, scm_from_size_t (p
));
1316 str
= scm_i_string_start_writing (str
);
1317 scm_i_string_set_x (str
, p
, SCM_CHAR (chr
));
1318 scm_i_string_stop_writing ();
1321 SCM_DEFINE (scm_substring
, "substring", 2, 1, 0,
1322 (SCM str
, SCM start
, SCM end
),
1323 "Return a newly allocated string formed from the characters\n"
1324 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1325 "ending with index @var{end} (exclusive).\n"
1326 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1327 "exact integers satisfying:\n\n"
1328 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1329 #define FUNC_NAME s_scm_substring
1331 size_t len
, from
, to
;
1333 SCM_VALIDATE_STRING (1, str
);
1334 len
= scm_i_string_length (str
);
1335 from
= scm_to_unsigned_integer (start
, 0, len
);
1336 if (SCM_UNBNDP (end
))
1339 to
= scm_to_unsigned_integer (end
, from
, len
);
1340 return scm_i_substring (str
, from
, to
);
1344 SCM_DEFINE (scm_substring_read_only
, "substring/read-only", 2, 1, 0,
1345 (SCM str
, SCM start
, SCM end
),
1346 "Return a newly allocated string formed from the characters\n"
1347 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1348 "ending with index @var{end} (exclusive).\n"
1349 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1350 "exact integers satisfying:\n"
1352 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1354 "The returned string is read-only.\n")
1355 #define FUNC_NAME s_scm_substring_read_only
1357 size_t len
, from
, to
;
1359 SCM_VALIDATE_STRING (1, str
);
1360 len
= scm_i_string_length (str
);
1361 from
= scm_to_unsigned_integer (start
, 0, len
);
1362 if (SCM_UNBNDP (end
))
1365 to
= scm_to_unsigned_integer (end
, from
, len
);
1366 return scm_i_substring_read_only (str
, from
, to
);
1370 SCM_DEFINE (scm_substring_copy
, "substring/copy", 2, 1, 0,
1371 (SCM str
, SCM start
, SCM end
),
1372 "Return a newly allocated string formed from the characters\n"
1373 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1374 "ending with index @var{end} (exclusive).\n"
1375 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1376 "exact integers satisfying:\n\n"
1377 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1378 #define FUNC_NAME s_scm_substring_copy
1380 /* For the Scheme version, START is mandatory, but for the C
1381 version, it is optional. See scm_string_copy in srfi-13.c for a
1387 SCM_VALIDATE_STRING (1, str
);
1388 scm_i_get_substring_spec (scm_i_string_length (str
),
1389 start
, &from
, end
, &to
);
1390 return scm_i_substring_copy (str
, from
, to
);
1394 SCM_DEFINE (scm_substring_shared
, "substring/shared", 2, 1, 0,
1395 (SCM str
, SCM start
, SCM end
),
1396 "Return string that indirectly refers to the characters\n"
1397 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1398 "ending with index @var{end} (exclusive).\n"
1399 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1400 "exact integers satisfying:\n\n"
1401 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1402 #define FUNC_NAME s_scm_substring_shared
1404 size_t len
, from
, to
;
1406 SCM_VALIDATE_STRING (1, str
);
1407 len
= scm_i_string_length (str
);
1408 from
= scm_to_unsigned_integer (start
, 0, len
);
1409 if (SCM_UNBNDP (end
))
1412 to
= scm_to_unsigned_integer (end
, from
, len
);
1413 return scm_i_substring_shared (str
, from
, to
);
1417 SCM_DEFINE (scm_string_append
, "string-append", 0, 0, 1,
1419 "Return a newly allocated string whose characters form the\n"
1420 "concatenation of the given strings, @var{args}.")
1421 #define FUNC_NAME s_scm_string_append
1435 SCM_VALIDATE_REST_ARGUMENT (args
);
1436 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1439 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1440 len
= scm_i_string_length (s
);
1441 if (((size_t) -1) - total
< len
)
1442 scm_num_overflow (s_scm_string_append
);
1444 if (!scm_i_is_narrow_string (s
))
1449 res
= scm_i_make_string (total
, &data
.narrow
, 0);
1451 res
= scm_i_make_wide_string (total
, &data
.wide
, 0);
1453 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1457 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1458 len
= scm_i_string_length (s
);
1460 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL
);
1463 memcpy (data
.narrow
, scm_i_string_chars (s
), len
);
1468 if (scm_i_is_narrow_string (s
))
1470 const char *src
= scm_i_string_chars (s
);
1471 for (i
= 0; i
< len
; i
++)
1472 data
.wide
[i
] = (unsigned char) src
[i
];
1475 u32_cpy ((scm_t_uint32
*) data
.wide
,
1476 (scm_t_uint32
*) scm_i_string_wide_chars (s
), len
);
1480 scm_remember_upto_here_1 (s
);
1483 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL
);
1490 /* Charset conversion error handling. */
1492 SCM_SYMBOL (scm_encoding_error_key
, "encoding-error");
1493 SCM_SYMBOL (scm_decoding_error_key
, "decoding-error");
1495 /* Raise an exception informing that character CHR could not be written
1496 to PORT in its current encoding. */
1498 scm_encoding_error (const char *subr
, int err
, const char *message
,
1501 scm_throw (scm_encoding_error_key
,
1502 scm_list_n (scm_from_latin1_string (subr
),
1503 scm_from_latin1_string (message
),
1509 /* Raise an exception informing of an encoding error on PORT. This
1510 means that a character could not be written in PORT's encoding. */
1512 scm_decoding_error (const char *subr
, int err
, const char *message
, SCM port
)
1514 scm_throw (scm_decoding_error_key
,
1515 scm_list_n (scm_from_latin1_string (subr
),
1516 scm_from_latin1_string (message
),
1523 /* String conversion to/from C. */
1526 decoding_error (const char *func_name
, int errno_save
,
1527 const char *str
, size_t len
)
1529 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1534 buf
= scm_gc_malloc_pointerless (len
, "bytevector");
1535 memcpy (buf
, str
, len
);
1536 bv
= scm_c_take_gc_bytevector (buf
, len
, SCM_BOOL_F
);
1538 scm_decoding_error (func_name
, errno_save
,
1539 "input locale conversion error", bv
);
1543 scm_from_stringn (const char *str
, size_t len
, const char *encoding
,
1544 scm_t_string_failed_conversion_handler handler
)
1551 /* The order of these checks is important. */
1552 if (!str
&& len
!= 0)
1553 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL
);
1554 if (len
== (size_t) -1)
1557 if (c_strcasecmp (encoding
, "ISO-8859-1") == 0 || len
== 0)
1558 return scm_from_latin1_stringn (str
, len
);
1559 else if (c_strcasecmp (encoding
, "UTF-8") == 0
1560 && handler
== SCM_FAILED_CONVERSION_ERROR
)
1561 return scm_from_utf8_stringn (str
, len
);
1564 u32
= (scm_t_wchar
*) u32_conv_from_encoding (encoding
,
1565 (enum iconv_ilseq_handler
)
1571 if (SCM_UNLIKELY (u32
== NULL
))
1572 decoding_error (__func__
, errno
, str
, len
);
1576 if (u32
[i
++] > 0xFF)
1585 res
= scm_i_make_string (u32len
, &dst
, 0);
1586 for (i
= 0; i
< u32len
; i
++)
1587 dst
[i
] = (unsigned char) u32
[i
];
1593 res
= scm_i_make_wide_string (u32len
, &wdst
, 0);
1594 u32_cpy ((scm_t_uint32
*) wdst
, (scm_t_uint32
*) u32
, u32len
);
1603 scm_from_locale_string (const char *str
)
1605 return scm_from_locale_stringn (str
, -1);
1609 scm_from_locale_stringn (const char *str
, size_t len
)
1611 return scm_from_stringn (str
, len
, locale_charset (),
1612 scm_i_default_port_conversion_handler ());
1616 scm_from_latin1_string (const char *str
)
1618 return scm_from_latin1_stringn (str
, -1);
1622 scm_from_latin1_stringn (const char *str
, size_t len
)
1627 if (len
== (size_t) -1)
1630 /* Make a narrow string and copy STR as is. */
1631 result
= scm_i_make_string (len
, &buf
, 0);
1632 memcpy (buf
, str
, len
);
1638 scm_from_utf8_string (const char *str
)
1640 return scm_from_utf8_stringn (str
, -1);
1644 scm_from_utf8_stringn (const char *str
, size_t len
)
1647 const scm_t_uint8
*ustr
= (const scm_t_uint8
*) str
;
1648 int ascii
= 1, narrow
= 1;
1651 if (len
== (size_t) -1)
1671 nbytes
= u8_mbtouc (&c
, ustr
+ i
, len
- i
);
1675 decoding_error (__func__
, errno
, str
, len
);
1688 res
= scm_i_make_string (char_len
, &dst
, 0);
1689 memcpy (dst
, str
, len
);
1697 res
= scm_i_make_string (char_len
, &dst
, 0);
1699 for (i
= 0, j
= 0; i
< len
; j
++)
1701 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1702 dst
[j
] = (signed char) c
;
1711 res
= scm_i_make_wide_string (char_len
, &dst
, 0);
1713 for (i
= 0, j
= 0; i
< len
; j
++)
1715 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1724 scm_from_utf32_string (const scm_t_wchar
*str
)
1726 return scm_from_utf32_stringn (str
, -1);
1730 scm_from_utf32_stringn (const scm_t_wchar
*str
, size_t len
)
1735 if (len
== (size_t) -1)
1736 len
= u32_strlen ((uint32_t *) str
);
1738 result
= scm_i_make_wide_string (len
, &buf
, 0);
1739 memcpy (buf
, str
, len
* sizeof (scm_t_wchar
));
1740 scm_i_try_narrow_string (result
);
1746 scm_from_port_string (const char *str
, SCM port
)
1748 return scm_from_port_stringn (str
, -1, port
);
1752 scm_from_port_stringn (const char *str
, size_t len
, SCM port
)
1754 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
1755 scm_t_port_internal
*pti
= SCM_PORT_GET_INTERNAL (port
);
1757 if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_LATIN1
)
1758 return scm_from_latin1_stringn (str
, len
);
1759 else if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_UTF8
1760 && (pt
->ilseq_handler
== SCM_FAILED_CONVERSION_ERROR
1761 || (u8_check ((uint8_t *) str
, len
) == NULL
)))
1762 return scm_from_utf8_stringn (str
, len
);
1764 return scm_from_stringn (str
, len
, pt
->encoding
, pt
->ilseq_handler
);
1767 /* Create a new scheme string from the C string STR. The memory of
1768 STR may be used directly as storage for the new string. */
1769 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1770 would be to register a finalizer to eventually free(3) STR, which isn't
1771 worth it. Should we just deprecate the `scm_take_' functions? */
1773 scm_take_locale_stringn (char *str
, size_t len
)
1777 res
= scm_from_locale_stringn (str
, len
);
1784 scm_take_locale_string (char *str
)
1786 return scm_take_locale_stringn (str
, -1);
1789 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1790 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1791 Set *LENP to the size of the resulting string.
1793 FIXME: This is a hack we should get rid of. See
1794 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1797 unistring_escapes_to_guile_escapes (char *buf
, size_t *lenp
)
1799 char *before
, *after
;
1808 if ((i
<= *lenp
- 6)
1809 && before
[i
] == '\\'
1810 && before
[i
+ 1] == 'u'
1811 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1813 /* Convert \u00NN to \xNN */
1816 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1817 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1821 else if ((i
<= *lenp
- 10)
1822 && before
[i
] == '\\'
1823 && before
[i
+ 1] == 'U'
1824 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1826 /* Convert \U00NNNNNN to \UNNNNNN */
1829 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1830 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1831 after
[j
+ 4] = tolower ((int) before
[i
+ 6]);
1832 after
[j
+ 5] = tolower ((int) before
[i
+ 7]);
1833 after
[j
+ 6] = tolower ((int) before
[i
+ 8]);
1834 after
[j
+ 7] = tolower ((int) before
[i
+ 9]);
1840 after
[j
] = before
[i
];
1848 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1849 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1850 of the resulting string. BUF must be large enough to handle the
1851 worst case when `\uXXXX' escapes (6 characters) are replaced by
1852 `\xXXXX;' (7 characters). */
1854 unistring_escapes_to_r6rs_escapes (char *buf
, size_t *lenp
)
1856 char *before
, *after
;
1858 /* The worst case is if the input string contains all 4-digit hex escapes.
1859 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1860 size_t max_out_len
= (*lenp
* 7) / 6 + 1;
1861 size_t nzeros
, ndigits
;
1864 after
= alloca (max_out_len
);
1869 if (((i
<= *lenp
- 6) && before
[i
] == '\\' && before
[i
+ 1] == 'u')
1870 || ((i
<= *lenp
- 10) && before
[i
] == '\\' && before
[i
+ 1] == 'U'))
1872 if (before
[i
+ 1] == 'u')
1874 else if (before
[i
+ 1] == 'U')
1879 /* Add the R6RS hex escape initial sequence. */
1883 /* Move string positions to the start of the hex numbers. */
1887 /* Find the number of initial zeros in this hex number. */
1889 while (before
[i
+ nzeros
] == '0' && nzeros
< ndigits
)
1892 /* Copy the number, skipping initial zeros, and then move the string
1894 if (nzeros
== ndigits
)
1903 for (pos
= 0; pos
< ndigits
- nzeros
; pos
++)
1904 after
[j
+ pos
] = tolower ((int) before
[i
+ nzeros
+ pos
]);
1906 j
+= (ndigits
- nzeros
);
1909 /* Add terminating semicolon. */
1915 after
[j
] = before
[i
];
1921 memcpy (before
, after
, j
);
1925 scm_to_locale_string (SCM str
)
1927 return scm_to_locale_stringn (str
, NULL
);
1931 scm_to_locale_stringn (SCM str
, size_t *lenp
)
1933 return scm_to_stringn (str
, lenp
,
1935 scm_i_default_port_conversion_handler ());
1939 scm_to_latin1_string (SCM str
)
1941 return scm_to_latin1_stringn (str
, NULL
);
1945 scm_to_latin1_stringn (SCM str
, size_t *lenp
)
1946 #define FUNC_NAME "scm_to_latin1_stringn"
1950 SCM_VALIDATE_STRING (1, str
);
1952 if (scm_i_is_narrow_string (str
))
1954 size_t len
= scm_i_string_length (str
);
1959 result
= scm_strndup (scm_i_string_data (str
), len
);
1962 result
= scm_to_stringn (str
, lenp
, NULL
,
1963 SCM_FAILED_CONVERSION_ERROR
);
1970 scm_to_utf8_string (SCM str
)
1972 return scm_to_utf8_stringn (str
, NULL
);
1976 latin1_u8_strlen (const scm_t_uint8
*str
, size_t len
)
1979 for (i
= 0, ret
= 0; i
< len
; i
++)
1980 ret
+= (str
[i
] < 128) ? 1 : 2;
1985 latin1_to_u8 (const scm_t_uint8
*str
, size_t latin_len
,
1986 scm_t_uint8
*u8_result
, size_t *u8_lenp
)
1989 size_t u8_len
= latin1_u8_strlen (str
, latin_len
);
1991 if (!(u8_result
&& u8_lenp
&& *u8_lenp
> u8_len
))
1992 u8_result
= scm_malloc (u8_len
+ 1);
1996 for (i
= 0, n
= 0; i
< latin_len
; i
++)
1997 n
+= u8_uctomb (u8_result
+ n
, str
[i
], u8_len
- n
);
2007 (Note that this includes code points that are not allowed by Unicode,
2008 but since this function has no way to report an error, and its
2009 purpose is to determine the size of destination buffers for
2010 libunicode conversion functions, we err on the safe side and handle
2011 everything that libunicode might conceivably handle, now or in the
2014 Char. number range | UTF-8 octet sequence
2015 (hexadecimal) | (binary)
2016 --------------------+------------------------------------------------------
2017 0000 0000-0000 007F | 0xxxxxxx
2018 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2019 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
2020 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2021 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2022 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2026 u32_u8_length_in_bytes (const scm_t_uint32
*str
, size_t len
)
2030 for (i
= 0, ret
= 0; i
< len
; i
++)
2032 scm_t_uint32 c
= str
[i
];
2036 else if (c
<= 0x7ff)
2038 else if (c
<= 0xffff)
2040 else if (c
<= 0x1fffff)
2042 else if (c
<= 0x3ffffff)
2052 scm_to_utf8_stringn (SCM str
, size_t *lenp
)
2053 #define FUNC_NAME "scm_to_utf8_stringn"
2055 SCM_VALIDATE_STRING (1, str
);
2057 if (scm_i_is_narrow_string (str
))
2058 return (char *) latin1_to_u8 ((scm_t_uint8
*) scm_i_string_chars (str
),
2059 scm_i_string_length (str
),
2063 scm_t_uint32
*chars
= (scm_t_uint32
*) scm_i_string_wide_chars (str
);
2064 scm_t_uint8
*buf
, *ret
;
2065 size_t num_chars
= scm_i_string_length (str
);
2066 size_t num_bytes_predicted
, num_bytes_actual
;
2068 num_bytes_predicted
= u32_u8_length_in_bytes (chars
, num_chars
);
2072 *lenp
= num_bytes_predicted
;
2073 buf
= scm_malloc (num_bytes_predicted
);
2077 buf
= scm_malloc (num_bytes_predicted
+ 1);
2078 buf
[num_bytes_predicted
] = 0;
2081 num_bytes_actual
= num_bytes_predicted
;
2082 ret
= u32_to_u8 (chars
, num_chars
, buf
, &num_bytes_actual
);
2084 if (SCM_LIKELY (ret
== buf
&& num_bytes_actual
== num_bytes_predicted
))
2085 return (char *) ret
;
2087 /* An error: a bad codepoint. */
2089 int saved_errno
= errno
;
2095 scm_decoding_error ("scm_to_utf8_stringn", errno
,
2096 "invalid codepoint in string", str
);
2106 scm_to_utf32_string (SCM str
)
2108 return scm_to_utf32_stringn (str
, NULL
);
2112 scm_to_utf32_stringn (SCM str
, size_t *lenp
)
2113 #define FUNC_NAME "scm_to_utf32_stringn"
2115 scm_t_wchar
*result
;
2117 SCM_VALIDATE_STRING (1, str
);
2119 if (scm_i_is_narrow_string (str
))
2121 scm_t_uint8
*codepoints
;
2124 codepoints
= (scm_t_uint8
*) scm_i_string_chars (str
);
2125 len
= scm_i_string_length (str
);
2129 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
2130 for (i
= 0; i
< len
; i
++)
2131 result
[i
] = codepoints
[i
];
2138 len
= scm_i_string_length (str
);
2142 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
2143 memcpy (result
, scm_i_string_wide_chars (str
),
2144 len
* sizeof (scm_t_wchar
));
2153 scm_to_port_string (SCM str
, SCM port
)
2155 return scm_to_port_stringn (str
, NULL
, port
);
2159 scm_to_port_stringn (SCM str
, size_t *lenp
, SCM port
)
2161 scm_t_port
*pt
= SCM_PTAB_ENTRY (port
);
2162 scm_t_port_internal
*pti
= SCM_PORT_GET_INTERNAL (port
);
2164 if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_LATIN1
2165 && pt
->ilseq_handler
== SCM_FAILED_CONVERSION_ERROR
)
2166 return scm_to_latin1_stringn (str
, lenp
);
2167 else if (pti
->encoding_mode
== SCM_PORT_ENCODING_MODE_UTF8
)
2168 return scm_to_utf8_stringn (str
, lenp
);
2170 return scm_to_stringn (str
, lenp
, pt
->encoding
, pt
->ilseq_handler
);
2173 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2174 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2175 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2176 defined by HANDLER. */
2178 scm_to_stringn (SCM str
, size_t *lenp
, const char *encoding
,
2179 scm_t_string_failed_conversion_handler handler
)
2182 size_t ilen
, len
, i
;
2186 if (!scm_is_string (str
))
2187 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
2188 ilen
= scm_i_string_length (str
);
2192 buf
= scm_malloc (1);
2200 for (i
= 0; i
< ilen
; i
++)
2201 if (scm_i_string_ref (str
, i
) == '\0')
2202 scm_misc_error (NULL
,
2203 "string contains #\\nul character: ~S",
2206 if (scm_i_is_narrow_string (str
)
2207 && c_strcasecmp (encoding
, "ISO-8859-1") == 0)
2209 /* If using native Latin-1 encoding, just copy the string
2213 buf
= scm_malloc (ilen
);
2214 memcpy (buf
, scm_i_string_chars (str
), ilen
);
2220 buf
= scm_malloc (ilen
+ 1);
2221 memcpy (buf
, scm_i_string_chars (str
), ilen
);
2233 if (scm_i_is_narrow_string (str
))
2235 ret
= mem_iconveh (scm_i_string_chars (str
), ilen
,
2237 (enum iconv_ilseq_handler
) handler
, NULL
,
2241 scm_encoding_error (__func__
, errno
,
2242 "cannot convert narrow string to output locale",
2244 /* FIXME: Faulty character unknown. */
2249 buf
= u32_conv_to_encoding (enc
,
2250 (enum iconv_ilseq_handler
) handler
,
2251 (scm_t_uint32
*) scm_i_string_wide_chars (str
),
2256 scm_encoding_error (__func__
, errno
,
2257 "cannot convert wide string to output locale",
2259 /* FIXME: Faulty character unknown. */
2262 if (handler
== SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE
)
2264 if (SCM_R6RS_ESCAPES_P
)
2266 /* The worst case is if the input string contains all 4-digit
2267 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2268 (seven characters). Make BUF large enough to hold
2270 buf
= scm_realloc (buf
, (len
* 7) / 6 + 1);
2271 unistring_escapes_to_r6rs_escapes (buf
, &len
);
2274 unistring_escapes_to_guile_escapes (buf
, &len
);
2276 buf
= scm_realloc (buf
, len
);
2282 buf
= scm_realloc (buf
, len
+ 1);
2286 scm_remember_upto_here_1 (str
);
2291 scm_to_locale_stringbuf (SCM str
, char *buf
, size_t max_len
)
2294 char *result
= NULL
;
2295 if (!scm_is_string (str
))
2296 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
2297 result
= scm_to_locale_stringn (str
, &len
);
2299 memcpy (buf
, result
, (len
> max_len
) ? max_len
: len
);
2302 scm_remember_upto_here_1 (str
);
2307 /* Unicode string normalization. */
2309 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2310 libguile/i18n.c. It would be useful to have this factored out into a more
2311 convenient location, but its use of alloca makes that tricky to do. */
2314 normalize_str (SCM string
, uninorm_t form
)
2317 scm_t_uint32
*w_str
;
2319 size_t rlen
, len
= scm_i_string_length (string
);
2321 if (scm_i_is_narrow_string (string
))
2324 const char *buf
= scm_i_string_chars (string
);
2326 w_str
= alloca (sizeof (scm_t_wchar
) * (len
+ 1));
2328 for (i
= 0; i
< len
; i
++)
2329 w_str
[i
] = (unsigned char) buf
[i
];
2333 w_str
= (scm_t_uint32
*) scm_i_string_wide_chars (string
);
2335 w_str
= u32_normalize (form
, w_str
, len
, NULL
, &rlen
);
2337 ret
= scm_i_make_wide_string (rlen
, &cbuf
, 0);
2338 u32_cpy ((scm_t_uint32
*) cbuf
, w_str
, rlen
);
2341 scm_i_try_narrow_string (ret
);
2346 SCM_DEFINE (scm_string_normalize_nfc
, "string-normalize-nfc", 1, 0, 0,
2348 "Returns the NFC normalized form of @var{string}.")
2349 #define FUNC_NAME s_scm_string_normalize_nfc
2351 SCM_VALIDATE_STRING (1, string
);
2352 return normalize_str (string
, UNINORM_NFC
);
2356 SCM_DEFINE (scm_string_normalize_nfd
, "string-normalize-nfd", 1, 0, 0,
2358 "Returns the NFD normalized form of @var{string}.")
2359 #define FUNC_NAME s_scm_string_normalize_nfd
2361 SCM_VALIDATE_STRING (1, string
);
2362 return normalize_str (string
, UNINORM_NFD
);
2366 SCM_DEFINE (scm_string_normalize_nfkc
, "string-normalize-nfkc", 1, 0, 0,
2368 "Returns the NFKC normalized form of @var{string}.")
2369 #define FUNC_NAME s_scm_string_normalize_nfkc
2371 SCM_VALIDATE_STRING (1, string
);
2372 return normalize_str (string
, UNINORM_NFKC
);
2376 SCM_DEFINE (scm_string_normalize_nfkd
, "string-normalize-nfkd", 1, 0, 0,
2378 "Returns the NFKD normalized form of @var{string}.")
2379 #define FUNC_NAME s_scm_string_normalize_nfkd
2381 SCM_VALIDATE_STRING (1, string
);
2382 return normalize_str (string
, UNINORM_NFKD
);
2386 /* converts C scm_array of strings to SCM scm_list of strings.
2387 If argc < 0, a null terminated scm_array is assumed.
2388 The current locale encoding is assumed */
2390 scm_makfromstrs (int argc
, char **argv
)
2395 for (i
= 0; argv
[i
]; i
++);
2397 lst
= scm_cons (scm_from_locale_string (argv
[i
]), lst
);
2401 /* Return a newly allocated array of char pointers to each of the strings
2402 in args, with a terminating NULL pointer. The strings are encoded using
2403 the current locale. */
2406 scm_i_allocate_string_pointers (SCM list
)
2407 #define FUNC_NAME "scm_i_allocate_string_pointers"
2410 int list_len
= scm_ilength (list
);
2414 scm_wrong_type_arg_msg (NULL
, 0, list
, "proper list");
2416 result
= scm_gc_malloc ((list_len
+ 1) * sizeof (char *),
2418 result
[list_len
] = NULL
;
2420 /* The list might have been modified in another thread, so
2421 we check LIST before each access.
2423 for (i
= 0; i
< list_len
&& scm_is_pair (list
); i
++)
2425 SCM str
= SCM_CAR (list
);
2426 size_t len
; /* String length in bytes */
2427 char *c_str
= scm_to_locale_stringn (str
, &len
);
2429 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2430 scm_malloc to allocate the returned string, which must be
2431 explicitly deallocated. This forces us to copy the string a
2432 second time into a new buffer. Ideally there would be variants
2433 of scm_to_*_stringn that can return garbage-collected buffers. */
2435 result
[i
] = scm_gc_malloc_pointerless (len
+ 1, "string");
2436 memcpy (result
[i
], c_str
, len
);
2437 result
[i
][len
] = '\0';
2440 list
= SCM_CDR (list
);
2448 scm_i_get_substring_spec (size_t len
,
2449 SCM start
, size_t *cstart
,
2450 SCM end
, size_t *cend
)
2452 if (SCM_UNBNDP (start
))
2455 *cstart
= scm_to_unsigned_integer (start
, 0, len
);
2457 if (SCM_UNBNDP (end
))
2460 *cend
= scm_to_unsigned_integer (end
, *cstart
, len
);
2464 string_handle_ref (scm_t_array_handle
*h
, size_t index
)
2466 return scm_c_string_ref (h
->array
, index
);
2470 string_handle_set (scm_t_array_handle
*h
, size_t index
, SCM val
)
2472 scm_c_string_set_x (h
->array
, index
, val
);
2476 string_get_handle (SCM v
, scm_t_array_handle
*h
)
2482 h
->dim0
.ubnd
= scm_c_string_length (v
) - 1;
2484 h
->element_type
= SCM_ARRAY_ELEMENT_TYPE_CHAR
;
2485 h
->elements
= h
->writable_elements
= NULL
;
2488 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string
, 0x7f,
2489 string_handle_ref
, string_handle_set
,
2491 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR
, scm_make_string
)
2496 scm_nullstr
= scm_i_make_string (0, NULL
, 0);
2498 #include "libguile/strings.x"