1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "striconveh.h"
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
94 #define SET_STRINGBUF_SHARED(buf) \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist
[1001];
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
111 make_stringbuf (size_t len
)
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
129 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ len
+ 1,
132 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
);
133 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
135 STRINGBUF_CHARS (buf
)[len
] = 0;
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
143 make_wide_stringbuf (size_t len
)
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
155 raw_len
= (len
+ 1) * sizeof (scm_t_wchar
);
156 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ raw_len
,
159 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
| STRINGBUF_F_WIDE
);
160 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
162 STRINGBUF_WIDE_CHARS (buf
)[len
] = 0;
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
170 wide_stringbuf (SCM buf
)
174 if (STRINGBUF_WIDE (buf
))
181 len
= STRINGBUF_LENGTH (buf
);
183 new_buf
= make_wide_stringbuf (len
);
185 mem
= STRINGBUF_WIDE_CHARS (new_buf
);
186 for (i
= 0; i
< len
; i
++)
187 mem
[i
] = (scm_t_wchar
) STRINGBUF_CHARS (buf
)[i
];
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
197 narrow_stringbuf (SCM buf
)
201 if (!STRINGBUF_WIDE (buf
))
209 len
= STRINGBUF_LENGTH (buf
);
210 wmem
= STRINGBUF_WIDE_CHARS (buf
);
212 for (i
= 0; i
< len
; i
++)
214 /* BUF cannot be narrowed. */
217 new_buf
= make_stringbuf (len
);
219 mem
= STRINGBUF_CHARS (new_buf
);
220 for (i
= 0; i
< len
; i
++)
221 mem
[i
] = (unsigned char) wmem
[i
];
228 scm_i_pthread_mutex_t stringbuf_write_mutex
= SCM_I_PTHREAD_MUTEX_INITIALIZER
;
229 SCM_PTHREAD_ATFORK_LOCK_STATIC_MUTEX (stringbuf_write_mutex
);
232 /* Copy-on-write strings.
235 #define STRING_TAG scm_tc7_string
237 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
238 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
239 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
241 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
242 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
244 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
246 /* Read-only strings.
249 #define RO_STRING_TAG scm_tc7_ro_string
250 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
252 /* Mutation-sharing substrings
255 #define SH_STRING_TAG (scm_tc7_string + 0x100)
257 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
258 /* START and LENGTH as for STRINGs. */
260 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
264 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
265 characters. CHARSP, if not NULL, will be set to location of the
266 char array. If READ_ONLY_P, the returned string is read-only;
267 otherwise it is writable. */
269 scm_i_make_string (size_t len
, char **charsp
, int read_only_p
)
271 static SCM null_stringbuf
= SCM_BOOL_F
;
277 if (SCM_UNLIKELY (scm_is_false (null_stringbuf
)))
279 null_stringbuf
= make_stringbuf (0);
280 SET_STRINGBUF_SHARED (null_stringbuf
);
282 buf
= null_stringbuf
;
285 buf
= make_stringbuf (len
);
288 *charsp
= (char *) STRINGBUF_CHARS (buf
);
289 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
291 (scm_t_bits
) 0, (scm_t_bits
) len
);
295 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
296 characters. CHARSP, if not NULL, will be set to location of the
297 character array. If READ_ONLY_P, the returned string is read-only;
298 otherwise it is writable. */
300 scm_i_make_wide_string (size_t len
, scm_t_wchar
**charsp
, int read_only_p
)
302 SCM buf
= make_wide_stringbuf (len
);
305 *charsp
= STRINGBUF_WIDE_CHARS (buf
);
306 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
308 (scm_t_bits
) 0, (scm_t_bits
) len
);
313 validate_substring_args (SCM str
, size_t start
, size_t end
)
315 if (!IS_STRING (str
))
316 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
317 if (start
> STRING_LENGTH (str
))
318 scm_out_of_range (NULL
, scm_from_size_t (start
));
319 if (end
> STRING_LENGTH (str
) || end
< start
)
320 scm_out_of_range (NULL
, scm_from_size_t (end
));
324 get_str_buf_start (SCM
*str
, SCM
*buf
, size_t *start
)
326 *start
= STRING_START (*str
);
327 if (IS_SH_STRING (*str
))
329 *str
= SH_STRING_STRING (*str
);
330 *start
+= STRING_START (*str
);
332 *buf
= STRING_STRINGBUF (*str
);
336 scm_i_substring (SCM str
, size_t start
, size_t end
)
339 return scm_i_make_string (0, NULL
, 0);
344 get_str_buf_start (&str
, &buf
, &str_start
);
345 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
346 SET_STRINGBUF_SHARED (buf
);
347 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
348 return scm_double_cell (STRING_TAG
, SCM_UNPACK(buf
),
349 (scm_t_bits
)str_start
+ start
,
350 (scm_t_bits
) end
- start
);
355 scm_i_substring_read_only (SCM str
, size_t start
, size_t end
)
358 return scm_i_make_string (0, NULL
, 1);
363 get_str_buf_start (&str
, &buf
, &str_start
);
364 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
365 SET_STRINGBUF_SHARED (buf
);
366 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
367 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK(buf
),
368 (scm_t_bits
)str_start
+ start
,
369 (scm_t_bits
) end
- start
);
374 scm_i_substring_copy (SCM str
, size_t start
, size_t end
)
377 return scm_i_make_string (0, NULL
, 0);
380 size_t len
= end
- start
;
381 SCM buf
, my_buf
, substr
;
384 get_str_buf_start (&str
, &buf
, &str_start
);
385 if (scm_i_is_narrow_string (str
))
387 my_buf
= make_stringbuf (len
);
388 memcpy (STRINGBUF_CHARS (my_buf
),
389 STRINGBUF_CHARS (buf
) + str_start
+ start
, len
);
393 my_buf
= make_wide_stringbuf (len
);
394 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (my_buf
),
395 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
) + str_start
399 scm_remember_upto_here_1 (buf
);
400 substr
= scm_double_cell (STRING_TAG
, SCM_UNPACK (my_buf
),
401 (scm_t_bits
) 0, (scm_t_bits
) len
);
403 scm_i_try_narrow_string (substr
);
409 scm_i_substring_shared (SCM str
, size_t start
, size_t end
)
411 if (start
== 0 && end
== STRING_LENGTH (str
))
413 else if (start
== end
)
414 return scm_i_make_string (0, NULL
, 0);
417 size_t len
= end
- start
;
418 if (IS_SH_STRING (str
))
420 start
+= STRING_START (str
);
421 str
= SH_STRING_STRING (str
);
423 return scm_double_cell (SH_STRING_TAG
, SCM_UNPACK(str
),
424 (scm_t_bits
)start
, (scm_t_bits
) len
);
429 scm_c_substring (SCM str
, size_t start
, size_t end
)
431 validate_substring_args (str
, start
, end
);
432 return scm_i_substring (str
, start
, end
);
436 scm_c_substring_read_only (SCM str
, size_t start
, size_t end
)
438 validate_substring_args (str
, start
, end
);
439 return scm_i_substring_read_only (str
, start
, end
);
443 scm_c_substring_copy (SCM str
, size_t start
, size_t end
)
445 validate_substring_args (str
, start
, end
);
446 return scm_i_substring_copy (str
, start
, end
);
450 scm_c_substring_shared (SCM str
, size_t start
, size_t end
)
452 validate_substring_args (str
, start
, end
);
453 return scm_i_substring_shared (str
, start
, end
);
457 /* Internal accessors
460 /* Returns the number of characters in STR. This may be different
461 than the memory size of the string storage. */
463 scm_i_string_length (SCM str
)
465 return STRING_LENGTH (str
);
468 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
469 encoding. False if it is 'wide', having a 32-bit UCS-4
472 scm_i_is_narrow_string (SCM str
)
474 if (IS_SH_STRING (str
))
475 str
= SH_STRING_STRING (str
);
477 return !STRINGBUF_WIDE (STRING_STRINGBUF (str
));
480 /* Try to coerce a string to be narrow. It if is narrow already, do
481 nothing. If it is wide, shrink it to narrow if none of its
482 characters are above 0xFF. Return true if the string is narrow or
483 was made to be narrow. */
485 scm_i_try_narrow_string (SCM str
)
487 if (IS_SH_STRING (str
))
488 str
= SH_STRING_STRING (str
);
490 SET_STRING_STRINGBUF (str
, narrow_stringbuf (STRING_STRINGBUF (str
)));
492 return scm_i_is_narrow_string (str
);
495 /* Return a pointer to the raw data of the string, which can be either Latin-1
496 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
498 scm_i_string_data (SCM str
)
504 get_str_buf_start (&str
, &buf
, &start
);
506 data
= STRINGBUF_CONTENTS (buf
);
507 data
+= start
* (scm_i_is_narrow_string (str
) ? 1 : 4);
512 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
515 scm_i_string_chars (SCM str
)
519 get_str_buf_start (&str
, &buf
, &start
);
520 if (scm_i_is_narrow_string (str
))
521 return (const char *) STRINGBUF_CHARS (buf
) + start
;
523 scm_misc_error (NULL
, "Invalid read access of chars of wide string: ~s",
528 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
531 scm_i_string_wide_chars (SCM str
)
536 get_str_buf_start (&str
, &buf
, &start
);
537 if (!scm_i_is_narrow_string (str
))
538 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
) + start
;
540 scm_misc_error (NULL
, "Invalid read access of chars of narrow string: ~s",
544 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
545 a new string buffer, so that it can be modified without modifying
546 other strings. Also, lock the string mutex. Later, one must call
547 scm_i_string_stop_writing to unlock the mutex. */
549 scm_i_string_start_writing (SCM orig_str
)
551 SCM buf
, str
= orig_str
;
554 get_str_buf_start (&str
, &buf
, &start
);
555 if (IS_RO_STRING (str
))
556 scm_misc_error (NULL
, "string is read-only: ~s", scm_list_1 (orig_str
));
558 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
559 if (STRINGBUF_SHARED (buf
))
561 /* Clone the stringbuf. */
562 size_t len
= STRING_LENGTH (str
);
565 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
567 if (scm_i_is_narrow_string (str
))
569 new_buf
= make_stringbuf (len
);
570 memcpy (STRINGBUF_CHARS (new_buf
),
571 STRINGBUF_CHARS (buf
) + STRING_START (str
), len
);
576 new_buf
= make_wide_stringbuf (len
);
577 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
578 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
)
579 + STRING_START (str
)), len
);
582 SET_STRING_STRINGBUF (str
, new_buf
);
583 start
-= STRING_START (str
);
585 /* FIXME: The following operations are not atomic, so other threads
586 looking at STR may see an inconsistent state. Nevertheless it can't
587 hurt much since (i) accessing STR while it is being mutated can't
588 yield a crash, and (ii) concurrent accesses to STR should be
589 protected by a mutex at the application level. The latter may not
590 apply when STR != ORIG_STR, though. */
591 SET_STRING_START (str
, 0);
592 SET_STRING_STRINGBUF (str
, new_buf
);
596 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
601 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
603 scm_i_string_writable_chars (SCM str
)
608 get_str_buf_start (&str
, &buf
, &start
);
609 if (scm_i_is_narrow_string (str
))
610 return (char *) STRINGBUF_CHARS (buf
) + start
;
612 scm_misc_error (NULL
, "Invalid write access of chars of wide string: ~s",
617 /* Return a pointer to the UCS-4 codepoints of a string. */
619 scm_i_string_writable_wide_chars (SCM str
)
624 get_str_buf_start (&str
, &buf
, &start
);
625 if (!scm_i_is_narrow_string (str
))
626 return STRINGBUF_WIDE_CHARS (buf
) + start
;
628 scm_misc_error (NULL
, "Invalid write access of chars of narrow string: ~s",
632 /* Unlock the string mutex that was locked when
633 scm_i_string_start_writing was called. */
635 scm_i_string_stop_writing (void)
637 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
640 /* Return the Xth character of STR as a UCS-4 codepoint. */
642 scm_i_string_ref (SCM str
, size_t x
)
644 if (scm_i_is_narrow_string (str
))
645 return (scm_t_wchar
) (unsigned char) (scm_i_string_chars (str
)[x
]);
647 return scm_i_string_wide_chars (str
)[x
];
650 /* Returns index+1 of the first char in STR that matches C, or
651 0 if the char is not found. */
653 scm_i_string_contains_char (SCM str
, char ch
)
656 size_t len
= scm_i_string_length (str
);
659 if (scm_i_is_narrow_string (str
))
663 if (scm_i_string_chars (str
)[i
] == ch
)
672 if (scm_i_string_wide_chars (str
)[i
]
673 == (unsigned char) ch
)
682 scm_i_string_strcmp (SCM sstr
, size_t start_x
, const char *cstr
)
684 if (scm_i_is_narrow_string (sstr
))
686 const char *a
= scm_i_string_chars (sstr
) + start_x
;
687 const char *b
= cstr
;
688 return strncmp (a
, b
, strlen(b
));
693 const scm_t_wchar
*a
= scm_i_string_wide_chars (sstr
) + start_x
;
694 const char *b
= cstr
;
695 for (i
= 0; i
< strlen (b
); i
++)
697 if (a
[i
] != (unsigned char) b
[i
])
704 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
706 scm_i_string_set_x (SCM str
, size_t p
, scm_t_wchar chr
)
708 if (IS_SH_STRING (str
))
710 p
+= STRING_START (str
);
711 str
= SH_STRING_STRING (str
);
714 if (chr
> 0xFF && scm_i_is_narrow_string (str
))
715 SET_STRING_STRINGBUF (str
, wide_stringbuf (STRING_STRINGBUF (str
)));
717 if (scm_i_is_narrow_string (str
))
719 char *dst
= scm_i_string_writable_chars (str
);
724 scm_t_wchar
*dst
= scm_i_string_writable_wide_chars (str
);
732 Basic symbol creation and accessing is done here, the rest is in
733 symbols.[hc]. This has been done to keep stringbufs and the
734 internals of strings and string-like objects confined to this file.
737 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
740 scm_i_make_symbol (SCM name
, scm_t_bits flags
,
741 unsigned long hash
, SCM props
)
744 size_t start
= STRING_START (name
);
745 size_t length
= STRING_LENGTH (name
);
747 if (IS_SH_STRING (name
))
749 name
= SH_STRING_STRING (name
);
750 start
+= STRING_START (name
);
752 buf
= SYMBOL_STRINGBUF (name
);
754 if (start
== 0 && length
== STRINGBUF_LENGTH (buf
))
757 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
758 SET_STRINGBUF_SHARED (buf
);
759 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
764 if (scm_i_is_narrow_string (name
))
766 SCM new_buf
= make_stringbuf (length
);
767 memcpy (STRINGBUF_CHARS (new_buf
),
768 STRINGBUF_CHARS (buf
) + start
, length
);
773 SCM new_buf
= make_wide_stringbuf (length
);
774 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
775 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
) + start
,
780 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
781 (scm_t_bits
) hash
, SCM_UNPACK (props
));
785 scm_i_c_make_symbol (const char *name
, size_t len
,
786 scm_t_bits flags
, unsigned long hash
, SCM props
)
788 SCM buf
= make_stringbuf (len
);
789 memcpy (STRINGBUF_CHARS (buf
), name
, len
);
791 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
792 (scm_t_bits
) hash
, SCM_UNPACK (props
));
795 /* Returns the number of characters in SYM. This may be different
796 from the memory size of SYM. */
798 scm_i_symbol_length (SCM sym
)
800 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
804 scm_c_symbol_length (SCM sym
)
805 #define FUNC_NAME "scm_c_symbol_length"
807 SCM_VALIDATE_SYMBOL (1, sym
);
809 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
813 /* True if the name of SYM is stored as a Latin-1 encoded string.
814 False if it is stored as a 32-bit UCS-4-encoded string. */
816 scm_i_is_narrow_symbol (SCM sym
)
820 buf
= SYMBOL_STRINGBUF (sym
);
821 return !STRINGBUF_WIDE (buf
);
824 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
825 contains the name of SYM. */
827 scm_i_symbol_chars (SCM sym
)
831 buf
= SYMBOL_STRINGBUF (sym
);
832 if (!STRINGBUF_WIDE (buf
))
833 return (const char *) STRINGBUF_CHARS (buf
);
835 scm_misc_error (NULL
, "Invalid access of chars of a wide symbol ~S",
839 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
842 scm_i_symbol_wide_chars (SCM sym
)
846 buf
= SYMBOL_STRINGBUF (sym
);
847 if (STRINGBUF_WIDE (buf
))
848 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
);
850 scm_misc_error (NULL
, "Invalid access of chars of a narrow symbol ~S",
855 scm_i_symbol_substring (SCM sym
, size_t start
, size_t end
)
857 SCM buf
= SYMBOL_STRINGBUF (sym
);
858 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
859 SET_STRINGBUF_SHARED (buf
);
860 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
861 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK (buf
),
862 (scm_t_bits
)start
, (scm_t_bits
) end
- start
);
865 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
867 scm_i_symbol_ref (SCM sym
, size_t x
)
869 if (scm_i_is_narrow_symbol (sym
))
870 return (scm_t_wchar
) (unsigned char) (scm_i_symbol_chars (sym
)[x
]);
872 return scm_i_symbol_wide_chars (sym
)[x
];
878 SCM_DEFINE (scm_sys_string_dump
, "%string-dump", 1, 0, 0, (SCM str
),
879 "Returns an association list containing debugging information\n"
880 "for @var{str}. The association list has the following entries."
883 "The string itself.\n"
885 "The start index of the string into its stringbuf\n"
887 "The length of the string\n"
889 "If this string is a substring, it returns its parent string.\n"
890 "Otherwise, it returns @code{#f}\n"
892 "@code{#t} if the string is read-only\n"
893 "@item stringbuf-chars\n"
894 "A new string containing this string's stringbuf's characters\n"
895 "@item stringbuf-length\n"
896 "The number of characters in this stringbuf\n"
897 "@item stringbuf-shared\n"
898 "@code{#t} if this stringbuf is shared\n"
899 "@item stringbuf-wide\n"
900 "@code{#t} if this stringbuf's characters are stored in a\n"
901 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
904 #define FUNC_NAME s_scm_sys_string_dump
906 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
;
908 SCM_VALIDATE_STRING (1, str
);
911 e1
= scm_cons (scm_from_latin1_symbol ("string"),
913 e2
= scm_cons (scm_from_latin1_symbol ("start"),
914 scm_from_size_t (STRING_START (str
)));
915 e3
= scm_cons (scm_from_latin1_symbol ("length"),
916 scm_from_size_t (STRING_LENGTH (str
)));
918 if (IS_SH_STRING (str
))
920 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
921 SH_STRING_STRING (str
));
922 buf
= STRING_STRINGBUF (SH_STRING_STRING (str
));
926 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
928 buf
= STRING_STRINGBUF (str
);
931 if (IS_RO_STRING (str
))
932 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
935 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
939 if (!STRINGBUF_WIDE (buf
))
941 size_t len
= STRINGBUF_LENGTH (buf
);
943 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
944 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
945 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
950 size_t len
= STRINGBUF_LENGTH (buf
);
952 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
953 u32_cpy ((scm_t_uint32
*) cbuf
,
954 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
955 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
958 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
959 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
960 if (STRINGBUF_SHARED (buf
))
961 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
964 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
966 if (STRINGBUF_WIDE (buf
))
967 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
970 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
973 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
, SCM_UNDEFINED
);
977 SCM_DEFINE (scm_sys_symbol_dump
, "%symbol-dump", 1, 0, 0, (SCM sym
),
978 "Returns an association list containing debugging information\n"
979 "for @var{sym}. The association list has the following entries."
982 "The symbol itself\n"
986 "@code{#t} if it is an interned symbol\n"
987 "@item stringbuf-chars\n"
988 "A new string containing this symbols's stringbuf's characters\n"
989 "@item stringbuf-length\n"
990 "The number of characters in this stringbuf\n"
991 "@item stringbuf-shared\n"
992 "@code{#t} if this stringbuf is shared\n"
993 "@item stringbuf-wide\n"
994 "@code{#t} if this stringbuf's characters are stored in a\n"
995 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
998 #define FUNC_NAME s_scm_sys_symbol_dump
1000 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
;
1002 SCM_VALIDATE_SYMBOL (1, sym
);
1003 e1
= scm_cons (scm_from_latin1_symbol ("symbol"),
1005 e2
= scm_cons (scm_from_latin1_symbol ("hash"),
1006 scm_from_ulong (scm_i_symbol_hash (sym
)));
1007 e3
= scm_cons (scm_from_latin1_symbol ("interned"),
1008 scm_symbol_interned_p (sym
));
1009 buf
= SYMBOL_STRINGBUF (sym
);
1011 /* Stringbuf info */
1012 if (!STRINGBUF_WIDE (buf
))
1014 size_t len
= STRINGBUF_LENGTH (buf
);
1016 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
1017 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
1018 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1023 size_t len
= STRINGBUF_LENGTH (buf
);
1025 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
1026 u32_cpy ((scm_t_uint32
*) cbuf
,
1027 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
1028 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1031 e5
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1032 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
1033 if (STRINGBUF_SHARED (buf
))
1034 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1037 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1039 if (STRINGBUF_WIDE (buf
))
1040 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1043 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1045 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, SCM_UNDEFINED
);
1050 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1052 SCM_DEFINE (scm_sys_stringbuf_hist
, "%stringbuf-hist", 0, 0, 0, (void), "")
1053 #define FUNC_NAME s_scm_sys_stringbuf_hist
1056 for (i
= 0; i
< 1000; i
++)
1058 fprintf (stderr
, " %3d: %u\n", i
, lenhist
[i
]);
1059 fprintf (stderr
, ">999: %u\n", lenhist
[1000]);
1060 return SCM_UNSPECIFIED
;
1068 SCM_DEFINE (scm_string_p
, "string?", 1, 0, 0,
1070 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1071 #define FUNC_NAME s_scm_string_p
1073 return scm_from_bool (IS_STRING (obj
));
1078 SCM_REGISTER_PROC (s_scm_list_to_string
, "list->string", 1, 0, 0, scm_string
);
1080 SCM_DEFINE (scm_string
, "string", 0, 0, 1,
1082 "@deffnx {Scheme Procedure} list->string chrs\n"
1083 "Return a newly allocated string composed of the arguments,\n"
1085 #define FUNC_NAME s_scm_string
1087 SCM result
= SCM_BOOL_F
;
1094 /* Verify that this is a list of chars. */
1095 i
= scm_ilength (chrs
);
1096 SCM_ASSERT (i
>= 0, chrs
, SCM_ARG1
, FUNC_NAME
);
1101 while (len
> 0 && scm_is_pair (rest
))
1103 SCM elt
= SCM_CAR (rest
);
1104 SCM_VALIDATE_CHAR (SCM_ARGn
, elt
);
1105 if (SCM_CHAR (elt
) > 0xFF)
1107 rest
= SCM_CDR (rest
);
1109 scm_remember_upto_here_1 (elt
);
1112 /* Construct a string containing this list of chars. */
1120 result
= scm_i_make_string (len
, NULL
, 0);
1121 result
= scm_i_string_start_writing (result
);
1122 buf
= scm_i_string_writable_chars (result
);
1123 while (len
> 0 && scm_is_pair (rest
))
1125 SCM elt
= SCM_CAR (rest
);
1126 buf
[p
] = (unsigned char) SCM_CHAR (elt
);
1128 rest
= SCM_CDR (rest
);
1130 scm_remember_upto_here_1 (elt
);
1137 result
= scm_i_make_wide_string (len
, NULL
, 0);
1138 result
= scm_i_string_start_writing (result
);
1139 buf
= scm_i_string_writable_wide_chars (result
);
1140 while (len
> 0 && scm_is_pair (rest
))
1142 SCM elt
= SCM_CAR (rest
);
1143 buf
[p
] = SCM_CHAR (elt
);
1145 rest
= SCM_CDR (rest
);
1147 scm_remember_upto_here_1 (elt
);
1150 scm_i_string_stop_writing ();
1153 scm_misc_error (NULL
, "list changed while constructing string", SCM_EOL
);
1154 if (!scm_is_null (rest
))
1155 scm_wrong_type_arg_msg (NULL
, 0, chrs
, "proper list");
1161 SCM_DEFINE (scm_make_string
, "make-string", 1, 1, 0,
1163 "Return a newly allocated string of\n"
1164 "length @var{k}. If @var{chr} is given, then all elements of\n"
1165 "the string are initialized to @var{chr}, otherwise the contents\n"
1166 "of the string are all set to @code{#\nul}.")
1167 #define FUNC_NAME s_scm_make_string
1169 return scm_c_make_string (scm_to_size_t (k
), chr
);
1174 scm_c_make_string (size_t len
, SCM chr
)
1175 #define FUNC_NAME NULL
1178 char *contents
= NULL
;
1179 SCM res
= scm_i_make_string (len
, &contents
, 0);
1181 /* If no char is given, initialize string contents to NULL. */
1182 if (SCM_UNBNDP (chr
))
1183 memset (contents
, 0, len
);
1186 SCM_VALIDATE_CHAR (0, chr
);
1187 res
= scm_i_string_start_writing (res
);
1188 for (p
= 0; p
< len
; p
++)
1189 scm_i_string_set_x (res
, p
, SCM_CHAR (chr
));
1190 scm_i_string_stop_writing ();
1197 SCM_DEFINE (scm_string_length
, "string-length", 1, 0, 0,
1199 "Return the number of characters in @var{string}.")
1200 #define FUNC_NAME s_scm_string_length
1202 SCM_VALIDATE_STRING (1, string
);
1203 return scm_from_size_t (STRING_LENGTH (string
));
1207 SCM_DEFINE (scm_string_bytes_per_char
, "string-bytes-per-char", 1, 0, 0,
1209 "Return the bytes used to represent a character in @var{string}."
1210 "This will return 1 or 4.")
1211 #define FUNC_NAME s_scm_string_bytes_per_char
1213 SCM_VALIDATE_STRING (1, string
);
1214 if (!scm_i_is_narrow_string (string
))
1215 return scm_from_int (4);
1217 return scm_from_int (1);
1222 scm_c_string_length (SCM string
)
1224 if (!IS_STRING (string
))
1225 scm_wrong_type_arg_msg (NULL
, 0, string
, "string");
1226 return STRING_LENGTH (string
);
1229 SCM_DEFINE (scm_string_ref
, "string-ref", 2, 0, 0,
1231 "Return character @var{k} of @var{str} using zero-origin\n"
1232 "indexing. @var{k} must be a valid index of @var{str}.")
1233 #define FUNC_NAME s_scm_string_ref
1238 SCM_VALIDATE_STRING (1, str
);
1240 len
= scm_i_string_length (str
);
1241 if (SCM_LIKELY (len
> 0))
1242 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1244 scm_out_of_range (NULL
, k
);
1246 if (scm_i_is_narrow_string (str
))
1247 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[idx
]);
1249 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[idx
]);
1254 scm_c_string_ref (SCM str
, size_t p
)
1256 if (p
>= scm_i_string_length (str
))
1257 scm_out_of_range (NULL
, scm_from_size_t (p
));
1258 if (scm_i_is_narrow_string (str
))
1259 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[p
]);
1261 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[p
]);
1265 SCM_DEFINE (scm_string_set_x
, "string-set!", 3, 0, 0,
1266 (SCM str
, SCM k
, SCM chr
),
1267 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1268 "an unspecified value. @var{k} must be a valid index of\n"
1270 #define FUNC_NAME s_scm_string_set_x
1275 SCM_VALIDATE_STRING (1, str
);
1277 len
= scm_i_string_length (str
);
1278 if (SCM_LIKELY (len
> 0))
1279 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1281 scm_out_of_range (NULL
, k
);
1283 SCM_VALIDATE_CHAR (3, chr
);
1284 str
= scm_i_string_start_writing (str
);
1285 scm_i_string_set_x (str
, idx
, SCM_CHAR (chr
));
1286 scm_i_string_stop_writing ();
1288 return SCM_UNSPECIFIED
;
1293 scm_c_string_set_x (SCM str
, size_t p
, SCM chr
)
1295 if (p
>= scm_i_string_length (str
))
1296 scm_out_of_range (NULL
, scm_from_size_t (p
));
1297 str
= scm_i_string_start_writing (str
);
1298 scm_i_string_set_x (str
, p
, SCM_CHAR (chr
));
1299 scm_i_string_stop_writing ();
1302 SCM_DEFINE (scm_substring
, "substring", 2, 1, 0,
1303 (SCM str
, SCM start
, SCM end
),
1304 "Return a newly allocated string formed from the characters\n"
1305 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1306 "ending with index @var{end} (exclusive).\n"
1307 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1308 "exact integers satisfying:\n\n"
1309 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1310 #define FUNC_NAME s_scm_substring
1312 size_t len
, from
, to
;
1314 SCM_VALIDATE_STRING (1, str
);
1315 len
= scm_i_string_length (str
);
1316 from
= scm_to_unsigned_integer (start
, 0, len
);
1317 if (SCM_UNBNDP (end
))
1320 to
= scm_to_unsigned_integer (end
, from
, len
);
1321 return scm_i_substring (str
, from
, to
);
1325 SCM_DEFINE (scm_substring_read_only
, "substring/read-only", 2, 1, 0,
1326 (SCM str
, SCM start
, SCM end
),
1327 "Return a newly allocated string formed from the characters\n"
1328 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1329 "ending with index @var{end} (exclusive).\n"
1330 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1331 "exact integers satisfying:\n"
1333 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1335 "The returned string is read-only.\n")
1336 #define FUNC_NAME s_scm_substring_read_only
1338 size_t len
, from
, to
;
1340 SCM_VALIDATE_STRING (1, str
);
1341 len
= scm_i_string_length (str
);
1342 from
= scm_to_unsigned_integer (start
, 0, len
);
1343 if (SCM_UNBNDP (end
))
1346 to
= scm_to_unsigned_integer (end
, from
, len
);
1347 return scm_i_substring_read_only (str
, from
, to
);
1351 SCM_DEFINE (scm_substring_copy
, "substring/copy", 2, 1, 0,
1352 (SCM str
, SCM start
, SCM end
),
1353 "Return a newly allocated string formed from the characters\n"
1354 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1355 "ending with index @var{end} (exclusive).\n"
1356 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1357 "exact integers satisfying:\n\n"
1358 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1359 #define FUNC_NAME s_scm_substring_copy
1361 /* For the Scheme version, START is mandatory, but for the C
1362 version, it is optional. See scm_string_copy in srfi-13.c for a
1368 SCM_VALIDATE_STRING (1, str
);
1369 scm_i_get_substring_spec (scm_i_string_length (str
),
1370 start
, &from
, end
, &to
);
1371 return scm_i_substring_copy (str
, from
, to
);
1375 SCM_DEFINE (scm_substring_shared
, "substring/shared", 2, 1, 0,
1376 (SCM str
, SCM start
, SCM end
),
1377 "Return string that indirectly refers to the characters\n"
1378 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1379 "ending with index @var{end} (exclusive).\n"
1380 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1381 "exact integers satisfying:\n\n"
1382 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1383 #define FUNC_NAME s_scm_substring_shared
1385 size_t len
, from
, to
;
1387 SCM_VALIDATE_STRING (1, str
);
1388 len
= scm_i_string_length (str
);
1389 from
= scm_to_unsigned_integer (start
, 0, len
);
1390 if (SCM_UNBNDP (end
))
1393 to
= scm_to_unsigned_integer (end
, from
, len
);
1394 return scm_i_substring_shared (str
, from
, to
);
1398 SCM_DEFINE (scm_string_append
, "string-append", 0, 0, 1,
1400 "Return a newly allocated string whose characters form the\n"
1401 "concatenation of the given strings, @var{args}.")
1402 #define FUNC_NAME s_scm_string_append
1415 SCM_VALIDATE_REST_ARGUMENT (args
);
1416 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1419 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1420 len
+= scm_i_string_length (s
);
1421 if (!scm_i_is_narrow_string (s
))
1426 res
= scm_i_make_string (len
, &data
.narrow
, 0);
1428 res
= scm_i_make_wide_string (len
, &data
.wide
, 0);
1430 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1434 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1435 len
= scm_i_string_length (s
);
1438 memcpy (data
.narrow
, scm_i_string_chars (s
), len
);
1443 if (scm_i_is_narrow_string (s
))
1445 for (i
= 0; i
< scm_i_string_length (s
); i
++)
1446 data
.wide
[i
] = (unsigned char) scm_i_string_chars (s
)[i
];
1449 u32_cpy ((scm_t_uint32
*) data
.wide
,
1450 (scm_t_uint32
*) scm_i_string_wide_chars (s
), len
);
1453 scm_remember_upto_here_1 (s
);
1461 /* Charset conversion error handling. */
1463 SCM_SYMBOL (scm_encoding_error_key
, "encoding-error");
1464 SCM_SYMBOL (scm_decoding_error_key
, "decoding-error");
1466 /* Raise an exception informing that character CHR could not be written
1467 to PORT in its current encoding. */
1469 scm_encoding_error (const char *subr
, int err
, const char *message
,
1472 scm_throw (scm_encoding_error_key
,
1473 scm_list_n (scm_from_latin1_string (subr
),
1474 scm_from_latin1_string (message
),
1480 /* Raise an exception informing of an encoding error on PORT. This
1481 means that a character could not be written in PORT's encoding. */
1483 scm_decoding_error (const char *subr
, int err
, const char *message
, SCM port
)
1485 scm_throw (scm_decoding_error_key
,
1486 scm_list_n (scm_from_latin1_string (subr
),
1487 scm_from_latin1_string (message
),
1494 /* String conversion to/from C. */
1497 decoding_error (const char *func_name
, int errno_save
,
1498 const char *str
, size_t len
)
1500 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1505 buf
= scm_gc_malloc_pointerless (len
, "bytevector");
1506 memcpy (buf
, str
, len
);
1507 bv
= scm_c_take_gc_bytevector (buf
, len
, SCM_BOOL_F
);
1509 scm_decoding_error (func_name
, errno_save
,
1510 "input locale conversion error", bv
);
1514 scm_from_stringn (const char *str
, size_t len
, const char *encoding
,
1515 scm_t_string_failed_conversion_handler handler
)
1522 /* The order of these checks is important. */
1523 if (!str
&& len
!= 0)
1524 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL
);
1525 if (len
== (size_t) -1)
1528 if (encoding
== NULL
|| len
== 0)
1529 return scm_from_latin1_stringn (str
, len
);
1530 else if (strcmp (encoding
, "UTF-8") == 0)
1531 return scm_from_utf8_stringn (str
, len
);
1534 u32
= (scm_t_wchar
*) u32_conv_from_encoding (encoding
,
1535 (enum iconv_ilseq_handler
)
1541 if (SCM_UNLIKELY (u32
== NULL
))
1542 decoding_error (__func__
, errno
, str
, len
);
1546 if (u32
[i
++] > 0xFF)
1555 res
= scm_i_make_string (u32len
, &dst
, 0);
1556 for (i
= 0; i
< u32len
; i
++)
1557 dst
[i
] = (unsigned char) u32
[i
];
1563 res
= scm_i_make_wide_string (u32len
, &wdst
, 0);
1564 u32_cpy ((scm_t_uint32
*) wdst
, (scm_t_uint32
*) u32
, u32len
);
1573 scm_from_locale_string (const char *str
)
1575 return scm_from_locale_stringn (str
, -1);
1579 scm_from_locale_stringn (const char *str
, size_t len
)
1581 return scm_from_stringn (str
, len
, locale_charset (),
1582 scm_i_get_conversion_strategy (SCM_BOOL_F
));
1586 scm_from_latin1_string (const char *str
)
1588 return scm_from_latin1_stringn (str
, -1);
1592 scm_from_latin1_stringn (const char *str
, size_t len
)
1597 if (len
== (size_t) -1)
1600 /* Make a narrow string and copy STR as is. */
1601 result
= scm_i_make_string (len
, &buf
, 0);
1602 memcpy (buf
, str
, len
);
1608 scm_from_utf8_string (const char *str
)
1610 return scm_from_utf8_stringn (str
, -1);
1614 scm_from_utf8_stringn (const char *str
, size_t len
)
1617 const scm_t_uint8
*ustr
= (const scm_t_uint8
*) str
;
1618 int ascii
= 1, narrow
= 1;
1621 if (len
== (size_t) -1)
1641 nbytes
= u8_mbtouc (&c
, ustr
+ i
, len
- i
);
1645 decoding_error (__func__
, errno
, str
, len
);
1658 res
= scm_i_make_string (char_len
, &dst
, 0);
1659 memcpy (dst
, str
, len
);
1667 res
= scm_i_make_string (char_len
, &dst
, 0);
1669 for (i
= 0, j
= 0; i
< len
; j
++)
1671 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1672 dst
[j
] = (signed char) c
;
1681 res
= scm_i_make_wide_string (char_len
, &dst
, 0);
1683 for (i
= 0, j
= 0; i
< len
; j
++)
1685 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1694 scm_from_utf32_string (const scm_t_wchar
*str
)
1696 return scm_from_utf32_stringn (str
, -1);
1700 scm_from_utf32_stringn (const scm_t_wchar
*str
, size_t len
)
1705 if (len
== (size_t) -1)
1706 len
= u32_strlen ((uint32_t *) str
);
1708 result
= scm_i_make_wide_string (len
, &buf
, 0);
1709 memcpy (buf
, str
, len
* sizeof (scm_t_wchar
));
1710 scm_i_try_narrow_string (result
);
1715 /* Create a new scheme string from the C string STR. The memory of
1716 STR may be used directly as storage for the new string. */
1717 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1718 would be to register a finalizer to eventually free(3) STR, which isn't
1719 worth it. Should we just deprecate the `scm_take_' functions? */
1721 scm_take_locale_stringn (char *str
, size_t len
)
1725 res
= scm_from_locale_stringn (str
, len
);
1732 scm_take_locale_string (char *str
)
1734 return scm_take_locale_stringn (str
, -1);
1737 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1738 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1739 Set *LENP to the size of the resulting string.
1741 FIXME: This is a hack we should get rid of. See
1742 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1745 unistring_escapes_to_guile_escapes (char *buf
, size_t *lenp
)
1747 char *before
, *after
;
1756 if ((i
<= *lenp
- 6)
1757 && before
[i
] == '\\'
1758 && before
[i
+ 1] == 'u'
1759 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1761 /* Convert \u00NN to \xNN */
1764 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1765 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1769 else if ((i
<= *lenp
- 10)
1770 && before
[i
] == '\\'
1771 && before
[i
+ 1] == 'U'
1772 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1774 /* Convert \U00NNNNNN to \UNNNNNN */
1777 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1778 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1779 after
[j
+ 4] = tolower ((int) before
[i
+ 6]);
1780 after
[j
+ 5] = tolower ((int) before
[i
+ 7]);
1781 after
[j
+ 6] = tolower ((int) before
[i
+ 8]);
1782 after
[j
+ 7] = tolower ((int) before
[i
+ 9]);
1788 after
[j
] = before
[i
];
1796 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1797 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1798 of the resulting string. BUF must be large enough to handle the
1799 worst case when `\uXXXX' escapes (6 characters) are replaced by
1800 `\xXXXX;' (7 characters). */
1802 unistring_escapes_to_r6rs_escapes (char *buf
, size_t *lenp
)
1804 char *before
, *after
;
1806 /* The worst case is if the input string contains all 4-digit hex escapes.
1807 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1808 size_t max_out_len
= (*lenp
* 7) / 6 + 1;
1809 size_t nzeros
, ndigits
;
1812 after
= alloca (max_out_len
);
1817 if (((i
<= *lenp
- 6) && before
[i
] == '\\' && before
[i
+ 1] == 'u')
1818 || ((i
<= *lenp
- 10) && before
[i
] == '\\' && before
[i
+ 1] == 'U'))
1820 if (before
[i
+ 1] == 'u')
1822 else if (before
[i
+ 1] == 'U')
1827 /* Add the R6RS hex escape initial sequence. */
1831 /* Move string positions to the start of the hex numbers. */
1835 /* Find the number of initial zeros in this hex number. */
1837 while (before
[i
+ nzeros
] == '0' && nzeros
< ndigits
)
1840 /* Copy the number, skipping initial zeros, and then move the string
1842 if (nzeros
== ndigits
)
1851 for (pos
= 0; pos
< ndigits
- nzeros
; pos
++)
1852 after
[j
+ pos
] = tolower ((int) before
[i
+ nzeros
+ pos
]);
1854 j
+= (ndigits
- nzeros
);
1857 /* Add terminating semicolon. */
1863 after
[j
] = before
[i
];
1869 memcpy (before
, after
, j
);
1873 scm_to_locale_string (SCM str
)
1875 return scm_to_locale_stringn (str
, NULL
);
1879 scm_to_locale_stringn (SCM str
, size_t *lenp
)
1881 return scm_to_stringn (str
, lenp
,
1883 scm_i_get_conversion_strategy (SCM_BOOL_F
));
1887 scm_to_latin1_string (SCM str
)
1889 return scm_to_latin1_stringn (str
, NULL
);
1893 scm_to_latin1_stringn (SCM str
, size_t *lenp
)
1894 #define FUNC_NAME "scm_to_latin1_stringn"
1898 SCM_VALIDATE_STRING (1, str
);
1900 if (scm_i_is_narrow_string (str
))
1902 size_t len
= scm_i_string_length (str
);
1907 result
= scm_strndup (scm_i_string_data (str
), len
);
1910 result
= scm_to_stringn (str
, lenp
, NULL
,
1911 SCM_FAILED_CONVERSION_ERROR
);
1918 scm_to_utf8_string (SCM str
)
1920 return scm_to_utf8_stringn (str
, NULL
);
1924 latin1_u8_strlen (const scm_t_uint8
*str
, size_t len
)
1927 for (i
= 0, ret
= 0; i
< len
; i
++)
1928 ret
+= (str
[i
] < 128) ? 1 : 2;
1933 latin1_to_u8 (const scm_t_uint8
*str
, size_t latin_len
,
1934 scm_t_uint8
*u8_result
, size_t *u8_lenp
)
1937 size_t u8_len
= latin1_u8_strlen (str
, latin_len
);
1939 if (!(u8_result
&& u8_lenp
&& *u8_lenp
> u8_len
))
1940 u8_result
= scm_malloc (u8_len
+ 1);
1944 for (i
= 0, n
= 0; i
< latin_len
; i
++)
1945 n
+= u8_uctomb (u8_result
+ n
, str
[i
], u8_len
- n
);
1954 scm_to_utf8_stringn (SCM str
, size_t *lenp
)
1956 if (scm_i_is_narrow_string (str
))
1957 return (char *) latin1_to_u8 ((scm_t_uint8
*) scm_i_string_chars (str
),
1958 scm_i_string_length (str
),
1961 return (char *) u32_to_u8 ((scm_t_uint32
*)scm_i_string_wide_chars (str
),
1962 scm_i_string_length (str
),
1967 scm_to_utf32_string (SCM str
)
1969 return scm_to_utf32_stringn (str
, NULL
);
1973 scm_to_utf32_stringn (SCM str
, size_t *lenp
)
1974 #define FUNC_NAME "scm_to_utf32_stringn"
1976 scm_t_wchar
*result
;
1978 SCM_VALIDATE_STRING (1, str
);
1980 if (scm_i_is_narrow_string (str
))
1982 scm_t_uint8
*codepoints
;
1985 codepoints
= (scm_t_uint8
*) scm_i_string_chars (str
);
1986 len
= scm_i_string_length (str
);
1990 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
1991 for (i
= 0; i
< len
; i
++)
1992 result
[i
] = codepoints
[i
];
1999 len
= scm_i_string_length (str
);
2003 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
2004 memcpy (result
, scm_i_string_wide_chars (str
),
2005 len
* sizeof (scm_t_wchar
));
2013 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2014 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2015 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2016 defined by HANDLER. */
2018 scm_to_stringn (SCM str
, size_t *lenp
, const char *encoding
,
2019 scm_t_string_failed_conversion_handler handler
)
2022 size_t ilen
, len
, i
;
2026 if (!scm_is_string (str
))
2027 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
2028 ilen
= scm_i_string_length (str
);
2032 buf
= scm_malloc (1);
2040 for (i
= 0; i
< ilen
; i
++)
2041 if (scm_i_string_ref (str
, i
) == '\0')
2042 scm_misc_error (NULL
,
2043 "string contains #\\nul character: ~S",
2046 if (scm_i_is_narrow_string (str
) && (encoding
== NULL
))
2048 /* If using native Latin-1 encoding, just copy the string
2052 buf
= scm_malloc (ilen
);
2053 memcpy (buf
, scm_i_string_chars (str
), ilen
);
2059 buf
= scm_malloc (ilen
+ 1);
2060 memcpy (buf
, scm_i_string_chars (str
), ilen
);
2072 if (scm_i_is_narrow_string (str
))
2074 ret
= mem_iconveh (scm_i_string_chars (str
), ilen
,
2076 (enum iconv_ilseq_handler
) handler
, NULL
,
2080 scm_encoding_error (__func__
, errno
,
2081 "cannot convert narrow string to output locale",
2083 /* FIXME: Faulty character unknown. */
2088 buf
= u32_conv_to_encoding (enc
,
2089 (enum iconv_ilseq_handler
) handler
,
2090 (scm_t_uint32
*) scm_i_string_wide_chars (str
),
2095 scm_encoding_error (__func__
, errno
,
2096 "cannot convert wide string to output locale",
2098 /* FIXME: Faulty character unknown. */
2101 if (handler
== SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE
)
2103 if (SCM_R6RS_ESCAPES_P
)
2105 /* The worst case is if the input string contains all 4-digit
2106 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2107 (seven characters). Make BUF large enough to hold
2109 buf
= scm_realloc (buf
, (len
* 7) / 6 + 1);
2110 unistring_escapes_to_r6rs_escapes (buf
, &len
);
2113 unistring_escapes_to_guile_escapes (buf
, &len
);
2115 buf
= scm_realloc (buf
, len
);
2121 buf
= scm_realloc (buf
, len
+ 1);
2125 scm_remember_upto_here_1 (str
);
2130 scm_to_locale_stringbuf (SCM str
, char *buf
, size_t max_len
)
2133 char *result
= NULL
;
2134 if (!scm_is_string (str
))
2135 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
2136 result
= scm_to_locale_stringn (str
, &len
);
2138 memcpy (buf
, result
, (len
> max_len
) ? max_len
: len
);
2141 scm_remember_upto_here_1 (str
);
2146 /* Unicode string normalization. */
2148 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2149 libguile/i18n.c. It would be useful to have this factored out into a more
2150 convenient location, but its use of alloca makes that tricky to do. */
2153 normalize_str (SCM string
, uninorm_t form
)
2156 scm_t_uint32
*w_str
;
2158 size_t rlen
, len
= scm_i_string_length (string
);
2160 if (scm_i_is_narrow_string (string
))
2163 const char *buf
= scm_i_string_chars (string
);
2165 w_str
= alloca (sizeof (scm_t_wchar
) * (len
+ 1));
2167 for (i
= 0; i
< len
; i
++)
2168 w_str
[i
] = (unsigned char) buf
[i
];
2172 w_str
= (scm_t_uint32
*) scm_i_string_wide_chars (string
);
2174 w_str
= u32_normalize (form
, w_str
, len
, NULL
, &rlen
);
2176 ret
= scm_i_make_wide_string (rlen
, &cbuf
, 0);
2177 u32_cpy ((scm_t_uint32
*) cbuf
, w_str
, rlen
);
2180 scm_i_try_narrow_string (ret
);
2185 SCM_DEFINE (scm_string_normalize_nfc
, "string-normalize-nfc", 1, 0, 0,
2187 "Returns the NFC normalized form of @var{string}.")
2188 #define FUNC_NAME s_scm_string_normalize_nfc
2190 SCM_VALIDATE_STRING (1, string
);
2191 return normalize_str (string
, UNINORM_NFC
);
2195 SCM_DEFINE (scm_string_normalize_nfd
, "string-normalize-nfd", 1, 0, 0,
2197 "Returns the NFD normalized form of @var{string}.")
2198 #define FUNC_NAME s_scm_string_normalize_nfd
2200 SCM_VALIDATE_STRING (1, string
);
2201 return normalize_str (string
, UNINORM_NFD
);
2205 SCM_DEFINE (scm_string_normalize_nfkc
, "string-normalize-nfkc", 1, 0, 0,
2207 "Returns the NFKC normalized form of @var{string}.")
2208 #define FUNC_NAME s_scm_string_normalize_nfkc
2210 SCM_VALIDATE_STRING (1, string
);
2211 return normalize_str (string
, UNINORM_NFKC
);
2215 SCM_DEFINE (scm_string_normalize_nfkd
, "string-normalize-nfkd", 1, 0, 0,
2217 "Returns the NFKD normalized form of @var{string}.")
2218 #define FUNC_NAME s_scm_string_normalize_nfkd
2220 SCM_VALIDATE_STRING (1, string
);
2221 return normalize_str (string
, UNINORM_NFKD
);
2225 /* converts C scm_array of strings to SCM scm_list of strings.
2226 If argc < 0, a null terminated scm_array is assumed.
2227 The current locale encoding is assumed */
2229 scm_makfromstrs (int argc
, char **argv
)
2234 for (i
= 0; argv
[i
]; i
++);
2236 lst
= scm_cons (scm_from_locale_string (argv
[i
]), lst
);
2240 /* Return a newly allocated array of char pointers to each of the strings
2241 in args, with a terminating NULL pointer. The strings are encoded using
2242 the current locale. */
2245 scm_i_allocate_string_pointers (SCM list
)
2246 #define FUNC_NAME "scm_i_allocate_string_pointers"
2249 int list_len
= scm_ilength (list
);
2253 scm_wrong_type_arg_msg (NULL
, 0, list
, "proper list");
2255 result
= scm_gc_malloc ((list_len
+ 1) * sizeof (char *),
2257 result
[list_len
] = NULL
;
2259 /* The list might have been modified in another thread, so
2260 we check LIST before each access.
2262 for (i
= 0; i
< list_len
&& scm_is_pair (list
); i
++)
2264 SCM str
= SCM_CAR (list
);
2265 size_t len
; /* String length in bytes */
2266 char *c_str
= scm_to_locale_stringn (str
, &len
);
2268 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2269 scm_malloc to allocate the returned string, which must be
2270 explicitly deallocated. This forces us to copy the string a
2271 second time into a new buffer. Ideally there would be variants
2272 of scm_to_*_stringn that can return garbage-collected buffers. */
2274 result
[i
] = scm_gc_malloc_pointerless (len
+ 1, "string");
2275 memcpy (result
[i
], c_str
, len
);
2276 result
[i
][len
] = '\0';
2279 list
= SCM_CDR (list
);
2287 scm_i_get_substring_spec (size_t len
,
2288 SCM start
, size_t *cstart
,
2289 SCM end
, size_t *cend
)
2291 if (SCM_UNBNDP (start
))
2294 *cstart
= scm_to_unsigned_integer (start
, 0, len
);
2296 if (SCM_UNBNDP (end
))
2299 *cend
= scm_to_unsigned_integer (end
, *cstart
, len
);
2303 string_handle_ref (scm_t_array_handle
*h
, size_t index
)
2305 return scm_c_string_ref (h
->array
, index
);
2309 string_handle_set (scm_t_array_handle
*h
, size_t index
, SCM val
)
2311 scm_c_string_set_x (h
->array
, index
, val
);
2315 string_get_handle (SCM v
, scm_t_array_handle
*h
)
2321 h
->dim0
.ubnd
= scm_c_string_length (v
) - 1;
2323 h
->element_type
= SCM_ARRAY_ELEMENT_TYPE_CHAR
;
2324 h
->elements
= h
->writable_elements
= NULL
;
2327 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string
, 0x7f,
2328 string_handle_ref
, string_handle_set
,
2330 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR
, scm_make_string
)
2335 scm_nullstr
= scm_i_make_string (0, NULL
, 0);
2337 #include "libguile/strings.x"