1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
32 #include "striconveh.h"
34 #include "libguile/_scm.h"
35 #include "libguile/chars.h"
36 #include "libguile/root.h"
37 #include "libguile/strings.h"
38 #include "libguile/error.h"
39 #include "libguile/generalized-vectors.h"
40 #include "libguile/deprecation.h"
41 #include "libguile/validate.h"
42 #include "libguile/private-options.h"
52 * XXX - keeping an accurate refcount during GC seems to be quite
53 * tricky, so we just keep score of whether a stringbuf might be
54 * shared, not whether it definitely is.
56 * The scheme I (mvo) tried to keep an accurate reference count would
57 * recount all strings that point to a stringbuf during the mark-phase
58 * of the GC. This was done since one cannot access the stringbuf of
59 * a string when that string is freed (in order to decrease the
60 * reference count). The memory of the stringbuf might have been
61 * reused already for something completely different.
63 * This recounted worked for a small number of threads beating on
64 * cow-strings, but it failed randomly with more than 10 threads, say.
65 * I couldn't figure out what went wrong, so I used the conservative
66 * approach implemented below.
68 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
69 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
73 /* The size in words of the stringbuf header (type tag + size). */
74 #define STRINGBUF_HEADER_SIZE 2U
76 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
79 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
81 #define STRINGBUF_TAG scm_tc7_stringbuf
82 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
83 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
85 #define STRINGBUF_CHARS(buf) ((unsigned char *) \
86 SCM_CELL_OBJECT_LOC (buf, \
87 STRINGBUF_HEADER_SIZE))
88 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CHARS (buf))
92 #define SET_STRINGBUF_SHARED(buf) \
95 /* Don't modify BUF if it's already marked as shared since it might be \
96 a read-only, statically allocated stringbuf. */ \
97 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
98 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
102 #ifdef SCM_STRING_LENGTH_HISTOGRAM
103 static size_t lenhist
[1001];
106 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 make_stringbuf (size_t len
)
111 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
112 scm_i_symbol_chars, all stringbufs are null-terminated. Once
113 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
114 has been changed for scm_i_symbol_chars, this null-termination
120 #ifdef SCM_STRING_LENGTH_HISTOGRAM
127 buf
= PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ len
+ 1,
130 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
);
131 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
133 STRINGBUF_CHARS (buf
)[len
] = 0;
138 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 make_wide_stringbuf (size_t len
)
146 #ifdef SCM_STRING_LENGTH_HISTOGRAM
153 raw_len
= (len
+ 1) * sizeof (scm_t_wchar
);
154 buf
= PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ raw_len
,
157 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
| STRINGBUF_F_WIDE
);
158 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
160 STRINGBUF_WIDE_CHARS (buf
)[len
] = 0;
165 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
166 characters from BUF. */
168 wide_stringbuf (SCM buf
)
172 if (STRINGBUF_WIDE (buf
))
179 len
= STRINGBUF_LENGTH (buf
);
181 new_buf
= make_wide_stringbuf (len
);
183 mem
= STRINGBUF_WIDE_CHARS (new_buf
);
184 for (i
= 0; i
< len
; i
++)
185 mem
[i
] = (scm_t_wchar
) STRINGBUF_CHARS (buf
)[i
];
192 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
193 characters from BUF, if possible. */
195 narrow_stringbuf (SCM buf
)
199 if (!STRINGBUF_WIDE (buf
))
207 len
= STRINGBUF_LENGTH (buf
);
208 wmem
= STRINGBUF_WIDE_CHARS (buf
);
210 for (i
= 0; i
< len
; i
++)
212 /* BUF cannot be narrowed. */
215 new_buf
= make_stringbuf (len
);
217 mem
= STRINGBUF_CHARS (new_buf
);
218 for (i
= 0; i
< len
; i
++)
219 mem
[i
] = (unsigned char) wmem
[i
];
226 scm_i_pthread_mutex_t stringbuf_write_mutex
= SCM_I_PTHREAD_MUTEX_INITIALIZER
;
229 /* Copy-on-write strings.
232 #define STRING_TAG scm_tc7_string
234 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
235 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
236 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
238 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
239 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
241 #define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
243 /* Read-only strings.
246 #define RO_STRING_TAG scm_tc7_ro_string
247 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
249 /* Mutation-sharing substrings
252 #define SH_STRING_TAG (scm_tc7_string + 0x100)
254 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
255 /* START and LENGTH as for STRINGs. */
257 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
261 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
262 characters. CHARSP, if not NULL, will be set to location of the
265 scm_i_make_string (size_t len
, char **charsp
)
267 SCM buf
= make_stringbuf (len
);
270 *charsp
= (char *) STRINGBUF_CHARS (buf
);
271 res
= scm_double_cell (STRING_TAG
, SCM_UNPACK(buf
),
272 (scm_t_bits
)0, (scm_t_bits
) len
);
276 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
277 characters. CHARSP, if not NULL, will be set to location of the
280 scm_i_make_wide_string (size_t len
, scm_t_wchar
**charsp
)
282 SCM buf
= make_wide_stringbuf (len
);
285 *charsp
= STRINGBUF_WIDE_CHARS (buf
);
286 res
= scm_double_cell (STRING_TAG
, SCM_UNPACK (buf
),
287 (scm_t_bits
) 0, (scm_t_bits
) len
);
292 validate_substring_args (SCM str
, size_t start
, size_t end
)
294 if (!IS_STRING (str
))
295 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
296 if (start
> STRING_LENGTH (str
))
297 scm_out_of_range (NULL
, scm_from_size_t (start
));
298 if (end
> STRING_LENGTH (str
) || end
< start
)
299 scm_out_of_range (NULL
, scm_from_size_t (end
));
303 get_str_buf_start (SCM
*str
, SCM
*buf
, size_t *start
)
305 *start
= STRING_START (*str
);
306 if (IS_SH_STRING (*str
))
308 *str
= SH_STRING_STRING (*str
);
309 *start
+= STRING_START (*str
);
311 *buf
= STRING_STRINGBUF (*str
);
315 scm_i_substring (SCM str
, size_t start
, size_t end
)
319 get_str_buf_start (&str
, &buf
, &str_start
);
320 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
321 SET_STRINGBUF_SHARED (buf
);
322 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
323 return scm_double_cell (STRING_TAG
, SCM_UNPACK(buf
),
324 (scm_t_bits
)str_start
+ start
,
325 (scm_t_bits
) end
- start
);
329 scm_i_substring_read_only (SCM str
, size_t start
, size_t end
)
333 get_str_buf_start (&str
, &buf
, &str_start
);
334 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
335 SET_STRINGBUF_SHARED (buf
);
336 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
337 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK(buf
),
338 (scm_t_bits
)str_start
+ start
,
339 (scm_t_bits
) end
- start
);
343 scm_i_substring_copy (SCM str
, size_t start
, size_t end
)
345 size_t len
= end
- start
;
348 get_str_buf_start (&str
, &buf
, &str_start
);
349 if (scm_i_is_narrow_string (str
))
351 my_buf
= make_stringbuf (len
);
352 memcpy (STRINGBUF_CHARS (my_buf
),
353 STRINGBUF_CHARS (buf
) + str_start
+ start
, len
);
357 my_buf
= make_wide_stringbuf (len
);
358 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (my_buf
),
359 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
) + str_start
361 /* Even though this string is wide, the substring may be narrow.
362 Consider adding code to narrow the string. */
364 scm_remember_upto_here_1 (buf
);
365 return scm_double_cell (STRING_TAG
, SCM_UNPACK (my_buf
),
366 (scm_t_bits
) 0, (scm_t_bits
) len
);
370 scm_i_substring_shared (SCM str
, size_t start
, size_t end
)
372 if (start
== 0 && end
== STRING_LENGTH (str
))
376 size_t len
= end
- start
;
377 if (IS_SH_STRING (str
))
379 start
+= STRING_START (str
);
380 str
= SH_STRING_STRING (str
);
382 return scm_double_cell (SH_STRING_TAG
, SCM_UNPACK(str
),
383 (scm_t_bits
)start
, (scm_t_bits
) len
);
388 scm_c_substring (SCM str
, size_t start
, size_t end
)
390 validate_substring_args (str
, start
, end
);
391 return scm_i_substring (str
, start
, end
);
395 scm_c_substring_read_only (SCM str
, size_t start
, size_t end
)
397 validate_substring_args (str
, start
, end
);
398 return scm_i_substring_read_only (str
, start
, end
);
402 scm_c_substring_copy (SCM str
, size_t start
, size_t end
)
404 validate_substring_args (str
, start
, end
);
405 return scm_i_substring_copy (str
, start
, end
);
409 scm_c_substring_shared (SCM str
, size_t start
, size_t end
)
411 validate_substring_args (str
, start
, end
);
412 return scm_i_substring_shared (str
, start
, end
);
416 /* Internal accessors
419 /* Returns the number of characters in STR. This may be different
420 than the memory size of the string storage. */
422 scm_i_string_length (SCM str
)
424 return STRING_LENGTH (str
);
427 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
428 encoding. False if it is 'wide', having a 32-bit UCS-4
431 scm_i_is_narrow_string (SCM str
)
433 return !STRINGBUF_WIDE (STRING_STRINGBUF (str
));
436 /* Try to coerce a string to be narrow. It if is narrow already, do
437 nothing. If it is wide, shrink it to narrow if none of its
438 characters are above 0xFF. Return true if the string is narrow or
439 was made to be narrow. */
441 scm_i_try_narrow_string (SCM str
)
443 SET_STRING_STRINGBUF (str
, narrow_stringbuf (STRING_STRINGBUF (str
)));
445 return scm_i_is_narrow_string (str
);
448 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
451 scm_i_string_chars (SCM str
)
455 get_str_buf_start (&str
, &buf
, &start
);
456 if (scm_i_is_narrow_string (str
))
457 return (const char *) STRINGBUF_CHARS (buf
) + start
;
459 scm_misc_error (NULL
, "Invalid read access of chars of wide string: ~s",
464 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
467 scm_i_string_wide_chars (SCM str
)
472 get_str_buf_start (&str
, &buf
, &start
);
473 if (!scm_i_is_narrow_string (str
))
474 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
) + start
;
476 scm_misc_error (NULL
, "Invalid read access of chars of narrow string: ~s",
480 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
481 a new string buffer, so that it can be modified without modifying
482 other strings. Also, lock the string mutex. Later, one must call
483 scm_i_string_stop_writing to unlock the mutex. */
485 scm_i_string_start_writing (SCM orig_str
)
487 SCM buf
, str
= orig_str
;
490 get_str_buf_start (&str
, &buf
, &start
);
491 if (IS_RO_STRING (str
))
492 scm_misc_error (NULL
, "string is read-only: ~s", scm_list_1 (orig_str
));
494 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
495 if (STRINGBUF_SHARED (buf
))
497 /* Clone the stringbuf. */
498 size_t len
= STRING_LENGTH (str
);
501 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
503 if (scm_i_is_narrow_string (str
))
505 new_buf
= make_stringbuf (len
);
506 memcpy (STRINGBUF_CHARS (new_buf
),
507 STRINGBUF_CHARS (buf
) + STRING_START (str
), len
);
512 new_buf
= make_wide_stringbuf (len
);
513 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
514 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
)
515 + STRING_START (str
)), len
);
518 SET_STRING_STRINGBUF (str
, new_buf
);
519 start
-= STRING_START (str
);
521 /* FIXME: The following operations are not atomic, so other threads
522 looking at STR may see an inconsistent state. Nevertheless it can't
523 hurt much since (i) accessing STR while it is being mutated can't
524 yield a crash, and (ii) concurrent accesses to STR should be
525 protected by a mutex at the application level. The latter may not
526 apply when STR != ORIG_STR, though. */
527 SET_STRING_START (str
, 0);
528 SET_STRING_STRINGBUF (str
, new_buf
);
532 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
537 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
539 scm_i_string_writable_chars (SCM str
)
544 get_str_buf_start (&str
, &buf
, &start
);
545 if (scm_i_is_narrow_string (str
))
546 return (char *) STRINGBUF_CHARS (buf
) + start
;
548 scm_misc_error (NULL
, "Invalid write access of chars of wide string: ~s",
553 /* Return a pointer to the UCS-4 codepoints of a string. */
555 scm_i_string_writable_wide_chars (SCM str
)
560 get_str_buf_start (&str
, &buf
, &start
);
561 if (!scm_i_is_narrow_string (str
))
562 return STRINGBUF_WIDE_CHARS (buf
) + start
;
564 scm_misc_error (NULL
, "Invalid write access of chars of narrow string: ~s",
568 /* Unlock the string mutex that was locked when
569 scm_i_string_start_writing was called. */
571 scm_i_string_stop_writing (void)
573 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
576 /* Return the Xth character of STR as a UCS-4 codepoint. */
578 scm_i_string_ref (SCM str
, size_t x
)
580 if (scm_i_is_narrow_string (str
))
581 return (scm_t_wchar
) (unsigned char) (scm_i_string_chars (str
)[x
]);
583 return scm_i_string_wide_chars (str
)[x
];
586 /* Returns index+1 of the first char in STR that matches C, or
587 0 if the char is not found. */
589 scm_i_string_contains_char (SCM str
, char ch
)
592 size_t len
= scm_i_string_length (str
);
595 if (scm_i_is_narrow_string (str
))
599 if (scm_i_string_chars (str
)[i
] == ch
)
608 if (scm_i_string_wide_chars (str
)[i
]
609 == (unsigned char) ch
)
618 scm_i_string_strcmp (SCM sstr
, size_t start_x
, const char *cstr
)
620 if (scm_i_is_narrow_string (sstr
))
622 const char *a
= scm_i_string_chars (sstr
) + start_x
;
623 const char *b
= cstr
;
624 return strncmp (a
, b
, strlen(b
));
629 const scm_t_wchar
*a
= scm_i_string_wide_chars (sstr
) + start_x
;
630 const char *b
= cstr
;
631 for (i
= 0; i
< strlen (b
); i
++)
633 if (a
[i
] != (unsigned char) b
[i
])
640 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
642 scm_i_string_set_x (SCM str
, size_t p
, scm_t_wchar chr
)
644 if (chr
> 0xFF && scm_i_is_narrow_string (str
))
645 SET_STRING_STRINGBUF (str
, wide_stringbuf (STRING_STRINGBUF (str
)));
647 if (scm_i_is_narrow_string (str
))
649 char *dst
= scm_i_string_writable_chars (str
);
654 scm_t_wchar
*dst
= scm_i_string_writable_wide_chars (str
);
662 Basic symbol creation and accessing is done here, the rest is in
663 symbols.[hc]. This has been done to keep stringbufs and the
664 internals of strings and string-like objects confined to this file.
667 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
670 scm_i_make_symbol (SCM name
, scm_t_bits flags
,
671 unsigned long hash
, SCM props
)
674 size_t start
= STRING_START (name
);
675 size_t length
= STRING_LENGTH (name
);
677 if (IS_SH_STRING (name
))
679 name
= SH_STRING_STRING (name
);
680 start
+= STRING_START (name
);
682 buf
= SYMBOL_STRINGBUF (name
);
684 if (start
== 0 && length
== STRINGBUF_LENGTH (buf
))
687 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
688 SET_STRINGBUF_SHARED (buf
);
689 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
694 if (scm_i_is_narrow_string (name
))
696 SCM new_buf
= make_stringbuf (length
);
697 memcpy (STRINGBUF_CHARS (new_buf
),
698 STRINGBUF_CHARS (buf
) + start
, length
);
703 SCM new_buf
= make_wide_stringbuf (length
);
704 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
705 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
) + start
,
710 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
711 (scm_t_bits
) hash
, SCM_UNPACK (props
));
715 scm_i_c_make_symbol (const char *name
, size_t len
,
716 scm_t_bits flags
, unsigned long hash
, SCM props
)
718 SCM buf
= make_stringbuf (len
);
719 memcpy (STRINGBUF_CHARS (buf
), name
, len
);
721 return scm_immutable_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
722 (scm_t_bits
) hash
, SCM_UNPACK (props
));
725 /* Returns the number of characters in SYM. This may be different
726 from the memory size of SYM. */
728 scm_i_symbol_length (SCM sym
)
730 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
734 scm_c_symbol_length (SCM sym
)
735 #define FUNC_NAME "scm_c_symbol_length"
737 SCM_VALIDATE_SYMBOL (1, sym
);
739 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
743 /* True if the name of SYM is stored as a Latin-1 encoded string.
744 False if it is stored as a 32-bit UCS-4-encoded string. */
746 scm_i_is_narrow_symbol (SCM sym
)
750 buf
= SYMBOL_STRINGBUF (sym
);
751 return !STRINGBUF_WIDE (buf
);
754 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
755 contains the name of SYM. */
757 scm_i_symbol_chars (SCM sym
)
761 buf
= SYMBOL_STRINGBUF (sym
);
762 if (!STRINGBUF_WIDE (buf
))
763 return (const char *) STRINGBUF_CHARS (buf
);
765 scm_misc_error (NULL
, "Invalid access of chars of a wide symbol ~S",
769 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
772 scm_i_symbol_wide_chars (SCM sym
)
776 buf
= SYMBOL_STRINGBUF (sym
);
777 if (STRINGBUF_WIDE (buf
))
778 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
);
780 scm_misc_error (NULL
, "Invalid access of chars of a narrow symbol ~S",
785 scm_i_symbol_substring (SCM sym
, size_t start
, size_t end
)
787 SCM buf
= SYMBOL_STRINGBUF (sym
);
788 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
789 SET_STRINGBUF_SHARED (buf
);
790 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
791 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK (buf
),
792 (scm_t_bits
)start
, (scm_t_bits
) end
- start
);
795 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
797 scm_i_symbol_ref (SCM sym
, size_t x
)
799 if (scm_i_is_narrow_symbol (sym
))
800 return (scm_t_wchar
) (unsigned char) (scm_i_symbol_chars (sym
)[x
]);
802 return scm_i_symbol_wide_chars (sym
)[x
];
808 SCM_DEFINE (scm_sys_string_dump
, "%string-dump", 1, 0, 0, (SCM str
),
809 "Returns an association list containing debugging information\n"
810 "for @var{str}. The association list has the following entries."
813 "The string itself.\n"
815 "The start index of the string into its stringbuf\n"
817 "The length of the string\n"
819 "If this string is a substring, it returns its parent string.\n"
820 "Otherwise, it returns @code{#f}\n"
822 "@code{#t} if the string is read-only\n"
823 "@item stringbuf-chars\n"
824 "A new string containing this string's stringbuf's characters\n"
825 "@item stringbuf-length\n"
826 "The number of characters in this stringbuf\n"
827 "@item stringbuf-shared\n"
828 "@code{#t} if this stringbuf is shared\n"
829 "@item stringbuf-wide\n"
830 "@code{#t} if this stringbuf's characters are stored in a\n"
831 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
834 #define FUNC_NAME s_scm_sys_string_dump
836 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
;
838 SCM_VALIDATE_STRING (1, str
);
841 e1
= scm_cons (scm_from_locale_symbol ("string"),
843 e2
= scm_cons (scm_from_locale_symbol ("start"),
844 scm_from_size_t (STRING_START (str
)));
845 e3
= scm_cons (scm_from_locale_symbol ("length"),
846 scm_from_size_t (STRING_LENGTH (str
)));
848 if (IS_SH_STRING (str
))
850 e4
= scm_cons (scm_from_locale_symbol ("shared"),
851 SH_STRING_STRING (str
));
852 buf
= STRING_STRINGBUF (SH_STRING_STRING (str
));
856 e4
= scm_cons (scm_from_locale_symbol ("shared"),
858 buf
= STRING_STRINGBUF (str
);
861 if (IS_RO_STRING (str
))
862 e5
= scm_cons (scm_from_locale_symbol ("read-only"),
865 e5
= scm_cons (scm_from_locale_symbol ("read-only"),
869 if (!STRINGBUF_WIDE (buf
))
871 size_t len
= STRINGBUF_LENGTH (buf
);
873 SCM sbc
= scm_i_make_string (len
, &cbuf
);
874 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
875 e6
= scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
880 size_t len
= STRINGBUF_LENGTH (buf
);
882 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
);
883 u32_cpy ((scm_t_uint32
*) cbuf
,
884 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
885 e6
= scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
888 e7
= scm_cons (scm_from_locale_symbol ("stringbuf-length"),
889 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
890 if (STRINGBUF_SHARED (buf
))
891 e8
= scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
894 e8
= scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
896 if (STRINGBUF_WIDE (buf
))
897 e9
= scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
900 e9
= scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
903 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
, SCM_UNDEFINED
);
907 SCM_DEFINE (scm_sys_symbol_dump
, "%symbol-dump", 1, 0, 0, (SCM sym
),
908 "Returns an association list containing debugging information\n"
909 "for @var{sym}. The association list has the following entries."
912 "The symbol itself\n"
916 "@code{#t} if it is an interned symbol\n"
917 "@item stringbuf-chars\n"
918 "A new string containing this symbols's stringbuf's characters\n"
919 "@item stringbuf-length\n"
920 "The number of characters in this stringbuf\n"
921 "@item stringbuf-shared\n"
922 "@code{#t} if this stringbuf is shared\n"
923 "@item stringbuf-wide\n"
924 "@code{#t} if this stringbuf's characters are stored in a\n"
925 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
928 #define FUNC_NAME s_scm_sys_symbol_dump
930 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
;
932 SCM_VALIDATE_SYMBOL (1, sym
);
933 e1
= scm_cons (scm_from_locale_symbol ("symbol"),
935 e2
= scm_cons (scm_from_locale_symbol ("hash"),
936 scm_from_ulong (scm_i_symbol_hash (sym
)));
937 e3
= scm_cons (scm_from_locale_symbol ("interned"),
938 scm_symbol_interned_p (sym
));
939 buf
= SYMBOL_STRINGBUF (sym
);
942 if (!STRINGBUF_WIDE (buf
))
944 size_t len
= STRINGBUF_LENGTH (buf
);
946 SCM sbc
= scm_i_make_string (len
, &cbuf
);
947 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
948 e4
= scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
953 size_t len
= STRINGBUF_LENGTH (buf
);
955 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
);
956 u32_cpy ((scm_t_uint32
*) cbuf
,
957 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
958 e4
= scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
961 e5
= scm_cons (scm_from_locale_symbol ("stringbuf-length"),
962 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
963 if (STRINGBUF_SHARED (buf
))
964 e6
= scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
967 e6
= scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
969 if (STRINGBUF_WIDE (buf
))
970 e7
= scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
973 e7
= scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
975 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, SCM_UNDEFINED
);
980 #ifdef SCM_STRING_LENGTH_HISTOGRAM
982 SCM_DEFINE (scm_sys_stringbuf_hist
, "%stringbuf-hist", 0, 0, 0, (void), "")
983 #define FUNC_NAME s_scm_sys_stringbuf_hist
986 for (i
= 0; i
< 1000; i
++)
988 fprintf (stderr
, " %3d: %u\n", i
, lenhist
[i
]);
989 fprintf (stderr
, ">999: %u\n", lenhist
[1000]);
990 return SCM_UNSPECIFIED
;
998 SCM_DEFINE (scm_string_p
, "string?", 1, 0, 0,
1000 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1001 #define FUNC_NAME s_scm_string_p
1003 return scm_from_bool (IS_STRING (obj
));
1008 SCM_REGISTER_PROC (s_scm_list_to_string
, "list->string", 1, 0, 0, scm_string
);
1010 SCM_DEFINE (scm_string
, "string", 0, 0, 1,
1012 "@deffnx {Scheme Procedure} list->string chrs\n"
1013 "Return a newly allocated string composed of the arguments,\n"
1015 #define FUNC_NAME s_scm_string
1017 SCM result
= SCM_BOOL_F
;
1024 /* Verify that this is a list of chars. */
1025 i
= scm_ilength (chrs
);
1026 SCM_ASSERT (i
>= 0, chrs
, SCM_ARG1
, FUNC_NAME
);
1031 while (len
> 0 && scm_is_pair (rest
))
1033 SCM elt
= SCM_CAR (rest
);
1034 SCM_VALIDATE_CHAR (SCM_ARGn
, elt
);
1035 if (SCM_CHAR (elt
) > 0xFF)
1037 rest
= SCM_CDR (rest
);
1039 scm_remember_upto_here_1 (elt
);
1042 /* Construct a string containing this list of chars. */
1050 result
= scm_i_make_string (len
, NULL
);
1051 result
= scm_i_string_start_writing (result
);
1052 buf
= scm_i_string_writable_chars (result
);
1053 while (len
> 0 && scm_is_pair (rest
))
1055 SCM elt
= SCM_CAR (rest
);
1056 buf
[p
] = (unsigned char) SCM_CHAR (elt
);
1058 rest
= SCM_CDR (rest
);
1060 scm_remember_upto_here_1 (elt
);
1067 result
= scm_i_make_wide_string (len
, NULL
);
1068 result
= scm_i_string_start_writing (result
);
1069 buf
= scm_i_string_writable_wide_chars (result
);
1070 while (len
> 0 && scm_is_pair (rest
))
1072 SCM elt
= SCM_CAR (rest
);
1073 buf
[p
] = SCM_CHAR (elt
);
1075 rest
= SCM_CDR (rest
);
1077 scm_remember_upto_here_1 (elt
);
1080 scm_i_string_stop_writing ();
1083 scm_misc_error (NULL
, "list changed while constructing string", SCM_EOL
);
1084 if (!scm_is_null (rest
))
1085 scm_wrong_type_arg_msg (NULL
, 0, chrs
, "proper list");
1091 SCM_DEFINE (scm_make_string
, "make-string", 1, 1, 0,
1093 "Return a newly allocated string of\n"
1094 "length @var{k}. If @var{chr} is given, then all elements of\n"
1095 "the string are initialized to @var{chr}, otherwise the contents\n"
1096 "of the @var{string} are unspecified.")
1097 #define FUNC_NAME s_scm_make_string
1099 return scm_c_make_string (scm_to_size_t (k
), chr
);
1104 scm_c_make_string (size_t len
, SCM chr
)
1105 #define FUNC_NAME NULL
1108 SCM res
= scm_i_make_string (len
, NULL
);
1110 if (!SCM_UNBNDP (chr
))
1112 SCM_VALIDATE_CHAR (0, chr
);
1113 res
= scm_i_string_start_writing (res
);
1114 for (p
= 0; p
< len
; p
++)
1115 scm_i_string_set_x (res
, p
, SCM_CHAR (chr
));
1116 scm_i_string_stop_writing ();
1123 SCM_DEFINE (scm_string_length
, "string-length", 1, 0, 0,
1125 "Return the number of characters in @var{string}.")
1126 #define FUNC_NAME s_scm_string_length
1128 SCM_VALIDATE_STRING (1, string
);
1129 return scm_from_size_t (STRING_LENGTH (string
));
1133 SCM_DEFINE (scm_string_bytes_per_char
, "string-bytes-per-char", 1, 0, 0,
1135 "Return the bytes used to represent a character in @var{string}."
1136 "This will return 1 or 4.")
1137 #define FUNC_NAME s_scm_string_bytes_per_char
1139 SCM_VALIDATE_STRING (1, string
);
1140 if (!scm_i_is_narrow_string (string
))
1141 return scm_from_int (4);
1143 return scm_from_int (1);
1148 scm_c_string_length (SCM string
)
1150 if (!IS_STRING (string
))
1151 scm_wrong_type_arg_msg (NULL
, 0, string
, "string");
1152 return STRING_LENGTH (string
);
1155 SCM_DEFINE (scm_string_ref
, "string-ref", 2, 0, 0,
1157 "Return character @var{k} of @var{str} using zero-origin\n"
1158 "indexing. @var{k} must be a valid index of @var{str}.")
1159 #define FUNC_NAME s_scm_string_ref
1164 SCM_VALIDATE_STRING (1, str
);
1166 len
= scm_i_string_length (str
);
1167 if (SCM_LIKELY (len
> 0))
1168 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1170 scm_out_of_range (NULL
, k
);
1172 if (scm_i_is_narrow_string (str
))
1173 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[idx
]);
1175 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[idx
]);
1180 scm_c_string_ref (SCM str
, size_t p
)
1182 if (p
>= scm_i_string_length (str
))
1183 scm_out_of_range (NULL
, scm_from_size_t (p
));
1184 if (scm_i_is_narrow_string (str
))
1185 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[p
]);
1187 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[p
]);
1191 SCM_DEFINE (scm_string_set_x
, "string-set!", 3, 0, 0,
1192 (SCM str
, SCM k
, SCM chr
),
1193 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1194 "an unspecified value. @var{k} must be a valid index of\n"
1196 #define FUNC_NAME s_scm_string_set_x
1201 SCM_VALIDATE_STRING (1, str
);
1203 len
= scm_i_string_length (str
);
1204 if (SCM_LIKELY (len
> 0))
1205 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1207 scm_out_of_range (NULL
, k
);
1209 SCM_VALIDATE_CHAR (3, chr
);
1210 str
= scm_i_string_start_writing (str
);
1211 scm_i_string_set_x (str
, idx
, SCM_CHAR (chr
));
1212 scm_i_string_stop_writing ();
1214 return SCM_UNSPECIFIED
;
1219 scm_c_string_set_x (SCM str
, size_t p
, SCM chr
)
1221 if (p
>= scm_i_string_length (str
))
1222 scm_out_of_range (NULL
, scm_from_size_t (p
));
1223 str
= scm_i_string_start_writing (str
);
1224 scm_i_string_set_x (str
, p
, SCM_CHAR (chr
));
1225 scm_i_string_stop_writing ();
1228 SCM_DEFINE (scm_substring
, "substring", 2, 1, 0,
1229 (SCM str
, SCM start
, SCM end
),
1230 "Return a newly allocated string formed from the characters\n"
1231 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1232 "ending with index @var{end} (exclusive).\n"
1233 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1234 "exact integers satisfying:\n\n"
1235 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1236 #define FUNC_NAME s_scm_substring
1238 size_t len
, from
, to
;
1240 SCM_VALIDATE_STRING (1, str
);
1241 len
= scm_i_string_length (str
);
1242 from
= scm_to_unsigned_integer (start
, 0, len
);
1243 if (SCM_UNBNDP (end
))
1246 to
= scm_to_unsigned_integer (end
, from
, len
);
1247 return scm_i_substring (str
, from
, to
);
1251 SCM_DEFINE (scm_substring_read_only
, "substring/read-only", 2, 1, 0,
1252 (SCM str
, SCM start
, SCM end
),
1253 "Return a newly allocated string formed from the characters\n"
1254 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1255 "ending with index @var{end} (exclusive).\n"
1256 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1257 "exact integers satisfying:\n"
1259 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1261 "The returned string is read-only.\n")
1262 #define FUNC_NAME s_scm_substring_read_only
1264 size_t len
, from
, to
;
1266 SCM_VALIDATE_STRING (1, str
);
1267 len
= scm_i_string_length (str
);
1268 from
= scm_to_unsigned_integer (start
, 0, len
);
1269 if (SCM_UNBNDP (end
))
1272 to
= scm_to_unsigned_integer (end
, from
, len
);
1273 return scm_i_substring_read_only (str
, from
, to
);
1277 SCM_DEFINE (scm_substring_copy
, "substring/copy", 2, 1, 0,
1278 (SCM str
, SCM start
, SCM end
),
1279 "Return a newly allocated string formed from the characters\n"
1280 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1281 "ending with index @var{end} (exclusive).\n"
1282 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1283 "exact integers satisfying:\n\n"
1284 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1285 #define FUNC_NAME s_scm_substring_copy
1287 /* For the Scheme version, START is mandatory, but for the C
1288 version, it is optional. See scm_string_copy in srfi-13.c for a
1294 SCM_VALIDATE_STRING (1, str
);
1295 scm_i_get_substring_spec (scm_i_string_length (str
),
1296 start
, &from
, end
, &to
);
1297 return scm_i_substring_copy (str
, from
, to
);
1301 SCM_DEFINE (scm_substring_shared
, "substring/shared", 2, 1, 0,
1302 (SCM str
, SCM start
, SCM end
),
1303 "Return string that indirectly refers to the characters\n"
1304 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1305 "ending with index @var{end} (exclusive).\n"
1306 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1307 "exact integers satisfying:\n\n"
1308 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1309 #define FUNC_NAME s_scm_substring_shared
1311 size_t len
, from
, to
;
1313 SCM_VALIDATE_STRING (1, str
);
1314 len
= scm_i_string_length (str
);
1315 from
= scm_to_unsigned_integer (start
, 0, len
);
1316 if (SCM_UNBNDP (end
))
1319 to
= scm_to_unsigned_integer (end
, from
, len
);
1320 return scm_i_substring_shared (str
, from
, to
);
1324 SCM_DEFINE (scm_string_append
, "string-append", 0, 0, 1,
1326 "Return a newly allocated string whose characters form the\n"
1327 "concatenation of the given strings, @var{args}.")
1328 #define FUNC_NAME s_scm_string_append
1341 SCM_VALIDATE_REST_ARGUMENT (args
);
1342 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1345 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1346 len
+= scm_i_string_length (s
);
1347 if (!scm_i_is_narrow_string (s
))
1352 res
= scm_i_make_string (len
, &data
.narrow
);
1354 res
= scm_i_make_wide_string (len
, &data
.wide
);
1356 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1360 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1361 len
= scm_i_string_length (s
);
1364 memcpy (data
.narrow
, scm_i_string_chars (s
), len
);
1369 if (scm_i_is_narrow_string (s
))
1371 for (i
= 0; i
< scm_i_string_length (s
); i
++)
1372 data
.wide
[i
] = (unsigned char) scm_i_string_chars (s
)[i
];
1375 u32_cpy ((scm_t_uint32
*) data
.wide
,
1376 (scm_t_uint32
*) scm_i_string_wide_chars (s
), len
);
1379 scm_remember_upto_here_1 (s
);
1386 scm_is_string (SCM obj
)
1388 return IS_STRING (obj
);
1392 /* Conversion to/from other encodings. */
1394 SCM_SYMBOL (scm_encoding_error_key
, "encoding-error");
1396 scm_encoding_error (const char *subr
, int err
, const char *message
,
1397 const char *from
, const char *to
, SCM string_or_bv
)
1399 /* Raise an exception that conveys all the information needed to debug the
1400 problem. Only perform locale conversions that are safe; in particular,
1401 don't try to display STRING_OR_BV when it's a string since converting it to
1402 the output locale may fail. */
1403 scm_throw (scm_encoding_error_key
,
1404 scm_list_n (scm_from_locale_string (subr
),
1405 scm_from_locale_string (message
),
1407 scm_from_locale_string (from
),
1408 scm_from_locale_string (to
),
1414 scm_from_stringn (const char *str
, size_t len
, const char *encoding
,
1415 scm_t_string_failed_conversion_handler handler
)
1425 if (encoding
== NULL
)
1427 /* If encoding is null, use Latin-1. */
1429 res
= scm_i_make_string (len
, &buf
);
1430 memcpy (buf
, str
, len
);
1435 u32
= (scm_t_wchar
*) u32_conv_from_encoding (encoding
,
1436 (enum iconv_ilseq_handler
)
1442 if (SCM_UNLIKELY (u32
== NULL
))
1444 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1449 buf
= scm_gc_malloc_pointerless (len
, "bytevector");
1450 memcpy (buf
, str
, len
);
1451 bv
= scm_c_take_bytevector (buf
, len
);
1453 scm_encoding_error (__func__
, errno
,
1454 "input locale conversion error",
1455 encoding
, "UTF-32", bv
);
1460 if (u32
[i
++] > 0xFF)
1469 res
= scm_i_make_string (u32len
, &dst
);
1470 for (i
= 0; i
< u32len
; i
++)
1471 dst
[i
] = (unsigned char) u32
[i
];
1477 res
= scm_i_make_wide_string (u32len
, &wdst
);
1478 u32_cpy ((scm_t_uint32
*) wdst
, (scm_t_uint32
*) u32
, u32len
);
1487 scm_from_locale_stringn (const char *str
, size_t len
)
1490 scm_t_string_failed_conversion_handler hndl
;
1494 if (len
== (size_t) -1)
1499 inport
= scm_current_input_port ();
1500 if (!SCM_UNBNDP (inport
) && SCM_OPINPORTP (inport
))
1502 pt
= SCM_PTAB_ENTRY (inport
);
1504 hndl
= pt
->ilseq_handler
;
1509 hndl
= SCM_FAILED_CONVERSION_ERROR
;
1512 return scm_from_stringn (str
, len
, enc
, hndl
);
1516 scm_from_locale_string (const char *str
)
1521 return scm_from_locale_stringn (str
, -1);
1525 scm_i_from_utf8_string (const scm_t_uint8
*str
)
1527 return scm_from_stringn ((const char *) str
,
1528 strlen ((char *) str
), "UTF-8",
1529 SCM_FAILED_CONVERSION_ERROR
);
1532 /* Create a new scheme string from the C string STR. The memory of
1533 STR may be used directly as storage for the new string. */
1534 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1535 would be to register a finalizer to eventually free(3) STR, which isn't
1536 worth it. Should we just deprecate the `scm_take_' functions? */
1538 scm_take_locale_stringn (char *str
, size_t len
)
1542 res
= scm_from_locale_stringn (str
, len
);
1549 scm_take_locale_string (char *str
)
1551 return scm_take_locale_stringn (str
, -1);
1554 /* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1557 unistring_escapes_to_guile_escapes (char **bufp
, size_t *lenp
)
1559 char *before
, *after
;
1568 if ((i
<= *lenp
- 6)
1569 && before
[i
] == '\\'
1570 && before
[i
+ 1] == 'u'
1571 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1573 /* Convert \u00NN to \xNN */
1576 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1577 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1581 else if ((i
<= *lenp
- 10)
1582 && before
[i
] == '\\'
1583 && before
[i
+ 1] == 'U'
1584 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1586 /* Convert \U00NNNNNN to \UNNNNNN */
1589 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1590 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1591 after
[j
+ 4] = tolower ((int) before
[i
+ 6]);
1592 after
[j
+ 5] = tolower ((int) before
[i
+ 7]);
1593 after
[j
+ 6] = tolower ((int) before
[i
+ 8]);
1594 after
[j
+ 7] = tolower ((int) before
[i
+ 9]);
1600 after
[j
] = before
[i
];
1606 after
= scm_realloc (after
, j
);
1609 /* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXXXX; */
1611 unistring_escapes_to_r6rs_escapes (char **bufp
, size_t *lenp
)
1613 char *before
, *after
;
1615 /* The worst case is if the input string contains all 4-digit hex escapes.
1616 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1617 size_t max_out_len
= (*lenp
* 7) / 6 + 1;
1618 size_t nzeros
, ndigits
;
1621 after
= alloca (max_out_len
);
1626 if (((i
<= *lenp
- 6) && before
[i
] == '\\' && before
[i
+ 1] == 'u')
1627 || ((i
<= *lenp
- 10) && before
[i
] == '\\' && before
[i
+ 1] == 'U'))
1629 if (before
[i
+ 1] == 'u')
1631 else if (before
[i
+ 1] == 'U')
1636 /* Add the R6RS hex escape initial sequence. */
1640 /* Move string positions to the start of the hex numbers. */
1644 /* Find the number of initial zeros in this hex number. */
1646 while (before
[i
+ nzeros
] == '0' && nzeros
< ndigits
)
1649 /* Copy the number, skipping initial zeros, and then move the string
1651 if (nzeros
== ndigits
)
1660 for (pos
= 0; pos
< ndigits
- nzeros
; pos
++)
1661 after
[j
+ pos
] = tolower ((int) before
[i
+ nzeros
+ pos
]);
1663 j
+= (ndigits
- nzeros
);
1666 /* Add terminating semicolon. */
1672 after
[j
] = before
[i
];
1678 before
= scm_realloc (before
, j
);
1679 memcpy (before
, after
, j
);
1684 scm_to_locale_stringn (SCM str
, size_t *lenp
)
1690 outport
= scm_current_output_port ();
1691 if (!SCM_UNBNDP (outport
) && SCM_OPOUTPORTP (outport
))
1693 pt
= SCM_PTAB_ENTRY (outport
);
1699 return scm_to_stringn (str
, lenp
,
1701 scm_i_get_conversion_strategy (SCM_BOOL_F
));
1704 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1705 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1706 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1707 defined by HANDLER. */
1709 scm_to_stringn (SCM str
, size_t *lenp
, const char *encoding
,
1710 scm_t_string_failed_conversion_handler handler
)
1713 size_t ilen
, len
, i
;
1717 if (!scm_is_string (str
))
1718 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
1719 ilen
= scm_i_string_length (str
);
1723 buf
= scm_malloc (1);
1731 for (i
= 0; i
< ilen
; i
++)
1732 if (scm_i_string_ref (str
, i
) == '\0')
1733 scm_misc_error (NULL
,
1734 "string contains #\\nul character: ~S",
1737 if (scm_i_is_narrow_string (str
) && (encoding
== NULL
))
1739 /* If using native Latin-1 encoding, just copy the string
1743 buf
= scm_malloc (ilen
);
1744 memcpy (buf
, scm_i_string_chars (str
), ilen
);
1750 buf
= scm_malloc (ilen
+ 1);
1751 memcpy (buf
, scm_i_string_chars (str
), ilen
);
1763 if (scm_i_is_narrow_string (str
))
1765 ret
= mem_iconveh (scm_i_string_chars (str
), ilen
,
1767 (enum iconv_ilseq_handler
) handler
, NULL
,
1771 scm_encoding_error (__func__
, errno
,
1772 "cannot convert to output locale",
1773 "ISO-8859-1", enc
, str
);
1777 buf
= u32_conv_to_encoding (enc
,
1778 (enum iconv_ilseq_handler
) handler
,
1779 (scm_t_uint32
*) scm_i_string_wide_chars (str
),
1784 scm_encoding_error (__func__
, errno
,
1785 "cannot convert to output locale",
1786 "UTF-32", enc
, str
);
1788 if (handler
== SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE
)
1790 if (SCM_R6RS_ESCAPES_P
)
1791 unistring_escapes_to_r6rs_escapes (&buf
, &len
);
1793 unistring_escapes_to_guile_escapes (&buf
, &len
);
1799 buf
= scm_realloc (buf
, len
+ 1);
1803 scm_remember_upto_here_1 (str
);
1808 scm_to_locale_string (SCM str
)
1810 return scm_to_locale_stringn (str
, NULL
);
1814 scm_i_to_utf8_string (SCM str
)
1817 u8str
= scm_to_stringn (str
, NULL
, "UTF-8", SCM_FAILED_CONVERSION_ERROR
);
1818 return (scm_t_uint8
*) u8str
;
1822 scm_to_locale_stringbuf (SCM str
, char *buf
, size_t max_len
)
1825 char *result
= NULL
;
1826 if (!scm_is_string (str
))
1827 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
1828 result
= scm_to_locale_stringn (str
, &len
);
1830 memcpy (buf
, result
, (len
> max_len
) ? max_len
: len
);
1833 scm_remember_upto_here_1 (str
);
1838 /* Unicode string normalization. */
1840 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1841 libguile/i18n.c. It would be useful to have this factored out into a more
1842 convenient location, but its use of alloca makes that tricky to do. */
1845 normalize_str (SCM string
, uninorm_t form
)
1848 scm_t_uint32
*w_str
;
1850 size_t rlen
, len
= scm_i_string_length (string
);
1852 if (scm_i_is_narrow_string (string
))
1855 const char *buf
= scm_i_string_chars (string
);
1857 w_str
= alloca (sizeof (scm_t_wchar
) * (len
+ 1));
1859 for (i
= 0; i
< len
; i
++)
1860 w_str
[i
] = (unsigned char) buf
[i
];
1864 w_str
= (scm_t_uint32
*) scm_i_string_wide_chars (string
);
1866 w_str
= u32_normalize (form
, w_str
, len
, NULL
, &rlen
);
1868 ret
= scm_i_make_wide_string (rlen
, &cbuf
);
1869 u32_cpy ((scm_t_uint32
*) cbuf
, w_str
, rlen
);
1872 scm_i_try_narrow_string (ret
);
1877 SCM_DEFINE (scm_string_normalize_nfc
, "string-normalize-nfc", 1, 0, 0,
1879 "Returns the NFC normalized form of @var{string}.")
1880 #define FUNC_NAME s_scm_string_normalize_nfc
1882 SCM_VALIDATE_STRING (1, string
);
1883 return normalize_str (string
, UNINORM_NFC
);
1887 SCM_DEFINE (scm_string_normalize_nfd
, "string-normalize-nfd", 1, 0, 0,
1889 "Returns the NFD normalized form of @var{string}.")
1890 #define FUNC_NAME s_scm_string_normalize_nfd
1892 SCM_VALIDATE_STRING (1, string
);
1893 return normalize_str (string
, UNINORM_NFD
);
1897 SCM_DEFINE (scm_string_normalize_nfkc
, "string-normalize-nfkc", 1, 0, 0,
1899 "Returns the NFKC normalized form of @var{string}.")
1900 #define FUNC_NAME s_scm_string_normalize_nfkc
1902 SCM_VALIDATE_STRING (1, string
);
1903 return normalize_str (string
, UNINORM_NFKC
);
1907 SCM_DEFINE (scm_string_normalize_nfkd
, "string-normalize-nfkd", 1, 0, 0,
1909 "Returns the NFKD normalized form of @var{string}.")
1910 #define FUNC_NAME s_scm_string_normalize_nfkd
1912 SCM_VALIDATE_STRING (1, string
);
1913 return normalize_str (string
, UNINORM_NFKD
);
1917 /* converts C scm_array of strings to SCM scm_list of strings. */
1918 /* If argc < 0, a null terminated scm_array is assumed. */
1920 scm_makfromstrs (int argc
, char **argv
)
1925 for (i
= 0; argv
[i
]; i
++);
1927 lst
= scm_cons (scm_from_locale_string (argv
[i
]), lst
);
1931 /* Return a newly allocated array of char pointers to each of the strings
1932 in args, with a terminating NULL pointer. */
1935 scm_i_allocate_string_pointers (SCM list
)
1936 #define FUNC_NAME "scm_i_allocate_string_pointers"
1939 int len
= scm_ilength (list
);
1943 scm_wrong_type_arg_msg (NULL
, 0, list
, "proper list");
1945 result
= scm_gc_malloc ((len
+ 1) * sizeof (char *),
1949 /* The list might be have been modified in another thread, so
1950 we check LIST before each access.
1952 for (i
= 0; i
< len
&& scm_is_pair (list
); i
++)
1957 str
= SCM_CAR (list
);
1958 len
= scm_c_string_length (str
);
1960 result
[i
] = scm_gc_malloc_pointerless (len
+ 1, "string pointers");
1961 memcpy (result
[i
], scm_i_string_chars (str
), len
);
1962 result
[i
][len
] = '\0';
1964 list
= SCM_CDR (list
);
1972 scm_i_get_substring_spec (size_t len
,
1973 SCM start
, size_t *cstart
,
1974 SCM end
, size_t *cend
)
1976 if (SCM_UNBNDP (start
))
1979 *cstart
= scm_to_unsigned_integer (start
, 0, len
);
1981 if (SCM_UNBNDP (end
))
1984 *cend
= scm_to_unsigned_integer (end
, *cstart
, len
);
1987 #if SCM_ENABLE_DEPRECATED
1989 /* When these definitions are removed, it becomes reasonable to use
1990 read-only strings for string literals. For that, change the reader
1991 to create string literals with scm_c_substring_read_only instead of
1992 with scm_c_substring_copy.
1996 scm_i_deprecated_stringp (SCM str
)
1998 scm_c_issue_deprecation_warning
1999 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2001 return scm_is_string (str
);
2005 scm_i_deprecated_string_chars (SCM str
)
2009 scm_c_issue_deprecation_warning
2010 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2012 /* We don't accept shared substrings here since they are not
2015 if (IS_SH_STRING (str
))
2016 scm_misc_error (NULL
,
2017 "SCM_STRING_CHARS does not work with shared substrings",
2020 /* We explicitly test for read-only strings to produce a better
2024 if (IS_RO_STRING (str
))
2025 scm_misc_error (NULL
,
2026 "SCM_STRING_CHARS does not work with read-only strings",
2029 /* The following is still wrong, of course...
2031 str
= scm_i_string_start_writing (str
);
2032 chars
= scm_i_string_writable_chars (str
);
2033 scm_i_string_stop_writing ();
2038 scm_i_deprecated_string_length (SCM str
)
2040 scm_c_issue_deprecation_warning
2041 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2042 return scm_c_string_length (str
);
2048 string_handle_ref (scm_t_array_handle
*h
, size_t index
)
2050 return scm_c_string_ref (h
->array
, index
);
2054 string_handle_set (scm_t_array_handle
*h
, size_t index
, SCM val
)
2056 scm_c_string_set_x (h
->array
, index
, val
);
2060 string_get_handle (SCM v
, scm_t_array_handle
*h
)
2066 h
->dim0
.ubnd
= scm_c_string_length (v
) - 1;
2068 h
->element_type
= SCM_ARRAY_ELEMENT_TYPE_CHAR
;
2069 h
->elements
= h
->writable_elements
= NULL
;
2072 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string
, 0x7f,
2073 string_handle_ref
, string_handle_set
,
2075 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR
, scm_make_string
)
2080 scm_nullstr
= scm_i_make_string (0, NULL
);
2082 #include "libguile/strings.x"