1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
32 #include "striconveh.h"
34 #include "libguile/_scm.h"
35 #include "libguile/chars.h"
36 #include "libguile/root.h"
37 #include "libguile/strings.h"
38 #include "libguile/error.h"
39 #include "libguile/generalized-vectors.h"
40 #include "libguile/deprecation.h"
41 #include "libguile/validate.h"
42 #include "libguile/private-options.h"
52 * XXX - keeping an accurate refcount during GC seems to be quite
53 * tricky, so we just keep score of whether a stringbuf might be
54 * shared, not whether it definitely is.
56 * The scheme I (mvo) tried to keep an accurate reference count would
57 * recount all strings that point to a stringbuf during the mark-phase
58 * of the GC. This was done since one cannot access the stringbuf of
59 * a string when that string is freed (in order to decrease the
60 * reference count). The memory of the stringbuf might have been
61 * reused already for something completely different.
63 * This recounted worked for a small number of threads beating on
64 * cow-strings, but it failed randomly with more than 10 threads, say.
65 * I couldn't figure out what went wrong, so I used the conservative
66 * approach implemented below.
68 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
69 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
73 /* The size in words of the stringbuf header (type tag + size). */
74 #define STRINGBUF_HEADER_SIZE 2U
76 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
79 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
81 #define STRINGBUF_TAG scm_tc7_stringbuf
82 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
83 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
85 #define STRINGBUF_CONTENTS(buf) ((void *) \
86 SCM_CELL_OBJECT_LOC (buf, \
87 STRINGBUF_HEADER_SIZE))
88 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
89 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
91 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
93 #define SET_STRINGBUF_SHARED(buf) \
96 /* Don't modify BUF if it's already marked as shared since it might be \
97 a read-only, statically allocated stringbuf. */ \
98 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
99 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
103 #ifdef SCM_STRING_LENGTH_HISTOGRAM
104 static size_t lenhist
[1001];
107 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
110 make_stringbuf (size_t len
)
112 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
113 scm_i_symbol_chars, all stringbufs are null-terminated. Once
114 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
115 has been changed for scm_i_symbol_chars, this null-termination
121 #ifdef SCM_STRING_LENGTH_HISTOGRAM
128 buf
= PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ len
+ 1,
131 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
);
132 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
134 STRINGBUF_CHARS (buf
)[len
] = 0;
139 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
142 make_wide_stringbuf (size_t len
)
147 #ifdef SCM_STRING_LENGTH_HISTOGRAM
154 raw_len
= (len
+ 1) * sizeof (scm_t_wchar
);
155 buf
= PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ raw_len
,
158 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
| STRINGBUF_F_WIDE
);
159 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
161 STRINGBUF_WIDE_CHARS (buf
)[len
] = 0;
166 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
167 characters from BUF. */
169 wide_stringbuf (SCM buf
)
173 if (STRINGBUF_WIDE (buf
))
180 len
= STRINGBUF_LENGTH (buf
);
182 new_buf
= make_wide_stringbuf (len
);
184 mem
= STRINGBUF_WIDE_CHARS (new_buf
);
185 for (i
= 0; i
< len
; i
++)
186 mem
[i
] = (scm_t_wchar
) STRINGBUF_CHARS (buf
)[i
];
193 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
194 characters from BUF, if possible. */
196 narrow_stringbuf (SCM buf
)
200 if (!STRINGBUF_WIDE (buf
))
208 len
= STRINGBUF_LENGTH (buf
);
209 wmem
= STRINGBUF_WIDE_CHARS (buf
);
211 for (i
= 0; i
< len
; i
++)
213 /* BUF cannot be narrowed. */
216 new_buf
= make_stringbuf (len
);
218 mem
= STRINGBUF_CHARS (new_buf
);
219 for (i
= 0; i
< len
; i
++)
220 mem
[i
] = (unsigned char) wmem
[i
];
227 scm_i_pthread_mutex_t stringbuf_write_mutex
= SCM_I_PTHREAD_MUTEX_INITIALIZER
;
230 /* Copy-on-write strings.
233 #define STRING_TAG scm_tc7_string
235 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
236 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
237 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
239 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
240 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242 #define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
244 /* Read-only strings.
247 #define RO_STRING_TAG scm_tc7_ro_string
248 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250 /* Mutation-sharing substrings
253 #define SH_STRING_TAG (scm_tc7_string + 0x100)
255 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
256 /* START and LENGTH as for STRINGs. */
258 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
262 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
263 characters. CHARSP, if not NULL, will be set to location of the
266 scm_i_make_string (size_t len
, char **charsp
)
268 SCM buf
= make_stringbuf (len
);
271 *charsp
= (char *) STRINGBUF_CHARS (buf
);
272 res
= scm_double_cell (STRING_TAG
, SCM_UNPACK(buf
),
273 (scm_t_bits
)0, (scm_t_bits
) len
);
277 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
278 characters. CHARSP, if not NULL, will be set to location of the
281 scm_i_make_wide_string (size_t len
, scm_t_wchar
**charsp
)
283 SCM buf
= make_wide_stringbuf (len
);
286 *charsp
= STRINGBUF_WIDE_CHARS (buf
);
287 res
= scm_double_cell (STRING_TAG
, SCM_UNPACK (buf
),
288 (scm_t_bits
) 0, (scm_t_bits
) len
);
293 validate_substring_args (SCM str
, size_t start
, size_t end
)
295 if (!IS_STRING (str
))
296 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
297 if (start
> STRING_LENGTH (str
))
298 scm_out_of_range (NULL
, scm_from_size_t (start
));
299 if (end
> STRING_LENGTH (str
) || end
< start
)
300 scm_out_of_range (NULL
, scm_from_size_t (end
));
304 get_str_buf_start (SCM
*str
, SCM
*buf
, size_t *start
)
306 *start
= STRING_START (*str
);
307 if (IS_SH_STRING (*str
))
309 *str
= SH_STRING_STRING (*str
);
310 *start
+= STRING_START (*str
);
312 *buf
= STRING_STRINGBUF (*str
);
316 scm_i_substring (SCM str
, size_t start
, size_t end
)
320 get_str_buf_start (&str
, &buf
, &str_start
);
321 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
322 SET_STRINGBUF_SHARED (buf
);
323 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
324 return scm_double_cell (STRING_TAG
, SCM_UNPACK(buf
),
325 (scm_t_bits
)str_start
+ start
,
326 (scm_t_bits
) end
- start
);
330 scm_i_substring_read_only (SCM str
, size_t start
, size_t end
)
334 get_str_buf_start (&str
, &buf
, &str_start
);
335 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
336 SET_STRINGBUF_SHARED (buf
);
337 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
338 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK(buf
),
339 (scm_t_bits
)str_start
+ start
,
340 (scm_t_bits
) end
- start
);
344 scm_i_substring_copy (SCM str
, size_t start
, size_t end
)
346 size_t len
= end
- start
;
349 get_str_buf_start (&str
, &buf
, &str_start
);
350 if (scm_i_is_narrow_string (str
))
352 my_buf
= make_stringbuf (len
);
353 memcpy (STRINGBUF_CHARS (my_buf
),
354 STRINGBUF_CHARS (buf
) + str_start
+ start
, len
);
358 my_buf
= make_wide_stringbuf (len
);
359 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (my_buf
),
360 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
) + str_start
362 /* Even though this string is wide, the substring may be narrow.
363 Consider adding code to narrow the string. */
365 scm_remember_upto_here_1 (buf
);
366 return scm_double_cell (STRING_TAG
, SCM_UNPACK (my_buf
),
367 (scm_t_bits
) 0, (scm_t_bits
) len
);
371 scm_i_substring_shared (SCM str
, size_t start
, size_t end
)
373 if (start
== 0 && end
== STRING_LENGTH (str
))
377 size_t len
= end
- start
;
378 if (IS_SH_STRING (str
))
380 start
+= STRING_START (str
);
381 str
= SH_STRING_STRING (str
);
383 return scm_double_cell (SH_STRING_TAG
, SCM_UNPACK(str
),
384 (scm_t_bits
)start
, (scm_t_bits
) len
);
389 scm_c_substring (SCM str
, size_t start
, size_t end
)
391 validate_substring_args (str
, start
, end
);
392 return scm_i_substring (str
, start
, end
);
396 scm_c_substring_read_only (SCM str
, size_t start
, size_t end
)
398 validate_substring_args (str
, start
, end
);
399 return scm_i_substring_read_only (str
, start
, end
);
403 scm_c_substring_copy (SCM str
, size_t start
, size_t end
)
405 validate_substring_args (str
, start
, end
);
406 return scm_i_substring_copy (str
, start
, end
);
410 scm_c_substring_shared (SCM str
, size_t start
, size_t end
)
412 validate_substring_args (str
, start
, end
);
413 return scm_i_substring_shared (str
, start
, end
);
417 /* Internal accessors
420 /* Returns the number of characters in STR. This may be different
421 than the memory size of the string storage. */
423 scm_i_string_length (SCM str
)
425 return STRING_LENGTH (str
);
428 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
429 encoding. False if it is 'wide', having a 32-bit UCS-4
432 scm_i_is_narrow_string (SCM str
)
434 return !STRINGBUF_WIDE (STRING_STRINGBUF (str
));
437 /* Try to coerce a string to be narrow. It if is narrow already, do
438 nothing. If it is wide, shrink it to narrow if none of its
439 characters are above 0xFF. Return true if the string is narrow or
440 was made to be narrow. */
442 scm_i_try_narrow_string (SCM str
)
444 SET_STRING_STRINGBUF (str
, narrow_stringbuf (STRING_STRINGBUF (str
)));
446 return scm_i_is_narrow_string (str
);
449 /* Return a pointer to the raw data of the string, which can be either Latin-1
450 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
452 scm_i_string_data (SCM str
)
458 get_str_buf_start (&str
, &buf
, &start
);
460 data
= STRINGBUF_CONTENTS (buf
);
461 data
+= start
* (scm_i_is_narrow_string (str
) ? 1 : 4);
466 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
469 scm_i_string_chars (SCM str
)
473 get_str_buf_start (&str
, &buf
, &start
);
474 if (scm_i_is_narrow_string (str
))
475 return (const char *) STRINGBUF_CHARS (buf
) + start
;
477 scm_misc_error (NULL
, "Invalid read access of chars of wide string: ~s",
482 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
485 scm_i_string_wide_chars (SCM str
)
490 get_str_buf_start (&str
, &buf
, &start
);
491 if (!scm_i_is_narrow_string (str
))
492 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
) + start
;
494 scm_misc_error (NULL
, "Invalid read access of chars of narrow string: ~s",
498 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
499 a new string buffer, so that it can be modified without modifying
500 other strings. Also, lock the string mutex. Later, one must call
501 scm_i_string_stop_writing to unlock the mutex. */
503 scm_i_string_start_writing (SCM orig_str
)
505 SCM buf
, str
= orig_str
;
508 get_str_buf_start (&str
, &buf
, &start
);
509 if (IS_RO_STRING (str
))
510 scm_misc_error (NULL
, "string is read-only: ~s", scm_list_1 (orig_str
));
512 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
513 if (STRINGBUF_SHARED (buf
))
515 /* Clone the stringbuf. */
516 size_t len
= STRING_LENGTH (str
);
519 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
521 if (scm_i_is_narrow_string (str
))
523 new_buf
= make_stringbuf (len
);
524 memcpy (STRINGBUF_CHARS (new_buf
),
525 STRINGBUF_CHARS (buf
) + STRING_START (str
), len
);
530 new_buf
= make_wide_stringbuf (len
);
531 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
532 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
)
533 + STRING_START (str
)), len
);
536 SET_STRING_STRINGBUF (str
, new_buf
);
537 start
-= STRING_START (str
);
539 /* FIXME: The following operations are not atomic, so other threads
540 looking at STR may see an inconsistent state. Nevertheless it can't
541 hurt much since (i) accessing STR while it is being mutated can't
542 yield a crash, and (ii) concurrent accesses to STR should be
543 protected by a mutex at the application level. The latter may not
544 apply when STR != ORIG_STR, though. */
545 SET_STRING_START (str
, 0);
546 SET_STRING_STRINGBUF (str
, new_buf
);
550 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
555 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
557 scm_i_string_writable_chars (SCM str
)
562 get_str_buf_start (&str
, &buf
, &start
);
563 if (scm_i_is_narrow_string (str
))
564 return (char *) STRINGBUF_CHARS (buf
) + start
;
566 scm_misc_error (NULL
, "Invalid write access of chars of wide string: ~s",
571 /* Return a pointer to the UCS-4 codepoints of a string. */
573 scm_i_string_writable_wide_chars (SCM str
)
578 get_str_buf_start (&str
, &buf
, &start
);
579 if (!scm_i_is_narrow_string (str
))
580 return STRINGBUF_WIDE_CHARS (buf
) + start
;
582 scm_misc_error (NULL
, "Invalid write access of chars of narrow string: ~s",
586 /* Unlock the string mutex that was locked when
587 scm_i_string_start_writing was called. */
589 scm_i_string_stop_writing (void)
591 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
594 /* Return the Xth character of STR as a UCS-4 codepoint. */
596 scm_i_string_ref (SCM str
, size_t x
)
598 if (scm_i_is_narrow_string (str
))
599 return (scm_t_wchar
) (unsigned char) (scm_i_string_chars (str
)[x
]);
601 return scm_i_string_wide_chars (str
)[x
];
604 /* Returns index+1 of the first char in STR that matches C, or
605 0 if the char is not found. */
607 scm_i_string_contains_char (SCM str
, char ch
)
610 size_t len
= scm_i_string_length (str
);
613 if (scm_i_is_narrow_string (str
))
617 if (scm_i_string_chars (str
)[i
] == ch
)
626 if (scm_i_string_wide_chars (str
)[i
]
627 == (unsigned char) ch
)
636 scm_i_string_strcmp (SCM sstr
, size_t start_x
, const char *cstr
)
638 if (scm_i_is_narrow_string (sstr
))
640 const char *a
= scm_i_string_chars (sstr
) + start_x
;
641 const char *b
= cstr
;
642 return strncmp (a
, b
, strlen(b
));
647 const scm_t_wchar
*a
= scm_i_string_wide_chars (sstr
) + start_x
;
648 const char *b
= cstr
;
649 for (i
= 0; i
< strlen (b
); i
++)
651 if (a
[i
] != (unsigned char) b
[i
])
658 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
660 scm_i_string_set_x (SCM str
, size_t p
, scm_t_wchar chr
)
662 if (chr
> 0xFF && scm_i_is_narrow_string (str
))
663 SET_STRING_STRINGBUF (str
, wide_stringbuf (STRING_STRINGBUF (str
)));
665 if (scm_i_is_narrow_string (str
))
667 char *dst
= scm_i_string_writable_chars (str
);
672 scm_t_wchar
*dst
= scm_i_string_writable_wide_chars (str
);
680 Basic symbol creation and accessing is done here, the rest is in
681 symbols.[hc]. This has been done to keep stringbufs and the
682 internals of strings and string-like objects confined to this file.
685 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
688 scm_i_make_symbol (SCM name
, scm_t_bits flags
,
689 unsigned long hash
, SCM props
)
692 size_t start
= STRING_START (name
);
693 size_t length
= STRING_LENGTH (name
);
695 if (IS_SH_STRING (name
))
697 name
= SH_STRING_STRING (name
);
698 start
+= STRING_START (name
);
700 buf
= SYMBOL_STRINGBUF (name
);
702 if (start
== 0 && length
== STRINGBUF_LENGTH (buf
))
705 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
706 SET_STRINGBUF_SHARED (buf
);
707 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
712 if (scm_i_is_narrow_string (name
))
714 SCM new_buf
= make_stringbuf (length
);
715 memcpy (STRINGBUF_CHARS (new_buf
),
716 STRINGBUF_CHARS (buf
) + start
, length
);
721 SCM new_buf
= make_wide_stringbuf (length
);
722 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
723 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
) + start
,
728 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
729 (scm_t_bits
) hash
, SCM_UNPACK (props
));
733 scm_i_c_make_symbol (const char *name
, size_t len
,
734 scm_t_bits flags
, unsigned long hash
, SCM props
)
736 SCM buf
= make_stringbuf (len
);
737 memcpy (STRINGBUF_CHARS (buf
), name
, len
);
739 return scm_immutable_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
740 (scm_t_bits
) hash
, SCM_UNPACK (props
));
743 /* Returns the number of characters in SYM. This may be different
744 from the memory size of SYM. */
746 scm_i_symbol_length (SCM sym
)
748 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
752 scm_c_symbol_length (SCM sym
)
753 #define FUNC_NAME "scm_c_symbol_length"
755 SCM_VALIDATE_SYMBOL (1, sym
);
757 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
761 /* True if the name of SYM is stored as a Latin-1 encoded string.
762 False if it is stored as a 32-bit UCS-4-encoded string. */
764 scm_i_is_narrow_symbol (SCM sym
)
768 buf
= SYMBOL_STRINGBUF (sym
);
769 return !STRINGBUF_WIDE (buf
);
772 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
773 contains the name of SYM. */
775 scm_i_symbol_chars (SCM sym
)
779 buf
= SYMBOL_STRINGBUF (sym
);
780 if (!STRINGBUF_WIDE (buf
))
781 return (const char *) STRINGBUF_CHARS (buf
);
783 scm_misc_error (NULL
, "Invalid access of chars of a wide symbol ~S",
787 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
790 scm_i_symbol_wide_chars (SCM sym
)
794 buf
= SYMBOL_STRINGBUF (sym
);
795 if (STRINGBUF_WIDE (buf
))
796 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
);
798 scm_misc_error (NULL
, "Invalid access of chars of a narrow symbol ~S",
803 scm_i_symbol_substring (SCM sym
, size_t start
, size_t end
)
805 SCM buf
= SYMBOL_STRINGBUF (sym
);
806 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
807 SET_STRINGBUF_SHARED (buf
);
808 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
809 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK (buf
),
810 (scm_t_bits
)start
, (scm_t_bits
) end
- start
);
813 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
815 scm_i_symbol_ref (SCM sym
, size_t x
)
817 if (scm_i_is_narrow_symbol (sym
))
818 return (scm_t_wchar
) (unsigned char) (scm_i_symbol_chars (sym
)[x
]);
820 return scm_i_symbol_wide_chars (sym
)[x
];
826 SCM_DEFINE (scm_sys_string_dump
, "%string-dump", 1, 0, 0, (SCM str
),
827 "Returns an association list containing debugging information\n"
828 "for @var{str}. The association list has the following entries."
831 "The string itself.\n"
833 "The start index of the string into its stringbuf\n"
835 "The length of the string\n"
837 "If this string is a substring, it returns its parent string.\n"
838 "Otherwise, it returns @code{#f}\n"
840 "@code{#t} if the string is read-only\n"
841 "@item stringbuf-chars\n"
842 "A new string containing this string's stringbuf's characters\n"
843 "@item stringbuf-length\n"
844 "The number of characters in this stringbuf\n"
845 "@item stringbuf-shared\n"
846 "@code{#t} if this stringbuf is shared\n"
847 "@item stringbuf-wide\n"
848 "@code{#t} if this stringbuf's characters are stored in a\n"
849 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
852 #define FUNC_NAME s_scm_sys_string_dump
854 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
;
856 SCM_VALIDATE_STRING (1, str
);
859 e1
= scm_cons (scm_from_locale_symbol ("string"),
861 e2
= scm_cons (scm_from_locale_symbol ("start"),
862 scm_from_size_t (STRING_START (str
)));
863 e3
= scm_cons (scm_from_locale_symbol ("length"),
864 scm_from_size_t (STRING_LENGTH (str
)));
866 if (IS_SH_STRING (str
))
868 e4
= scm_cons (scm_from_locale_symbol ("shared"),
869 SH_STRING_STRING (str
));
870 buf
= STRING_STRINGBUF (SH_STRING_STRING (str
));
874 e4
= scm_cons (scm_from_locale_symbol ("shared"),
876 buf
= STRING_STRINGBUF (str
);
879 if (IS_RO_STRING (str
))
880 e5
= scm_cons (scm_from_locale_symbol ("read-only"),
883 e5
= scm_cons (scm_from_locale_symbol ("read-only"),
887 if (!STRINGBUF_WIDE (buf
))
889 size_t len
= STRINGBUF_LENGTH (buf
);
891 SCM sbc
= scm_i_make_string (len
, &cbuf
);
892 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
893 e6
= scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
898 size_t len
= STRINGBUF_LENGTH (buf
);
900 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
);
901 u32_cpy ((scm_t_uint32
*) cbuf
,
902 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
903 e6
= scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
906 e7
= scm_cons (scm_from_locale_symbol ("stringbuf-length"),
907 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
908 if (STRINGBUF_SHARED (buf
))
909 e8
= scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
912 e8
= scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
914 if (STRINGBUF_WIDE (buf
))
915 e9
= scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
918 e9
= scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
921 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
, SCM_UNDEFINED
);
925 SCM_DEFINE (scm_sys_symbol_dump
, "%symbol-dump", 1, 0, 0, (SCM sym
),
926 "Returns an association list containing debugging information\n"
927 "for @var{sym}. The association list has the following entries."
930 "The symbol itself\n"
934 "@code{#t} if it is an interned symbol\n"
935 "@item stringbuf-chars\n"
936 "A new string containing this symbols's stringbuf's characters\n"
937 "@item stringbuf-length\n"
938 "The number of characters in this stringbuf\n"
939 "@item stringbuf-shared\n"
940 "@code{#t} if this stringbuf is shared\n"
941 "@item stringbuf-wide\n"
942 "@code{#t} if this stringbuf's characters are stored in a\n"
943 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
946 #define FUNC_NAME s_scm_sys_symbol_dump
948 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
;
950 SCM_VALIDATE_SYMBOL (1, sym
);
951 e1
= scm_cons (scm_from_locale_symbol ("symbol"),
953 e2
= scm_cons (scm_from_locale_symbol ("hash"),
954 scm_from_ulong (scm_i_symbol_hash (sym
)));
955 e3
= scm_cons (scm_from_locale_symbol ("interned"),
956 scm_symbol_interned_p (sym
));
957 buf
= SYMBOL_STRINGBUF (sym
);
960 if (!STRINGBUF_WIDE (buf
))
962 size_t len
= STRINGBUF_LENGTH (buf
);
964 SCM sbc
= scm_i_make_string (len
, &cbuf
);
965 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
966 e4
= scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
971 size_t len
= STRINGBUF_LENGTH (buf
);
973 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
);
974 u32_cpy ((scm_t_uint32
*) cbuf
,
975 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
976 e4
= scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
979 e5
= scm_cons (scm_from_locale_symbol ("stringbuf-length"),
980 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
981 if (STRINGBUF_SHARED (buf
))
982 e6
= scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
985 e6
= scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
987 if (STRINGBUF_WIDE (buf
))
988 e7
= scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
991 e7
= scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
993 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, SCM_UNDEFINED
);
998 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1000 SCM_DEFINE (scm_sys_stringbuf_hist
, "%stringbuf-hist", 0, 0, 0, (void), "")
1001 #define FUNC_NAME s_scm_sys_stringbuf_hist
1004 for (i
= 0; i
< 1000; i
++)
1006 fprintf (stderr
, " %3d: %u\n", i
, lenhist
[i
]);
1007 fprintf (stderr
, ">999: %u\n", lenhist
[1000]);
1008 return SCM_UNSPECIFIED
;
1016 SCM_DEFINE (scm_string_p
, "string?", 1, 0, 0,
1018 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1019 #define FUNC_NAME s_scm_string_p
1021 return scm_from_bool (IS_STRING (obj
));
1026 SCM_REGISTER_PROC (s_scm_list_to_string
, "list->string", 1, 0, 0, scm_string
);
1028 SCM_DEFINE (scm_string
, "string", 0, 0, 1,
1030 "@deffnx {Scheme Procedure} list->string chrs\n"
1031 "Return a newly allocated string composed of the arguments,\n"
1033 #define FUNC_NAME s_scm_string
1035 SCM result
= SCM_BOOL_F
;
1042 /* Verify that this is a list of chars. */
1043 i
= scm_ilength (chrs
);
1044 SCM_ASSERT (i
>= 0, chrs
, SCM_ARG1
, FUNC_NAME
);
1049 while (len
> 0 && scm_is_pair (rest
))
1051 SCM elt
= SCM_CAR (rest
);
1052 SCM_VALIDATE_CHAR (SCM_ARGn
, elt
);
1053 if (SCM_CHAR (elt
) > 0xFF)
1055 rest
= SCM_CDR (rest
);
1057 scm_remember_upto_here_1 (elt
);
1060 /* Construct a string containing this list of chars. */
1068 result
= scm_i_make_string (len
, NULL
);
1069 result
= scm_i_string_start_writing (result
);
1070 buf
= scm_i_string_writable_chars (result
);
1071 while (len
> 0 && scm_is_pair (rest
))
1073 SCM elt
= SCM_CAR (rest
);
1074 buf
[p
] = (unsigned char) SCM_CHAR (elt
);
1076 rest
= SCM_CDR (rest
);
1078 scm_remember_upto_here_1 (elt
);
1085 result
= scm_i_make_wide_string (len
, NULL
);
1086 result
= scm_i_string_start_writing (result
);
1087 buf
= scm_i_string_writable_wide_chars (result
);
1088 while (len
> 0 && scm_is_pair (rest
))
1090 SCM elt
= SCM_CAR (rest
);
1091 buf
[p
] = SCM_CHAR (elt
);
1093 rest
= SCM_CDR (rest
);
1095 scm_remember_upto_here_1 (elt
);
1098 scm_i_string_stop_writing ();
1101 scm_misc_error (NULL
, "list changed while constructing string", SCM_EOL
);
1102 if (!scm_is_null (rest
))
1103 scm_wrong_type_arg_msg (NULL
, 0, chrs
, "proper list");
1109 SCM_DEFINE (scm_make_string
, "make-string", 1, 1, 0,
1111 "Return a newly allocated string of\n"
1112 "length @var{k}. If @var{chr} is given, then all elements of\n"
1113 "the string are initialized to @var{chr}, otherwise the contents\n"
1114 "of the @var{string} are unspecified.")
1115 #define FUNC_NAME s_scm_make_string
1117 return scm_c_make_string (scm_to_size_t (k
), chr
);
1122 scm_c_make_string (size_t len
, SCM chr
)
1123 #define FUNC_NAME NULL
1126 SCM res
= scm_i_make_string (len
, NULL
);
1128 if (!SCM_UNBNDP (chr
))
1130 SCM_VALIDATE_CHAR (0, chr
);
1131 res
= scm_i_string_start_writing (res
);
1132 for (p
= 0; p
< len
; p
++)
1133 scm_i_string_set_x (res
, p
, SCM_CHAR (chr
));
1134 scm_i_string_stop_writing ();
1141 SCM_DEFINE (scm_string_length
, "string-length", 1, 0, 0,
1143 "Return the number of characters in @var{string}.")
1144 #define FUNC_NAME s_scm_string_length
1146 SCM_VALIDATE_STRING (1, string
);
1147 return scm_from_size_t (STRING_LENGTH (string
));
1151 SCM_DEFINE (scm_string_bytes_per_char
, "string-bytes-per-char", 1, 0, 0,
1153 "Return the bytes used to represent a character in @var{string}."
1154 "This will return 1 or 4.")
1155 #define FUNC_NAME s_scm_string_bytes_per_char
1157 SCM_VALIDATE_STRING (1, string
);
1158 if (!scm_i_is_narrow_string (string
))
1159 return scm_from_int (4);
1161 return scm_from_int (1);
1166 scm_c_string_length (SCM string
)
1168 if (!IS_STRING (string
))
1169 scm_wrong_type_arg_msg (NULL
, 0, string
, "string");
1170 return STRING_LENGTH (string
);
1173 SCM_DEFINE (scm_string_ref
, "string-ref", 2, 0, 0,
1175 "Return character @var{k} of @var{str} using zero-origin\n"
1176 "indexing. @var{k} must be a valid index of @var{str}.")
1177 #define FUNC_NAME s_scm_string_ref
1182 SCM_VALIDATE_STRING (1, str
);
1184 len
= scm_i_string_length (str
);
1185 if (SCM_LIKELY (len
> 0))
1186 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1188 scm_out_of_range (NULL
, k
);
1190 if (scm_i_is_narrow_string (str
))
1191 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[idx
]);
1193 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[idx
]);
1198 scm_c_string_ref (SCM str
, size_t p
)
1200 if (p
>= scm_i_string_length (str
))
1201 scm_out_of_range (NULL
, scm_from_size_t (p
));
1202 if (scm_i_is_narrow_string (str
))
1203 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[p
]);
1205 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[p
]);
1209 SCM_DEFINE (scm_string_set_x
, "string-set!", 3, 0, 0,
1210 (SCM str
, SCM k
, SCM chr
),
1211 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1212 "an unspecified value. @var{k} must be a valid index of\n"
1214 #define FUNC_NAME s_scm_string_set_x
1219 SCM_VALIDATE_STRING (1, str
);
1221 len
= scm_i_string_length (str
);
1222 if (SCM_LIKELY (len
> 0))
1223 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1225 scm_out_of_range (NULL
, k
);
1227 SCM_VALIDATE_CHAR (3, chr
);
1228 str
= scm_i_string_start_writing (str
);
1229 scm_i_string_set_x (str
, idx
, SCM_CHAR (chr
));
1230 scm_i_string_stop_writing ();
1232 return SCM_UNSPECIFIED
;
1237 scm_c_string_set_x (SCM str
, size_t p
, SCM chr
)
1239 if (p
>= scm_i_string_length (str
))
1240 scm_out_of_range (NULL
, scm_from_size_t (p
));
1241 str
= scm_i_string_start_writing (str
);
1242 scm_i_string_set_x (str
, p
, SCM_CHAR (chr
));
1243 scm_i_string_stop_writing ();
1246 SCM_DEFINE (scm_substring
, "substring", 2, 1, 0,
1247 (SCM str
, SCM start
, SCM end
),
1248 "Return a newly allocated string formed from the characters\n"
1249 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1250 "ending with index @var{end} (exclusive).\n"
1251 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1252 "exact integers satisfying:\n\n"
1253 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1254 #define FUNC_NAME s_scm_substring
1256 size_t len
, from
, to
;
1258 SCM_VALIDATE_STRING (1, str
);
1259 len
= scm_i_string_length (str
);
1260 from
= scm_to_unsigned_integer (start
, 0, len
);
1261 if (SCM_UNBNDP (end
))
1264 to
= scm_to_unsigned_integer (end
, from
, len
);
1265 return scm_i_substring (str
, from
, to
);
1269 SCM_DEFINE (scm_substring_read_only
, "substring/read-only", 2, 1, 0,
1270 (SCM str
, SCM start
, SCM end
),
1271 "Return a newly allocated string formed from the characters\n"
1272 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1273 "ending with index @var{end} (exclusive).\n"
1274 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1275 "exact integers satisfying:\n"
1277 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1279 "The returned string is read-only.\n")
1280 #define FUNC_NAME s_scm_substring_read_only
1282 size_t len
, from
, to
;
1284 SCM_VALIDATE_STRING (1, str
);
1285 len
= scm_i_string_length (str
);
1286 from
= scm_to_unsigned_integer (start
, 0, len
);
1287 if (SCM_UNBNDP (end
))
1290 to
= scm_to_unsigned_integer (end
, from
, len
);
1291 return scm_i_substring_read_only (str
, from
, to
);
1295 SCM_DEFINE (scm_substring_copy
, "substring/copy", 2, 1, 0,
1296 (SCM str
, SCM start
, SCM end
),
1297 "Return a newly allocated string formed from the characters\n"
1298 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1299 "ending with index @var{end} (exclusive).\n"
1300 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1301 "exact integers satisfying:\n\n"
1302 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1303 #define FUNC_NAME s_scm_substring_copy
1305 /* For the Scheme version, START is mandatory, but for the C
1306 version, it is optional. See scm_string_copy in srfi-13.c for a
1312 SCM_VALIDATE_STRING (1, str
);
1313 scm_i_get_substring_spec (scm_i_string_length (str
),
1314 start
, &from
, end
, &to
);
1315 return scm_i_substring_copy (str
, from
, to
);
1319 SCM_DEFINE (scm_substring_shared
, "substring/shared", 2, 1, 0,
1320 (SCM str
, SCM start
, SCM end
),
1321 "Return string that indirectly refers to the characters\n"
1322 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1323 "ending with index @var{end} (exclusive).\n"
1324 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1325 "exact integers satisfying:\n\n"
1326 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1327 #define FUNC_NAME s_scm_substring_shared
1329 size_t len
, from
, to
;
1331 SCM_VALIDATE_STRING (1, str
);
1332 len
= scm_i_string_length (str
);
1333 from
= scm_to_unsigned_integer (start
, 0, len
);
1334 if (SCM_UNBNDP (end
))
1337 to
= scm_to_unsigned_integer (end
, from
, len
);
1338 return scm_i_substring_shared (str
, from
, to
);
1342 SCM_DEFINE (scm_string_append
, "string-append", 0, 0, 1,
1344 "Return a newly allocated string whose characters form the\n"
1345 "concatenation of the given strings, @var{args}.")
1346 #define FUNC_NAME s_scm_string_append
1359 SCM_VALIDATE_REST_ARGUMENT (args
);
1360 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1363 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1364 len
+= scm_i_string_length (s
);
1365 if (!scm_i_is_narrow_string (s
))
1370 res
= scm_i_make_string (len
, &data
.narrow
);
1372 res
= scm_i_make_wide_string (len
, &data
.wide
);
1374 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1378 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1379 len
= scm_i_string_length (s
);
1382 memcpy (data
.narrow
, scm_i_string_chars (s
), len
);
1387 if (scm_i_is_narrow_string (s
))
1389 for (i
= 0; i
< scm_i_string_length (s
); i
++)
1390 data
.wide
[i
] = (unsigned char) scm_i_string_chars (s
)[i
];
1393 u32_cpy ((scm_t_uint32
*) data
.wide
,
1394 (scm_t_uint32
*) scm_i_string_wide_chars (s
), len
);
1397 scm_remember_upto_here_1 (s
);
1404 scm_is_string (SCM obj
)
1406 return IS_STRING (obj
);
1410 /* Conversion to/from other encodings. */
1412 SCM_SYMBOL (scm_encoding_error_key
, "encoding-error");
1414 scm_encoding_error (const char *subr
, int err
, const char *message
,
1415 const char *from
, const char *to
, SCM string_or_bv
)
1417 /* Raise an exception that conveys all the information needed to debug the
1418 problem. Only perform locale conversions that are safe; in particular,
1419 don't try to display STRING_OR_BV when it's a string since converting it to
1420 the output locale may fail. */
1421 scm_throw (scm_encoding_error_key
,
1422 scm_list_n (scm_from_locale_string (subr
),
1423 scm_from_locale_string (message
),
1425 scm_from_locale_string (from
),
1426 scm_from_locale_string (to
),
1432 scm_from_stringn (const char *str
, size_t len
, const char *encoding
,
1433 scm_t_string_failed_conversion_handler handler
)
1443 if (encoding
== NULL
)
1445 /* If encoding is null, use Latin-1. */
1447 res
= scm_i_make_string (len
, &buf
);
1448 memcpy (buf
, str
, len
);
1453 u32
= (scm_t_wchar
*) u32_conv_from_encoding (encoding
,
1454 (enum iconv_ilseq_handler
)
1460 if (SCM_UNLIKELY (u32
== NULL
))
1462 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1467 buf
= scm_gc_malloc_pointerless (len
, "bytevector");
1468 memcpy (buf
, str
, len
);
1469 bv
= scm_c_take_bytevector (buf
, len
);
1471 scm_encoding_error (__func__
, errno
,
1472 "input locale conversion error",
1473 encoding
, "UTF-32", bv
);
1478 if (u32
[i
++] > 0xFF)
1487 res
= scm_i_make_string (u32len
, &dst
);
1488 for (i
= 0; i
< u32len
; i
++)
1489 dst
[i
] = (unsigned char) u32
[i
];
1495 res
= scm_i_make_wide_string (u32len
, &wdst
);
1496 u32_cpy ((scm_t_uint32
*) wdst
, (scm_t_uint32
*) u32
, u32len
);
1505 scm_from_latin1_stringn (const char *str
, size_t len
)
1507 return scm_from_stringn (str
, len
, NULL
, SCM_FAILED_CONVERSION_ERROR
);
1511 scm_from_locale_stringn (const char *str
, size_t len
)
1514 scm_t_string_failed_conversion_handler hndl
;
1518 if (len
== (size_t) -1)
1523 inport
= scm_current_input_port ();
1524 if (!SCM_UNBNDP (inport
) && SCM_OPINPORTP (inport
))
1526 pt
= SCM_PTAB_ENTRY (inport
);
1528 hndl
= pt
->ilseq_handler
;
1533 hndl
= SCM_FAILED_CONVERSION_ERROR
;
1536 return scm_from_stringn (str
, len
, enc
, hndl
);
1540 scm_from_locale_string (const char *str
)
1545 return scm_from_locale_stringn (str
, -1);
1549 scm_i_from_utf8_string (const scm_t_uint8
*str
)
1551 return scm_from_stringn ((const char *) str
,
1552 strlen ((char *) str
), "UTF-8",
1553 SCM_FAILED_CONVERSION_ERROR
);
1556 /* Create a new scheme string from the C string STR. The memory of
1557 STR may be used directly as storage for the new string. */
1558 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1559 would be to register a finalizer to eventually free(3) STR, which isn't
1560 worth it. Should we just deprecate the `scm_take_' functions? */
1562 scm_take_locale_stringn (char *str
, size_t len
)
1566 res
= scm_from_locale_stringn (str
, len
);
1573 scm_take_locale_string (char *str
)
1575 return scm_take_locale_stringn (str
, -1);
1578 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1579 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1580 Set *LENP to the size of the resulting string. */
1582 scm_i_unistring_escapes_to_guile_escapes (char *buf
, size_t *lenp
)
1584 char *before
, *after
;
1593 if ((i
<= *lenp
- 6)
1594 && before
[i
] == '\\'
1595 && before
[i
+ 1] == 'u'
1596 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1598 /* Convert \u00NN to \xNN */
1601 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1602 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1606 else if ((i
<= *lenp
- 10)
1607 && before
[i
] == '\\'
1608 && before
[i
+ 1] == 'U'
1609 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1611 /* Convert \U00NNNNNN to \UNNNNNN */
1614 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1615 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1616 after
[j
+ 4] = tolower ((int) before
[i
+ 6]);
1617 after
[j
+ 5] = tolower ((int) before
[i
+ 7]);
1618 after
[j
+ 6] = tolower ((int) before
[i
+ 8]);
1619 after
[j
+ 7] = tolower ((int) before
[i
+ 9]);
1625 after
[j
] = before
[i
];
1633 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1634 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1635 of the resulting string. BUF must be large enough to handle the
1636 worst case when `\uXXXX' escapes (6 characters) are replaced by
1637 `\xXXXX;' (7 characters). */
1639 scm_i_unistring_escapes_to_r6rs_escapes (char *buf
, size_t *lenp
)
1641 char *before
, *after
;
1643 /* The worst case is if the input string contains all 4-digit hex escapes.
1644 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1645 size_t max_out_len
= (*lenp
* 7) / 6 + 1;
1646 size_t nzeros
, ndigits
;
1649 after
= alloca (max_out_len
);
1654 if (((i
<= *lenp
- 6) && before
[i
] == '\\' && before
[i
+ 1] == 'u')
1655 || ((i
<= *lenp
- 10) && before
[i
] == '\\' && before
[i
+ 1] == 'U'))
1657 if (before
[i
+ 1] == 'u')
1659 else if (before
[i
+ 1] == 'U')
1664 /* Add the R6RS hex escape initial sequence. */
1668 /* Move string positions to the start of the hex numbers. */
1672 /* Find the number of initial zeros in this hex number. */
1674 while (before
[i
+ nzeros
] == '0' && nzeros
< ndigits
)
1677 /* Copy the number, skipping initial zeros, and then move the string
1679 if (nzeros
== ndigits
)
1688 for (pos
= 0; pos
< ndigits
- nzeros
; pos
++)
1689 after
[j
+ pos
] = tolower ((int) before
[i
+ nzeros
+ pos
]);
1691 j
+= (ndigits
- nzeros
);
1694 /* Add terminating semicolon. */
1700 after
[j
] = before
[i
];
1706 memcpy (before
, after
, j
);
1710 scm_to_latin1_stringn (SCM str
, size_t *lenp
)
1712 return scm_to_stringn (str
, lenp
, NULL
, SCM_FAILED_CONVERSION_ERROR
);
1716 scm_to_locale_stringn (SCM str
, size_t *lenp
)
1722 outport
= scm_current_output_port ();
1723 if (!SCM_UNBNDP (outport
) && SCM_OPOUTPORTP (outport
))
1725 pt
= SCM_PTAB_ENTRY (outport
);
1731 return scm_to_stringn (str
, lenp
,
1733 scm_i_get_conversion_strategy (SCM_BOOL_F
));
1736 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1737 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1738 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1739 defined by HANDLER. */
1741 scm_to_stringn (SCM str
, size_t *lenp
, const char *encoding
,
1742 scm_t_string_failed_conversion_handler handler
)
1745 size_t ilen
, len
, i
;
1749 if (!scm_is_string (str
))
1750 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
1751 ilen
= scm_i_string_length (str
);
1755 buf
= scm_malloc (1);
1763 for (i
= 0; i
< ilen
; i
++)
1764 if (scm_i_string_ref (str
, i
) == '\0')
1765 scm_misc_error (NULL
,
1766 "string contains #\\nul character: ~S",
1769 if (scm_i_is_narrow_string (str
) && (encoding
== NULL
))
1771 /* If using native Latin-1 encoding, just copy the string
1775 buf
= scm_malloc (ilen
);
1776 memcpy (buf
, scm_i_string_chars (str
), ilen
);
1782 buf
= scm_malloc (ilen
+ 1);
1783 memcpy (buf
, scm_i_string_chars (str
), ilen
);
1795 if (scm_i_is_narrow_string (str
))
1797 ret
= mem_iconveh (scm_i_string_chars (str
), ilen
,
1799 (enum iconv_ilseq_handler
) handler
, NULL
,
1803 scm_encoding_error (__func__
, errno
,
1804 "cannot convert to output locale",
1805 "ISO-8859-1", enc
, str
);
1809 buf
= u32_conv_to_encoding (enc
,
1810 (enum iconv_ilseq_handler
) handler
,
1811 (scm_t_uint32
*) scm_i_string_wide_chars (str
),
1816 scm_encoding_error (__func__
, errno
,
1817 "cannot convert to output locale",
1818 "UTF-32", enc
, str
);
1820 if (handler
== SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE
)
1822 if (SCM_R6RS_ESCAPES_P
)
1824 /* The worst case is if the input string contains all 4-digit
1825 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
1826 (seven characters). Make BUF large enough to hold
1828 buf
= scm_realloc (buf
, (len
* 7) / 6 + 1);
1829 scm_i_unistring_escapes_to_r6rs_escapes (buf
, &len
);
1832 scm_i_unistring_escapes_to_guile_escapes (buf
, &len
);
1834 buf
= scm_realloc (buf
, len
);
1840 buf
= scm_realloc (buf
, len
+ 1);
1844 scm_remember_upto_here_1 (str
);
1849 scm_to_locale_string (SCM str
)
1851 return scm_to_locale_stringn (str
, NULL
);
1855 scm_i_to_utf8_string (SCM str
)
1858 u8str
= scm_to_stringn (str
, NULL
, "UTF-8", SCM_FAILED_CONVERSION_ERROR
);
1859 return (scm_t_uint8
*) u8str
;
1863 scm_to_locale_stringbuf (SCM str
, char *buf
, size_t max_len
)
1866 char *result
= NULL
;
1867 if (!scm_is_string (str
))
1868 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
1869 result
= scm_to_locale_stringn (str
, &len
);
1871 memcpy (buf
, result
, (len
> max_len
) ? max_len
: len
);
1874 scm_remember_upto_here_1 (str
);
1879 /* Unicode string normalization. */
1881 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1882 libguile/i18n.c. It would be useful to have this factored out into a more
1883 convenient location, but its use of alloca makes that tricky to do. */
1886 normalize_str (SCM string
, uninorm_t form
)
1889 scm_t_uint32
*w_str
;
1891 size_t rlen
, len
= scm_i_string_length (string
);
1893 if (scm_i_is_narrow_string (string
))
1896 const char *buf
= scm_i_string_chars (string
);
1898 w_str
= alloca (sizeof (scm_t_wchar
) * (len
+ 1));
1900 for (i
= 0; i
< len
; i
++)
1901 w_str
[i
] = (unsigned char) buf
[i
];
1905 w_str
= (scm_t_uint32
*) scm_i_string_wide_chars (string
);
1907 w_str
= u32_normalize (form
, w_str
, len
, NULL
, &rlen
);
1909 ret
= scm_i_make_wide_string (rlen
, &cbuf
);
1910 u32_cpy ((scm_t_uint32
*) cbuf
, w_str
, rlen
);
1913 scm_i_try_narrow_string (ret
);
1918 SCM_DEFINE (scm_string_normalize_nfc
, "string-normalize-nfc", 1, 0, 0,
1920 "Returns the NFC normalized form of @var{string}.")
1921 #define FUNC_NAME s_scm_string_normalize_nfc
1923 SCM_VALIDATE_STRING (1, string
);
1924 return normalize_str (string
, UNINORM_NFC
);
1928 SCM_DEFINE (scm_string_normalize_nfd
, "string-normalize-nfd", 1, 0, 0,
1930 "Returns the NFD normalized form of @var{string}.")
1931 #define FUNC_NAME s_scm_string_normalize_nfd
1933 SCM_VALIDATE_STRING (1, string
);
1934 return normalize_str (string
, UNINORM_NFD
);
1938 SCM_DEFINE (scm_string_normalize_nfkc
, "string-normalize-nfkc", 1, 0, 0,
1940 "Returns the NFKC normalized form of @var{string}.")
1941 #define FUNC_NAME s_scm_string_normalize_nfkc
1943 SCM_VALIDATE_STRING (1, string
);
1944 return normalize_str (string
, UNINORM_NFKC
);
1948 SCM_DEFINE (scm_string_normalize_nfkd
, "string-normalize-nfkd", 1, 0, 0,
1950 "Returns the NFKD normalized form of @var{string}.")
1951 #define FUNC_NAME s_scm_string_normalize_nfkd
1953 SCM_VALIDATE_STRING (1, string
);
1954 return normalize_str (string
, UNINORM_NFKD
);
1958 /* converts C scm_array of strings to SCM scm_list of strings. */
1959 /* If argc < 0, a null terminated scm_array is assumed. */
1961 scm_makfromstrs (int argc
, char **argv
)
1966 for (i
= 0; argv
[i
]; i
++);
1968 lst
= scm_cons (scm_from_locale_string (argv
[i
]), lst
);
1972 /* Return a newly allocated array of char pointers to each of the strings
1973 in args, with a terminating NULL pointer. */
1976 scm_i_allocate_string_pointers (SCM list
)
1977 #define FUNC_NAME "scm_i_allocate_string_pointers"
1980 int len
= scm_ilength (list
);
1984 scm_wrong_type_arg_msg (NULL
, 0, list
, "proper list");
1986 result
= scm_gc_malloc ((len
+ 1) * sizeof (char *),
1990 /* The list might be have been modified in another thread, so
1991 we check LIST before each access.
1993 for (i
= 0; i
< len
&& scm_is_pair (list
); i
++)
1998 str
= SCM_CAR (list
);
1999 len
= scm_c_string_length (str
);
2001 result
[i
] = scm_gc_malloc_pointerless (len
+ 1, "string pointers");
2002 memcpy (result
[i
], scm_i_string_chars (str
), len
);
2003 result
[i
][len
] = '\0';
2005 list
= SCM_CDR (list
);
2013 scm_i_get_substring_spec (size_t len
,
2014 SCM start
, size_t *cstart
,
2015 SCM end
, size_t *cend
)
2017 if (SCM_UNBNDP (start
))
2020 *cstart
= scm_to_unsigned_integer (start
, 0, len
);
2022 if (SCM_UNBNDP (end
))
2025 *cend
= scm_to_unsigned_integer (end
, *cstart
, len
);
2028 #if SCM_ENABLE_DEPRECATED
2030 /* When these definitions are removed, it becomes reasonable to use
2031 read-only strings for string literals. For that, change the reader
2032 to create string literals with scm_c_substring_read_only instead of
2033 with scm_c_substring_copy.
2037 scm_i_deprecated_stringp (SCM str
)
2039 scm_c_issue_deprecation_warning
2040 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2042 return scm_is_string (str
);
2046 scm_i_deprecated_string_chars (SCM str
)
2050 scm_c_issue_deprecation_warning
2051 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2053 /* We don't accept shared substrings here since they are not
2056 if (IS_SH_STRING (str
))
2057 scm_misc_error (NULL
,
2058 "SCM_STRING_CHARS does not work with shared substrings",
2061 /* We explicitly test for read-only strings to produce a better
2065 if (IS_RO_STRING (str
))
2066 scm_misc_error (NULL
,
2067 "SCM_STRING_CHARS does not work with read-only strings",
2070 /* The following is still wrong, of course...
2072 str
= scm_i_string_start_writing (str
);
2073 chars
= scm_i_string_writable_chars (str
);
2074 scm_i_string_stop_writing ();
2079 scm_i_deprecated_string_length (SCM str
)
2081 scm_c_issue_deprecation_warning
2082 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2083 return scm_c_string_length (str
);
2089 string_handle_ref (scm_t_array_handle
*h
, size_t index
)
2091 return scm_c_string_ref (h
->array
, index
);
2095 string_handle_set (scm_t_array_handle
*h
, size_t index
, SCM val
)
2097 scm_c_string_set_x (h
->array
, index
, val
);
2101 string_get_handle (SCM v
, scm_t_array_handle
*h
)
2107 h
->dim0
.ubnd
= scm_c_string_length (v
) - 1;
2109 h
->element_type
= SCM_ARRAY_ELEMENT_TYPE_CHAR
;
2110 h
->elements
= h
->writable_elements
= NULL
;
2113 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string
, 0x7f,
2114 string_handle_ref
, string_handle_set
,
2116 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR
, scm_make_string
)
2121 scm_nullstr
= scm_i_make_string (0, NULL
);
2123 #include "libguile/strings.x"