1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "striconveh.h"
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
94 #define SET_STRINGBUF_SHARED(buf) \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist
[1001];
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
111 make_stringbuf (size_t len
)
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
129 buf
= PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ len
+ 1,
132 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
);
133 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
135 STRINGBUF_CHARS (buf
)[len
] = 0;
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
143 make_wide_stringbuf (size_t len
)
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
155 raw_len
= (len
+ 1) * sizeof (scm_t_wchar
);
156 buf
= PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ raw_len
,
159 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
| STRINGBUF_F_WIDE
);
160 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
162 STRINGBUF_WIDE_CHARS (buf
)[len
] = 0;
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
170 wide_stringbuf (SCM buf
)
174 if (STRINGBUF_WIDE (buf
))
181 len
= STRINGBUF_LENGTH (buf
);
183 new_buf
= make_wide_stringbuf (len
);
185 mem
= STRINGBUF_WIDE_CHARS (new_buf
);
186 for (i
= 0; i
< len
; i
++)
187 mem
[i
] = (scm_t_wchar
) STRINGBUF_CHARS (buf
)[i
];
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
197 narrow_stringbuf (SCM buf
)
201 if (!STRINGBUF_WIDE (buf
))
209 len
= STRINGBUF_LENGTH (buf
);
210 wmem
= STRINGBUF_WIDE_CHARS (buf
);
212 for (i
= 0; i
< len
; i
++)
214 /* BUF cannot be narrowed. */
217 new_buf
= make_stringbuf (len
);
219 mem
= STRINGBUF_CHARS (new_buf
);
220 for (i
= 0; i
< len
; i
++)
221 mem
[i
] = (unsigned char) wmem
[i
];
228 scm_i_pthread_mutex_t stringbuf_write_mutex
= SCM_I_PTHREAD_MUTEX_INITIALIZER
;
231 /* Copy-on-write strings.
234 #define STRING_TAG scm_tc7_string
236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
243 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
245 /* Read-only strings.
248 #define RO_STRING_TAG scm_tc7_ro_string
249 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
251 /* Mutation-sharing substrings
254 #define SH_STRING_TAG (scm_tc7_string + 0x100)
256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257 /* START and LENGTH as for STRINGs. */
259 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
268 scm_i_make_string (size_t len
, char **charsp
, int read_only_p
)
270 SCM buf
= make_stringbuf (len
);
273 *charsp
= (char *) STRINGBUF_CHARS (buf
);
274 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
276 (scm_t_bits
) 0, (scm_t_bits
) len
);
280 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
281 characters. CHARSP, if not NULL, will be set to location of the
282 character array. If READ_ONLY_P, the returned string is read-only;
283 otherwise it is writable. */
285 scm_i_make_wide_string (size_t len
, scm_t_wchar
**charsp
, int read_only_p
)
287 SCM buf
= make_wide_stringbuf (len
);
290 *charsp
= STRINGBUF_WIDE_CHARS (buf
);
291 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
293 (scm_t_bits
) 0, (scm_t_bits
) len
);
298 validate_substring_args (SCM str
, size_t start
, size_t end
)
300 if (!IS_STRING (str
))
301 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
302 if (start
> STRING_LENGTH (str
))
303 scm_out_of_range (NULL
, scm_from_size_t (start
));
304 if (end
> STRING_LENGTH (str
) || end
< start
)
305 scm_out_of_range (NULL
, scm_from_size_t (end
));
309 get_str_buf_start (SCM
*str
, SCM
*buf
, size_t *start
)
311 *start
= STRING_START (*str
);
312 if (IS_SH_STRING (*str
))
314 *str
= SH_STRING_STRING (*str
);
315 *start
+= STRING_START (*str
);
317 *buf
= STRING_STRINGBUF (*str
);
321 scm_i_substring (SCM str
, size_t start
, size_t end
)
325 get_str_buf_start (&str
, &buf
, &str_start
);
326 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
327 SET_STRINGBUF_SHARED (buf
);
328 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
329 return scm_double_cell (STRING_TAG
, SCM_UNPACK(buf
),
330 (scm_t_bits
)str_start
+ start
,
331 (scm_t_bits
) end
- start
);
335 scm_i_substring_read_only (SCM str
, size_t start
, size_t end
)
339 get_str_buf_start (&str
, &buf
, &str_start
);
340 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
341 SET_STRINGBUF_SHARED (buf
);
342 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
343 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK(buf
),
344 (scm_t_bits
)str_start
+ start
,
345 (scm_t_bits
) end
- start
);
349 scm_i_substring_copy (SCM str
, size_t start
, size_t end
)
351 size_t len
= end
- start
;
354 get_str_buf_start (&str
, &buf
, &str_start
);
355 if (scm_i_is_narrow_string (str
))
357 my_buf
= make_stringbuf (len
);
358 memcpy (STRINGBUF_CHARS (my_buf
),
359 STRINGBUF_CHARS (buf
) + str_start
+ start
, len
);
363 my_buf
= make_wide_stringbuf (len
);
364 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (my_buf
),
365 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
) + str_start
367 /* Even though this string is wide, the substring may be narrow.
368 Consider adding code to narrow the string. */
370 scm_remember_upto_here_1 (buf
);
371 return scm_double_cell (STRING_TAG
, SCM_UNPACK (my_buf
),
372 (scm_t_bits
) 0, (scm_t_bits
) len
);
376 scm_i_substring_shared (SCM str
, size_t start
, size_t end
)
378 if (start
== 0 && end
== STRING_LENGTH (str
))
382 size_t len
= end
- start
;
383 if (IS_SH_STRING (str
))
385 start
+= STRING_START (str
);
386 str
= SH_STRING_STRING (str
);
388 return scm_double_cell (SH_STRING_TAG
, SCM_UNPACK(str
),
389 (scm_t_bits
)start
, (scm_t_bits
) len
);
394 scm_c_substring (SCM str
, size_t start
, size_t end
)
396 validate_substring_args (str
, start
, end
);
397 return scm_i_substring (str
, start
, end
);
401 scm_c_substring_read_only (SCM str
, size_t start
, size_t end
)
403 validate_substring_args (str
, start
, end
);
404 return scm_i_substring_read_only (str
, start
, end
);
408 scm_c_substring_copy (SCM str
, size_t start
, size_t end
)
410 validate_substring_args (str
, start
, end
);
411 return scm_i_substring_copy (str
, start
, end
);
415 scm_c_substring_shared (SCM str
, size_t start
, size_t end
)
417 validate_substring_args (str
, start
, end
);
418 return scm_i_substring_shared (str
, start
, end
);
422 /* Internal accessors
425 /* Returns the number of characters in STR. This may be different
426 than the memory size of the string storage. */
428 scm_i_string_length (SCM str
)
430 return STRING_LENGTH (str
);
433 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
434 encoding. False if it is 'wide', having a 32-bit UCS-4
437 scm_i_is_narrow_string (SCM str
)
439 return !STRINGBUF_WIDE (STRING_STRINGBUF (str
));
442 /* Try to coerce a string to be narrow. It if is narrow already, do
443 nothing. If it is wide, shrink it to narrow if none of its
444 characters are above 0xFF. Return true if the string is narrow or
445 was made to be narrow. */
447 scm_i_try_narrow_string (SCM str
)
449 SET_STRING_STRINGBUF (str
, narrow_stringbuf (STRING_STRINGBUF (str
)));
451 return scm_i_is_narrow_string (str
);
454 /* Return a pointer to the raw data of the string, which can be either Latin-1
455 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
457 scm_i_string_data (SCM str
)
463 get_str_buf_start (&str
, &buf
, &start
);
465 data
= STRINGBUF_CONTENTS (buf
);
466 data
+= start
* (scm_i_is_narrow_string (str
) ? 1 : 4);
471 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
474 scm_i_string_chars (SCM str
)
478 get_str_buf_start (&str
, &buf
, &start
);
479 if (scm_i_is_narrow_string (str
))
480 return (const char *) STRINGBUF_CHARS (buf
) + start
;
482 scm_misc_error (NULL
, "Invalid read access of chars of wide string: ~s",
487 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
490 scm_i_string_wide_chars (SCM str
)
495 get_str_buf_start (&str
, &buf
, &start
);
496 if (!scm_i_is_narrow_string (str
))
497 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
) + start
;
499 scm_misc_error (NULL
, "Invalid read access of chars of narrow string: ~s",
503 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
504 a new string buffer, so that it can be modified without modifying
505 other strings. Also, lock the string mutex. Later, one must call
506 scm_i_string_stop_writing to unlock the mutex. */
508 scm_i_string_start_writing (SCM orig_str
)
510 SCM buf
, str
= orig_str
;
513 get_str_buf_start (&str
, &buf
, &start
);
514 if (IS_RO_STRING (str
))
515 scm_misc_error (NULL
, "string is read-only: ~s", scm_list_1 (orig_str
));
517 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
518 if (STRINGBUF_SHARED (buf
))
520 /* Clone the stringbuf. */
521 size_t len
= STRING_LENGTH (str
);
524 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
526 if (scm_i_is_narrow_string (str
))
528 new_buf
= make_stringbuf (len
);
529 memcpy (STRINGBUF_CHARS (new_buf
),
530 STRINGBUF_CHARS (buf
) + STRING_START (str
), len
);
535 new_buf
= make_wide_stringbuf (len
);
536 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
537 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
)
538 + STRING_START (str
)), len
);
541 SET_STRING_STRINGBUF (str
, new_buf
);
542 start
-= STRING_START (str
);
544 /* FIXME: The following operations are not atomic, so other threads
545 looking at STR may see an inconsistent state. Nevertheless it can't
546 hurt much since (i) accessing STR while it is being mutated can't
547 yield a crash, and (ii) concurrent accesses to STR should be
548 protected by a mutex at the application level. The latter may not
549 apply when STR != ORIG_STR, though. */
550 SET_STRING_START (str
, 0);
551 SET_STRING_STRINGBUF (str
, new_buf
);
555 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
560 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
562 scm_i_string_writable_chars (SCM str
)
567 get_str_buf_start (&str
, &buf
, &start
);
568 if (scm_i_is_narrow_string (str
))
569 return (char *) STRINGBUF_CHARS (buf
) + start
;
571 scm_misc_error (NULL
, "Invalid write access of chars of wide string: ~s",
576 /* Return a pointer to the UCS-4 codepoints of a string. */
578 scm_i_string_writable_wide_chars (SCM str
)
583 get_str_buf_start (&str
, &buf
, &start
);
584 if (!scm_i_is_narrow_string (str
))
585 return STRINGBUF_WIDE_CHARS (buf
) + start
;
587 scm_misc_error (NULL
, "Invalid write access of chars of narrow string: ~s",
591 /* Unlock the string mutex that was locked when
592 scm_i_string_start_writing was called. */
594 scm_i_string_stop_writing (void)
596 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
599 /* Return the Xth character of STR as a UCS-4 codepoint. */
601 scm_i_string_ref (SCM str
, size_t x
)
603 if (scm_i_is_narrow_string (str
))
604 return (scm_t_wchar
) (unsigned char) (scm_i_string_chars (str
)[x
]);
606 return scm_i_string_wide_chars (str
)[x
];
609 /* Returns index+1 of the first char in STR that matches C, or
610 0 if the char is not found. */
612 scm_i_string_contains_char (SCM str
, char ch
)
615 size_t len
= scm_i_string_length (str
);
618 if (scm_i_is_narrow_string (str
))
622 if (scm_i_string_chars (str
)[i
] == ch
)
631 if (scm_i_string_wide_chars (str
)[i
]
632 == (unsigned char) ch
)
641 scm_i_string_strcmp (SCM sstr
, size_t start_x
, const char *cstr
)
643 if (scm_i_is_narrow_string (sstr
))
645 const char *a
= scm_i_string_chars (sstr
) + start_x
;
646 const char *b
= cstr
;
647 return strncmp (a
, b
, strlen(b
));
652 const scm_t_wchar
*a
= scm_i_string_wide_chars (sstr
) + start_x
;
653 const char *b
= cstr
;
654 for (i
= 0; i
< strlen (b
); i
++)
656 if (a
[i
] != (unsigned char) b
[i
])
663 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
665 scm_i_string_set_x (SCM str
, size_t p
, scm_t_wchar chr
)
667 if (chr
> 0xFF && scm_i_is_narrow_string (str
))
668 SET_STRING_STRINGBUF (str
, wide_stringbuf (STRING_STRINGBUF (str
)));
670 if (scm_i_is_narrow_string (str
))
672 char *dst
= scm_i_string_writable_chars (str
);
677 scm_t_wchar
*dst
= scm_i_string_writable_wide_chars (str
);
685 Basic symbol creation and accessing is done here, the rest is in
686 symbols.[hc]. This has been done to keep stringbufs and the
687 internals of strings and string-like objects confined to this file.
690 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
693 scm_i_make_symbol (SCM name
, scm_t_bits flags
,
694 unsigned long hash
, SCM props
)
697 size_t start
= STRING_START (name
);
698 size_t length
= STRING_LENGTH (name
);
700 if (IS_SH_STRING (name
))
702 name
= SH_STRING_STRING (name
);
703 start
+= STRING_START (name
);
705 buf
= SYMBOL_STRINGBUF (name
);
707 if (start
== 0 && length
== STRINGBUF_LENGTH (buf
))
710 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
711 SET_STRINGBUF_SHARED (buf
);
712 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
717 if (scm_i_is_narrow_string (name
))
719 SCM new_buf
= make_stringbuf (length
);
720 memcpy (STRINGBUF_CHARS (new_buf
),
721 STRINGBUF_CHARS (buf
) + start
, length
);
726 SCM new_buf
= make_wide_stringbuf (length
);
727 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
728 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
) + start
,
733 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
734 (scm_t_bits
) hash
, SCM_UNPACK (props
));
738 scm_i_c_make_symbol (const char *name
, size_t len
,
739 scm_t_bits flags
, unsigned long hash
, SCM props
)
741 SCM buf
= make_stringbuf (len
);
742 memcpy (STRINGBUF_CHARS (buf
), name
, len
);
744 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
745 (scm_t_bits
) hash
, SCM_UNPACK (props
));
748 /* Returns the number of characters in SYM. This may be different
749 from the memory size of SYM. */
751 scm_i_symbol_length (SCM sym
)
753 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
757 scm_c_symbol_length (SCM sym
)
758 #define FUNC_NAME "scm_c_symbol_length"
760 SCM_VALIDATE_SYMBOL (1, sym
);
762 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
766 /* True if the name of SYM is stored as a Latin-1 encoded string.
767 False if it is stored as a 32-bit UCS-4-encoded string. */
769 scm_i_is_narrow_symbol (SCM sym
)
773 buf
= SYMBOL_STRINGBUF (sym
);
774 return !STRINGBUF_WIDE (buf
);
777 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
778 contains the name of SYM. */
780 scm_i_symbol_chars (SCM sym
)
784 buf
= SYMBOL_STRINGBUF (sym
);
785 if (!STRINGBUF_WIDE (buf
))
786 return (const char *) STRINGBUF_CHARS (buf
);
788 scm_misc_error (NULL
, "Invalid access of chars of a wide symbol ~S",
792 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
795 scm_i_symbol_wide_chars (SCM sym
)
799 buf
= SYMBOL_STRINGBUF (sym
);
800 if (STRINGBUF_WIDE (buf
))
801 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
);
803 scm_misc_error (NULL
, "Invalid access of chars of a narrow symbol ~S",
808 scm_i_symbol_substring (SCM sym
, size_t start
, size_t end
)
810 SCM buf
= SYMBOL_STRINGBUF (sym
);
811 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
812 SET_STRINGBUF_SHARED (buf
);
813 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
814 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK (buf
),
815 (scm_t_bits
)start
, (scm_t_bits
) end
- start
);
818 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
820 scm_i_symbol_ref (SCM sym
, size_t x
)
822 if (scm_i_is_narrow_symbol (sym
))
823 return (scm_t_wchar
) (unsigned char) (scm_i_symbol_chars (sym
)[x
]);
825 return scm_i_symbol_wide_chars (sym
)[x
];
831 SCM_DEFINE (scm_sys_string_dump
, "%string-dump", 1, 0, 0, (SCM str
),
832 "Returns an association list containing debugging information\n"
833 "for @var{str}. The association list has the following entries."
836 "The string itself.\n"
838 "The start index of the string into its stringbuf\n"
840 "The length of the string\n"
842 "If this string is a substring, it returns its parent string.\n"
843 "Otherwise, it returns @code{#f}\n"
845 "@code{#t} if the string is read-only\n"
846 "@item stringbuf-chars\n"
847 "A new string containing this string's stringbuf's characters\n"
848 "@item stringbuf-length\n"
849 "The number of characters in this stringbuf\n"
850 "@item stringbuf-shared\n"
851 "@code{#t} if this stringbuf is shared\n"
852 "@item stringbuf-wide\n"
853 "@code{#t} if this stringbuf's characters are stored in a\n"
854 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
857 #define FUNC_NAME s_scm_sys_string_dump
859 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
;
861 SCM_VALIDATE_STRING (1, str
);
864 e1
= scm_cons (scm_from_latin1_symbol ("string"),
866 e2
= scm_cons (scm_from_latin1_symbol ("start"),
867 scm_from_size_t (STRING_START (str
)));
868 e3
= scm_cons (scm_from_latin1_symbol ("length"),
869 scm_from_size_t (STRING_LENGTH (str
)));
871 if (IS_SH_STRING (str
))
873 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
874 SH_STRING_STRING (str
));
875 buf
= STRING_STRINGBUF (SH_STRING_STRING (str
));
879 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
881 buf
= STRING_STRINGBUF (str
);
884 if (IS_RO_STRING (str
))
885 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
888 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
892 if (!STRINGBUF_WIDE (buf
))
894 size_t len
= STRINGBUF_LENGTH (buf
);
896 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
897 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
898 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
903 size_t len
= STRINGBUF_LENGTH (buf
);
905 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
906 u32_cpy ((scm_t_uint32
*) cbuf
,
907 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
908 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
911 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
912 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
913 if (STRINGBUF_SHARED (buf
))
914 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
917 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
919 if (STRINGBUF_WIDE (buf
))
920 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
923 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
926 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
, SCM_UNDEFINED
);
930 SCM_DEFINE (scm_sys_symbol_dump
, "%symbol-dump", 1, 0, 0, (SCM sym
),
931 "Returns an association list containing debugging information\n"
932 "for @var{sym}. The association list has the following entries."
935 "The symbol itself\n"
939 "@code{#t} if it is an interned symbol\n"
940 "@item stringbuf-chars\n"
941 "A new string containing this symbols's stringbuf's characters\n"
942 "@item stringbuf-length\n"
943 "The number of characters in this stringbuf\n"
944 "@item stringbuf-shared\n"
945 "@code{#t} if this stringbuf is shared\n"
946 "@item stringbuf-wide\n"
947 "@code{#t} if this stringbuf's characters are stored in a\n"
948 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
951 #define FUNC_NAME s_scm_sys_symbol_dump
953 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
;
955 SCM_VALIDATE_SYMBOL (1, sym
);
956 e1
= scm_cons (scm_from_latin1_symbol ("symbol"),
958 e2
= scm_cons (scm_from_latin1_symbol ("hash"),
959 scm_from_ulong (scm_i_symbol_hash (sym
)));
960 e3
= scm_cons (scm_from_latin1_symbol ("interned"),
961 scm_symbol_interned_p (sym
));
962 buf
= SYMBOL_STRINGBUF (sym
);
965 if (!STRINGBUF_WIDE (buf
))
967 size_t len
= STRINGBUF_LENGTH (buf
);
969 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
970 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
971 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
976 size_t len
= STRINGBUF_LENGTH (buf
);
978 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
979 u32_cpy ((scm_t_uint32
*) cbuf
,
980 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
981 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
984 e5
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
985 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
986 if (STRINGBUF_SHARED (buf
))
987 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
990 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
992 if (STRINGBUF_WIDE (buf
))
993 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
996 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
998 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, SCM_UNDEFINED
);
1003 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1005 SCM_DEFINE (scm_sys_stringbuf_hist
, "%stringbuf-hist", 0, 0, 0, (void), "")
1006 #define FUNC_NAME s_scm_sys_stringbuf_hist
1009 for (i
= 0; i
< 1000; i
++)
1011 fprintf (stderr
, " %3d: %u\n", i
, lenhist
[i
]);
1012 fprintf (stderr
, ">999: %u\n", lenhist
[1000]);
1013 return SCM_UNSPECIFIED
;
1021 SCM_DEFINE (scm_string_p
, "string?", 1, 0, 0,
1023 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1024 #define FUNC_NAME s_scm_string_p
1026 return scm_from_bool (IS_STRING (obj
));
1031 SCM_REGISTER_PROC (s_scm_list_to_string
, "list->string", 1, 0, 0, scm_string
);
1033 SCM_DEFINE (scm_string
, "string", 0, 0, 1,
1035 "@deffnx {Scheme Procedure} list->string chrs\n"
1036 "Return a newly allocated string composed of the arguments,\n"
1038 #define FUNC_NAME s_scm_string
1040 SCM result
= SCM_BOOL_F
;
1047 /* Verify that this is a list of chars. */
1048 i
= scm_ilength (chrs
);
1049 SCM_ASSERT (i
>= 0, chrs
, SCM_ARG1
, FUNC_NAME
);
1054 while (len
> 0 && scm_is_pair (rest
))
1056 SCM elt
= SCM_CAR (rest
);
1057 SCM_VALIDATE_CHAR (SCM_ARGn
, elt
);
1058 if (SCM_CHAR (elt
) > 0xFF)
1060 rest
= SCM_CDR (rest
);
1062 scm_remember_upto_here_1 (elt
);
1065 /* Construct a string containing this list of chars. */
1073 result
= scm_i_make_string (len
, NULL
, 0);
1074 result
= scm_i_string_start_writing (result
);
1075 buf
= scm_i_string_writable_chars (result
);
1076 while (len
> 0 && scm_is_pair (rest
))
1078 SCM elt
= SCM_CAR (rest
);
1079 buf
[p
] = (unsigned char) SCM_CHAR (elt
);
1081 rest
= SCM_CDR (rest
);
1083 scm_remember_upto_here_1 (elt
);
1090 result
= scm_i_make_wide_string (len
, NULL
, 0);
1091 result
= scm_i_string_start_writing (result
);
1092 buf
= scm_i_string_writable_wide_chars (result
);
1093 while (len
> 0 && scm_is_pair (rest
))
1095 SCM elt
= SCM_CAR (rest
);
1096 buf
[p
] = SCM_CHAR (elt
);
1098 rest
= SCM_CDR (rest
);
1100 scm_remember_upto_here_1 (elt
);
1103 scm_i_string_stop_writing ();
1106 scm_misc_error (NULL
, "list changed while constructing string", SCM_EOL
);
1107 if (!scm_is_null (rest
))
1108 scm_wrong_type_arg_msg (NULL
, 0, chrs
, "proper list");
1114 SCM_DEFINE (scm_make_string
, "make-string", 1, 1, 0,
1116 "Return a newly allocated string of\n"
1117 "length @var{k}. If @var{chr} is given, then all elements of\n"
1118 "the string are initialized to @var{chr}, otherwise the contents\n"
1119 "of the @var{string} are all set to @var{#\nul}.")
1120 #define FUNC_NAME s_scm_make_string
1122 return scm_c_make_string (scm_to_size_t (k
), chr
);
1127 scm_c_make_string (size_t len
, SCM chr
)
1128 #define FUNC_NAME NULL
1131 char *contents
= NULL
;
1132 SCM res
= scm_i_make_string (len
, &contents
, 0);
1134 /* If no char is given, initialize string contents to NULL. */
1135 if (SCM_UNBNDP (chr
))
1136 memset (contents
, 0, len
);
1139 SCM_VALIDATE_CHAR (0, chr
);
1140 res
= scm_i_string_start_writing (res
);
1141 for (p
= 0; p
< len
; p
++)
1142 scm_i_string_set_x (res
, p
, SCM_CHAR (chr
));
1143 scm_i_string_stop_writing ();
1150 SCM_DEFINE (scm_string_length
, "string-length", 1, 0, 0,
1152 "Return the number of characters in @var{string}.")
1153 #define FUNC_NAME s_scm_string_length
1155 SCM_VALIDATE_STRING (1, string
);
1156 return scm_from_size_t (STRING_LENGTH (string
));
1160 SCM_DEFINE (scm_string_bytes_per_char
, "string-bytes-per-char", 1, 0, 0,
1162 "Return the bytes used to represent a character in @var{string}."
1163 "This will return 1 or 4.")
1164 #define FUNC_NAME s_scm_string_bytes_per_char
1166 SCM_VALIDATE_STRING (1, string
);
1167 if (!scm_i_is_narrow_string (string
))
1168 return scm_from_int (4);
1170 return scm_from_int (1);
1175 scm_c_string_length (SCM string
)
1177 if (!IS_STRING (string
))
1178 scm_wrong_type_arg_msg (NULL
, 0, string
, "string");
1179 return STRING_LENGTH (string
);
1182 SCM_DEFINE (scm_string_ref
, "string-ref", 2, 0, 0,
1184 "Return character @var{k} of @var{str} using zero-origin\n"
1185 "indexing. @var{k} must be a valid index of @var{str}.")
1186 #define FUNC_NAME s_scm_string_ref
1191 SCM_VALIDATE_STRING (1, str
);
1193 len
= scm_i_string_length (str
);
1194 if (SCM_LIKELY (len
> 0))
1195 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1197 scm_out_of_range (NULL
, k
);
1199 if (scm_i_is_narrow_string (str
))
1200 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[idx
]);
1202 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[idx
]);
1207 scm_c_string_ref (SCM str
, size_t p
)
1209 if (p
>= scm_i_string_length (str
))
1210 scm_out_of_range (NULL
, scm_from_size_t (p
));
1211 if (scm_i_is_narrow_string (str
))
1212 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[p
]);
1214 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[p
]);
1218 SCM_DEFINE (scm_string_set_x
, "string-set!", 3, 0, 0,
1219 (SCM str
, SCM k
, SCM chr
),
1220 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1221 "an unspecified value. @var{k} must be a valid index of\n"
1223 #define FUNC_NAME s_scm_string_set_x
1228 SCM_VALIDATE_STRING (1, str
);
1230 len
= scm_i_string_length (str
);
1231 if (SCM_LIKELY (len
> 0))
1232 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1234 scm_out_of_range (NULL
, k
);
1236 SCM_VALIDATE_CHAR (3, chr
);
1237 str
= scm_i_string_start_writing (str
);
1238 scm_i_string_set_x (str
, idx
, SCM_CHAR (chr
));
1239 scm_i_string_stop_writing ();
1241 return SCM_UNSPECIFIED
;
1246 scm_c_string_set_x (SCM str
, size_t p
, SCM chr
)
1248 if (p
>= scm_i_string_length (str
))
1249 scm_out_of_range (NULL
, scm_from_size_t (p
));
1250 str
= scm_i_string_start_writing (str
);
1251 scm_i_string_set_x (str
, p
, SCM_CHAR (chr
));
1252 scm_i_string_stop_writing ();
1255 SCM_DEFINE (scm_substring
, "substring", 2, 1, 0,
1256 (SCM str
, SCM start
, SCM end
),
1257 "Return a newly allocated string formed from the characters\n"
1258 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1259 "ending with index @var{end} (exclusive).\n"
1260 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1261 "exact integers satisfying:\n\n"
1262 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1263 #define FUNC_NAME s_scm_substring
1265 size_t len
, from
, to
;
1267 SCM_VALIDATE_STRING (1, str
);
1268 len
= scm_i_string_length (str
);
1269 from
= scm_to_unsigned_integer (start
, 0, len
);
1270 if (SCM_UNBNDP (end
))
1273 to
= scm_to_unsigned_integer (end
, from
, len
);
1274 return scm_i_substring (str
, from
, to
);
1278 SCM_DEFINE (scm_substring_read_only
, "substring/read-only", 2, 1, 0,
1279 (SCM str
, SCM start
, SCM end
),
1280 "Return a newly allocated string formed from the characters\n"
1281 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1282 "ending with index @var{end} (exclusive).\n"
1283 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1284 "exact integers satisfying:\n"
1286 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1288 "The returned string is read-only.\n")
1289 #define FUNC_NAME s_scm_substring_read_only
1291 size_t len
, from
, to
;
1293 SCM_VALIDATE_STRING (1, str
);
1294 len
= scm_i_string_length (str
);
1295 from
= scm_to_unsigned_integer (start
, 0, len
);
1296 if (SCM_UNBNDP (end
))
1299 to
= scm_to_unsigned_integer (end
, from
, len
);
1300 return scm_i_substring_read_only (str
, from
, to
);
1304 SCM_DEFINE (scm_substring_copy
, "substring/copy", 2, 1, 0,
1305 (SCM str
, SCM start
, SCM end
),
1306 "Return a newly allocated string formed from the characters\n"
1307 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1308 "ending with index @var{end} (exclusive).\n"
1309 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1310 "exact integers satisfying:\n\n"
1311 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1312 #define FUNC_NAME s_scm_substring_copy
1314 /* For the Scheme version, START is mandatory, but for the C
1315 version, it is optional. See scm_string_copy in srfi-13.c for a
1321 SCM_VALIDATE_STRING (1, str
);
1322 scm_i_get_substring_spec (scm_i_string_length (str
),
1323 start
, &from
, end
, &to
);
1324 return scm_i_substring_copy (str
, from
, to
);
1328 SCM_DEFINE (scm_substring_shared
, "substring/shared", 2, 1, 0,
1329 (SCM str
, SCM start
, SCM end
),
1330 "Return string that indirectly refers to the characters\n"
1331 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1332 "ending with index @var{end} (exclusive).\n"
1333 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1334 "exact integers satisfying:\n\n"
1335 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1336 #define FUNC_NAME s_scm_substring_shared
1338 size_t len
, from
, to
;
1340 SCM_VALIDATE_STRING (1, str
);
1341 len
= scm_i_string_length (str
);
1342 from
= scm_to_unsigned_integer (start
, 0, len
);
1343 if (SCM_UNBNDP (end
))
1346 to
= scm_to_unsigned_integer (end
, from
, len
);
1347 return scm_i_substring_shared (str
, from
, to
);
1351 SCM_DEFINE (scm_string_append
, "string-append", 0, 0, 1,
1353 "Return a newly allocated string whose characters form the\n"
1354 "concatenation of the given strings, @var{args}.")
1355 #define FUNC_NAME s_scm_string_append
1368 SCM_VALIDATE_REST_ARGUMENT (args
);
1369 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1372 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1373 len
+= scm_i_string_length (s
);
1374 if (!scm_i_is_narrow_string (s
))
1379 res
= scm_i_make_string (len
, &data
.narrow
, 0);
1381 res
= scm_i_make_wide_string (len
, &data
.wide
, 0);
1383 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1387 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1388 len
= scm_i_string_length (s
);
1391 memcpy (data
.narrow
, scm_i_string_chars (s
), len
);
1396 if (scm_i_is_narrow_string (s
))
1398 for (i
= 0; i
< scm_i_string_length (s
); i
++)
1399 data
.wide
[i
] = (unsigned char) scm_i_string_chars (s
)[i
];
1402 u32_cpy ((scm_t_uint32
*) data
.wide
,
1403 (scm_t_uint32
*) scm_i_string_wide_chars (s
), len
);
1406 scm_remember_upto_here_1 (s
);
1414 /* Charset conversion error handling. */
1416 SCM_SYMBOL (scm_encoding_error_key
, "encoding-error");
1417 SCM_SYMBOL (scm_decoding_error_key
, "decoding-error");
1419 /* Raise an exception informing that character CHR could not be written
1420 to PORT in its current encoding. */
1422 scm_encoding_error (const char *subr
, int err
, const char *message
,
1425 scm_throw (scm_encoding_error_key
,
1426 scm_list_n (scm_from_latin1_string (subr
),
1427 scm_from_latin1_string (message
),
1433 /* Raise an exception informing of an encoding error on PORT. This
1434 means that a character could not be written in PORT's encoding. */
1436 scm_decoding_error (const char *subr
, int err
, const char *message
, SCM port
)
1438 scm_throw (scm_decoding_error_key
,
1439 scm_list_n (scm_from_latin1_string (subr
),
1440 scm_from_latin1_string (message
),
1447 /* String conversion to/from C. */
1450 scm_from_stringn (const char *str
, size_t len
, const char *encoding
,
1451 scm_t_string_failed_conversion_handler handler
)
1458 /* The order of these checks is important. */
1459 if (!str
&& len
!= 0)
1460 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL
);
1461 if (len
== (size_t) -1)
1466 if (encoding
== NULL
)
1468 /* If encoding is null, use Latin-1. */
1470 res
= scm_i_make_string (len
, &buf
, 0);
1471 memcpy (buf
, str
, len
);
1476 u32
= (scm_t_wchar
*) u32_conv_from_encoding (encoding
,
1477 (enum iconv_ilseq_handler
)
1483 if (SCM_UNLIKELY (u32
== NULL
))
1485 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1490 buf
= scm_gc_malloc_pointerless (len
, "bytevector");
1491 memcpy (buf
, str
, len
);
1492 bv
= scm_c_take_gc_bytevector (buf
, len
, SCM_BOOL_F
);
1494 scm_decoding_error (__func__
, errno
,
1495 "input locale conversion error", bv
);
1500 if (u32
[i
++] > 0xFF)
1509 res
= scm_i_make_string (u32len
, &dst
, 0);
1510 for (i
= 0; i
< u32len
; i
++)
1511 dst
[i
] = (unsigned char) u32
[i
];
1517 res
= scm_i_make_wide_string (u32len
, &wdst
, 0);
1518 u32_cpy ((scm_t_uint32
*) wdst
, (scm_t_uint32
*) u32
, u32len
);
1527 scm_from_locale_string (const char *str
)
1529 return scm_from_locale_stringn (str
, -1);
1533 scm_from_locale_stringn (const char *str
, size_t len
)
1535 return scm_from_stringn (str
, len
, locale_charset (),
1536 scm_i_get_conversion_strategy (SCM_BOOL_F
));
1540 scm_from_latin1_string (const char *str
)
1542 return scm_from_latin1_stringn (str
, -1);
1546 scm_from_latin1_stringn (const char *str
, size_t len
)
1551 if (len
== (size_t) -1)
1554 /* Make a narrow string and copy STR as is. */
1555 result
= scm_i_make_string (len
, &buf
, 0);
1556 memcpy (buf
, str
, len
);
1562 scm_from_utf8_string (const char *str
)
1564 return scm_from_utf8_stringn (str
, -1);
1568 scm_from_utf8_stringn (const char *str
, size_t len
)
1570 return scm_from_stringn (str
, len
, "UTF-8", SCM_FAILED_CONVERSION_ERROR
);
1574 scm_from_utf32_string (const scm_t_wchar
*str
)
1576 return scm_from_utf32_stringn (str
, -1);
1580 scm_from_utf32_stringn (const scm_t_wchar
*str
, size_t len
)
1585 if (len
== (size_t) -1)
1586 len
= u32_strlen ((uint32_t *) str
);
1588 result
= scm_i_make_wide_string (len
, &buf
, 0);
1589 memcpy (buf
, str
, len
* sizeof (scm_t_wchar
));
1590 scm_i_try_narrow_string (result
);
1595 /* Create a new scheme string from the C string STR. The memory of
1596 STR may be used directly as storage for the new string. */
1597 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1598 would be to register a finalizer to eventually free(3) STR, which isn't
1599 worth it. Should we just deprecate the `scm_take_' functions? */
1601 scm_take_locale_stringn (char *str
, size_t len
)
1605 res
= scm_from_locale_stringn (str
, len
);
1612 scm_take_locale_string (char *str
)
1614 return scm_take_locale_stringn (str
, -1);
1617 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1618 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1619 Set *LENP to the size of the resulting string.
1621 FIXME: This is a hack we should get rid of. See
1622 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1625 unistring_escapes_to_guile_escapes (char *buf
, size_t *lenp
)
1627 char *before
, *after
;
1636 if ((i
<= *lenp
- 6)
1637 && before
[i
] == '\\'
1638 && before
[i
+ 1] == 'u'
1639 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1641 /* Convert \u00NN to \xNN */
1644 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1645 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1649 else if ((i
<= *lenp
- 10)
1650 && before
[i
] == '\\'
1651 && before
[i
+ 1] == 'U'
1652 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1654 /* Convert \U00NNNNNN to \UNNNNNN */
1657 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1658 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1659 after
[j
+ 4] = tolower ((int) before
[i
+ 6]);
1660 after
[j
+ 5] = tolower ((int) before
[i
+ 7]);
1661 after
[j
+ 6] = tolower ((int) before
[i
+ 8]);
1662 after
[j
+ 7] = tolower ((int) before
[i
+ 9]);
1668 after
[j
] = before
[i
];
1676 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1677 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1678 of the resulting string. BUF must be large enough to handle the
1679 worst case when `\uXXXX' escapes (6 characters) are replaced by
1680 `\xXXXX;' (7 characters). */
1682 unistring_escapes_to_r6rs_escapes (char *buf
, size_t *lenp
)
1684 char *before
, *after
;
1686 /* The worst case is if the input string contains all 4-digit hex escapes.
1687 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1688 size_t max_out_len
= (*lenp
* 7) / 6 + 1;
1689 size_t nzeros
, ndigits
;
1692 after
= alloca (max_out_len
);
1697 if (((i
<= *lenp
- 6) && before
[i
] == '\\' && before
[i
+ 1] == 'u')
1698 || ((i
<= *lenp
- 10) && before
[i
] == '\\' && before
[i
+ 1] == 'U'))
1700 if (before
[i
+ 1] == 'u')
1702 else if (before
[i
+ 1] == 'U')
1707 /* Add the R6RS hex escape initial sequence. */
1711 /* Move string positions to the start of the hex numbers. */
1715 /* Find the number of initial zeros in this hex number. */
1717 while (before
[i
+ nzeros
] == '0' && nzeros
< ndigits
)
1720 /* Copy the number, skipping initial zeros, and then move the string
1722 if (nzeros
== ndigits
)
1731 for (pos
= 0; pos
< ndigits
- nzeros
; pos
++)
1732 after
[j
+ pos
] = tolower ((int) before
[i
+ nzeros
+ pos
]);
1734 j
+= (ndigits
- nzeros
);
1737 /* Add terminating semicolon. */
1743 after
[j
] = before
[i
];
1749 memcpy (before
, after
, j
);
1753 scm_to_locale_string (SCM str
)
1755 return scm_to_locale_stringn (str
, NULL
);
1759 scm_to_locale_stringn (SCM str
, size_t *lenp
)
1761 return scm_to_stringn (str
, lenp
,
1763 scm_i_get_conversion_strategy (SCM_BOOL_F
));
1767 scm_to_latin1_string (SCM str
)
1769 return scm_to_latin1_stringn (str
, NULL
);
1773 scm_to_latin1_stringn (SCM str
, size_t *lenp
)
1774 #define FUNC_NAME "scm_to_latin1_stringn"
1778 SCM_VALIDATE_STRING (1, str
);
1780 if (scm_i_is_narrow_string (str
))
1782 size_t len
= scm_i_string_length (str
);
1787 result
= scm_strndup (scm_i_string_data (str
), len
);
1790 result
= scm_to_stringn (str
, lenp
, NULL
,
1791 SCM_FAILED_CONVERSION_ERROR
);
1798 scm_to_utf8_string (SCM str
)
1800 return scm_to_utf8_stringn (str
, NULL
);
1804 scm_to_utf8_stringn (SCM str
, size_t *lenp
)
1806 return scm_to_stringn (str
, lenp
, "UTF-8", SCM_FAILED_CONVERSION_ERROR
);
1810 scm_to_utf32_string (SCM str
)
1812 return scm_to_utf32_stringn (str
, NULL
);
1816 scm_to_utf32_stringn (SCM str
, size_t *lenp
)
1817 #define FUNC_NAME "scm_to_utf32_stringn"
1819 scm_t_wchar
*result
;
1821 SCM_VALIDATE_STRING (1, str
);
1823 if (scm_i_is_narrow_string (str
))
1824 result
= (scm_t_wchar
*)
1825 scm_to_stringn (str
, lenp
, "UTF-32",
1826 SCM_FAILED_CONVERSION_ERROR
);
1831 len
= scm_i_string_length (str
);
1835 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
1836 memcpy (result
, scm_i_string_wide_chars (str
),
1837 len
* sizeof (scm_t_wchar
));
1845 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1846 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1847 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1848 defined by HANDLER. */
1850 scm_to_stringn (SCM str
, size_t *lenp
, const char *encoding
,
1851 scm_t_string_failed_conversion_handler handler
)
1854 size_t ilen
, len
, i
;
1858 if (!scm_is_string (str
))
1859 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
1860 ilen
= scm_i_string_length (str
);
1864 buf
= scm_malloc (1);
1872 for (i
= 0; i
< ilen
; i
++)
1873 if (scm_i_string_ref (str
, i
) == '\0')
1874 scm_misc_error (NULL
,
1875 "string contains #\\nul character: ~S",
1878 if (scm_i_is_narrow_string (str
) && (encoding
== NULL
))
1880 /* If using native Latin-1 encoding, just copy the string
1884 buf
= scm_malloc (ilen
);
1885 memcpy (buf
, scm_i_string_chars (str
), ilen
);
1891 buf
= scm_malloc (ilen
+ 1);
1892 memcpy (buf
, scm_i_string_chars (str
), ilen
);
1904 if (scm_i_is_narrow_string (str
))
1906 ret
= mem_iconveh (scm_i_string_chars (str
), ilen
,
1908 (enum iconv_ilseq_handler
) handler
, NULL
,
1912 scm_encoding_error (__func__
, errno
,
1913 "cannot convert narrow string to output locale",
1915 /* FIXME: Faulty character unknown. */
1920 buf
= u32_conv_to_encoding (enc
,
1921 (enum iconv_ilseq_handler
) handler
,
1922 (scm_t_uint32
*) scm_i_string_wide_chars (str
),
1927 scm_encoding_error (__func__
, errno
,
1928 "cannot convert wide string to output locale",
1930 /* FIXME: Faulty character unknown. */
1933 if (handler
== SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE
)
1935 if (SCM_R6RS_ESCAPES_P
)
1937 /* The worst case is if the input string contains all 4-digit
1938 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
1939 (seven characters). Make BUF large enough to hold
1941 buf
= scm_realloc (buf
, (len
* 7) / 6 + 1);
1942 unistring_escapes_to_r6rs_escapes (buf
, &len
);
1945 unistring_escapes_to_guile_escapes (buf
, &len
);
1947 buf
= scm_realloc (buf
, len
);
1953 buf
= scm_realloc (buf
, len
+ 1);
1957 scm_remember_upto_here_1 (str
);
1962 scm_to_locale_stringbuf (SCM str
, char *buf
, size_t max_len
)
1965 char *result
= NULL
;
1966 if (!scm_is_string (str
))
1967 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
1968 result
= scm_to_locale_stringn (str
, &len
);
1970 memcpy (buf
, result
, (len
> max_len
) ? max_len
: len
);
1973 scm_remember_upto_here_1 (str
);
1978 /* Unicode string normalization. */
1980 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1981 libguile/i18n.c. It would be useful to have this factored out into a more
1982 convenient location, but its use of alloca makes that tricky to do. */
1985 normalize_str (SCM string
, uninorm_t form
)
1988 scm_t_uint32
*w_str
;
1990 size_t rlen
, len
= scm_i_string_length (string
);
1992 if (scm_i_is_narrow_string (string
))
1995 const char *buf
= scm_i_string_chars (string
);
1997 w_str
= alloca (sizeof (scm_t_wchar
) * (len
+ 1));
1999 for (i
= 0; i
< len
; i
++)
2000 w_str
[i
] = (unsigned char) buf
[i
];
2004 w_str
= (scm_t_uint32
*) scm_i_string_wide_chars (string
);
2006 w_str
= u32_normalize (form
, w_str
, len
, NULL
, &rlen
);
2008 ret
= scm_i_make_wide_string (rlen
, &cbuf
, 0);
2009 u32_cpy ((scm_t_uint32
*) cbuf
, w_str
, rlen
);
2012 scm_i_try_narrow_string (ret
);
2017 SCM_DEFINE (scm_string_normalize_nfc
, "string-normalize-nfc", 1, 0, 0,
2019 "Returns the NFC normalized form of @var{string}.")
2020 #define FUNC_NAME s_scm_string_normalize_nfc
2022 SCM_VALIDATE_STRING (1, string
);
2023 return normalize_str (string
, UNINORM_NFC
);
2027 SCM_DEFINE (scm_string_normalize_nfd
, "string-normalize-nfd", 1, 0, 0,
2029 "Returns the NFD normalized form of @var{string}.")
2030 #define FUNC_NAME s_scm_string_normalize_nfd
2032 SCM_VALIDATE_STRING (1, string
);
2033 return normalize_str (string
, UNINORM_NFD
);
2037 SCM_DEFINE (scm_string_normalize_nfkc
, "string-normalize-nfkc", 1, 0, 0,
2039 "Returns the NFKC normalized form of @var{string}.")
2040 #define FUNC_NAME s_scm_string_normalize_nfkc
2042 SCM_VALIDATE_STRING (1, string
);
2043 return normalize_str (string
, UNINORM_NFKC
);
2047 SCM_DEFINE (scm_string_normalize_nfkd
, "string-normalize-nfkd", 1, 0, 0,
2049 "Returns the NFKD normalized form of @var{string}.")
2050 #define FUNC_NAME s_scm_string_normalize_nfkd
2052 SCM_VALIDATE_STRING (1, string
);
2053 return normalize_str (string
, UNINORM_NFKD
);
2057 /* converts C scm_array of strings to SCM scm_list of strings.
2058 If argc < 0, a null terminated scm_array is assumed.
2059 The current locale encoding is assumed */
2061 scm_makfromstrs (int argc
, char **argv
)
2066 for (i
= 0; argv
[i
]; i
++);
2068 lst
= scm_cons (scm_from_locale_string (argv
[i
]), lst
);
2072 /* Return a newly allocated array of char pointers to each of the strings
2073 in args, with a terminating NULL pointer. The strings are encoded using
2074 the current locale. */
2077 scm_i_allocate_string_pointers (SCM list
)
2078 #define FUNC_NAME "scm_i_allocate_string_pointers"
2081 int list_len
= scm_ilength (list
);
2085 scm_wrong_type_arg_msg (NULL
, 0, list
, "proper list");
2087 result
= scm_gc_malloc ((list_len
+ 1) * sizeof (char *),
2089 result
[list_len
] = NULL
;
2091 /* The list might have been modified in another thread, so
2092 we check LIST before each access.
2094 for (i
= 0; i
< list_len
&& scm_is_pair (list
); i
++)
2096 SCM str
= SCM_CAR (list
);
2097 size_t len
; /* String length in bytes */
2098 char *c_str
= scm_to_locale_stringn (str
, &len
);
2100 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2101 scm_malloc to allocate the returned string, which must be
2102 explicitly deallocated. This forces us to copy the string a
2103 second time into a new buffer. Ideally there would be variants
2104 of scm_to_*_stringn that can return garbage-collected buffers. */
2106 result
[i
] = scm_gc_malloc_pointerless (len
+ 1, "string");
2107 memcpy (result
[i
], c_str
, len
);
2108 result
[i
][len
] = '\0';
2111 list
= SCM_CDR (list
);
2119 scm_i_get_substring_spec (size_t len
,
2120 SCM start
, size_t *cstart
,
2121 SCM end
, size_t *cend
)
2123 if (SCM_UNBNDP (start
))
2126 *cstart
= scm_to_unsigned_integer (start
, 0, len
);
2128 if (SCM_UNBNDP (end
))
2131 *cend
= scm_to_unsigned_integer (end
, *cstart
, len
);
2135 string_handle_ref (scm_t_array_handle
*h
, size_t index
)
2137 return scm_c_string_ref (h
->array
, index
);
2141 string_handle_set (scm_t_array_handle
*h
, size_t index
, SCM val
)
2143 scm_c_string_set_x (h
->array
, index
, val
);
2147 string_get_handle (SCM v
, scm_t_array_handle
*h
)
2153 h
->dim0
.ubnd
= scm_c_string_length (v
) - 1;
2155 h
->element_type
= SCM_ARRAY_ELEMENT_TYPE_CHAR
;
2156 h
->elements
= h
->writable_elements
= NULL
;
2159 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string
, 0x7f,
2160 string_handle_ref
, string_handle_set
,
2162 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR
, scm_make_string
)
2167 scm_nullstr
= scm_i_make_string (0, NULL
, 1);
2169 #include "libguile/strings.x"