From 07f49ac786e0f1c007eb336e2fb7a572e8405316 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Ludovic=20Court=C3=A8s?= Date: Tue, 14 Sep 2010 16:10:52 +0200 Subject: [PATCH] Factorize and optimize `write' for strings and characters. According to `write.bm', this makes `write' 2.6 times faster for strings. * libguile/print.c (iprin1): Use `write_character' when `SCM_WRITINGP (pstate)' and `SCM_CHARP (exp)' or `scm_is_string (exp)'. (scm_i_charprint): Remove. (display_character, write_character): New functions. (scm_write_char): Use `display_character' instead of `scm_i_charprint'. * libguile/print.h (scm_i_charprint): Remove declaration. * benchmark-suite/benchmarks/write.bm: New file. * benchmark-suite/Makefile.am (SCM_BENCHMARKS): Add `benchmarks/write.bm'. --- benchmark-suite/Makefile.am | 3 +- benchmark-suite/benchmarks/write.bm | 52 ++++ libguile/print.c | 401 ++++++++++++++-------------- libguile/print.h | 3 +- 4 files changed, 254 insertions(+), 205 deletions(-) create mode 100644 benchmark-suite/benchmarks/write.bm diff --git a/benchmark-suite/Makefile.am b/benchmark-suite/Makefile.am index b58219a24..9f49f2aad 100644 --- a/benchmark-suite/Makefile.am +++ b/benchmark-suite/Makefile.am @@ -11,7 +11,8 @@ SCM_BENCHMARKS = benchmarks/0-reference.bm \ benchmarks/subr.bm \ benchmarks/uniform-vector-read.bm \ benchmarks/vectors.bm \ - benchmarks/vlists.bm + benchmarks/vlists.bm \ + benchmarks/write.bm EXTRA_DIST = guile-benchmark lib.scm $(SCM_BENCHMARKS) \ ChangeLog-2008 diff --git a/benchmark-suite/benchmarks/write.bm b/benchmark-suite/benchmarks/write.bm new file mode 100644 index 000000000..e96f2efeb --- /dev/null +++ b/benchmark-suite/benchmarks/write.bm @@ -0,0 +1,52 @@ +;;; write.bm --- Exercise the printer. -*- Scheme -*- +;;; +;;; Copyright (C) 2008, 2010 Free Software Foundation, Inc. +;;; +;;; This program is free software; you can redistribute it and/or +;;; modify it under the terms of the GNU Lesser General Public License +;;; as published by the Free Software Foundation; either version 3, or +;;; (at your option) any later version. +;;; +;;; This program is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU Lesser General Public License for more details. +;;; +;;; You should have received a copy of the GNU Lesser General Public +;;; License along with this software; see the file COPYING.LESSER. If +;;; not, write to the Free Software Foundation, Inc., 51 Franklin +;;; Street, Fifth Floor, Boston, MA 02110-1301 USA + +(define-module (benchmarks read) + #:use-module (benchmark-suite lib)) + +(define %len 50000) + +(define %string-with-escapes + (list->string (map integer->char (iota %len)))) + +(define %string-without-escapes + (make-string %len #\a)) + +;; Use Unicode-capable ports. +(fluid-set! %default-port-encoding "UTF-8") + +(define %null + (%make-void-port OPEN_WRITE)) + + +(with-benchmark-prefix "write" + + (benchmark "string with escapes" 50 + (write %string-with-escapes %null)) + + (benchmark "string without escapes" 50 + (write %string-without-escapes %null))) + +(with-benchmark-prefix "display" + + (benchmark "string with escapes" 1000 + (display %string-with-escapes %null)) + + (benchmark "string without escapes" 1000 + (display %string-without-escapes %null))) diff --git a/libguile/print.c b/libguile/print.c index 212b70d2b..5acb06b3e 100644 --- a/libguile/print.c +++ b/libguile/print.c @@ -54,6 +54,14 @@ +/* Character printers. */ + +static int display_character (scm_t_wchar, SCM, + scm_t_string_failed_conversion_handler); +static void write_character (scm_t_wchar, SCM, int); + + + /* {Names of immediate symbols} * * This table must agree with the declarations in scm.h: {Immediate Symbols}. @@ -461,79 +469,17 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate) case scm_tc3_imm24: if (SCM_CHARP (exp)) { - scm_t_wchar i = SCM_CHAR (exp); - const char *name; - if (SCM_WRITINGP (pstate)) + write_character (SCM_CHAR (exp), port, 0); + else { - scm_puts ("#\\", port); - name = scm_i_charname (exp); - if (name != NULL) - scm_puts (name, port); - else if (uc_is_general_category_withtable (i, UC_CATEGORY_MASK_L - | UC_CATEGORY_MASK_M - | UC_CATEGORY_MASK_N - | UC_CATEGORY_MASK_P - | UC_CATEGORY_MASK_S)) - /* Print the character if is graphic character. */ - { - scm_t_wchar *wbuf; - SCM wstr; - char *buf; - size_t len; - const char *enc; - - enc = scm_i_get_port_encoding (port); - if (uc_combining_class (i) == UC_CCC_NR) - { - wstr = scm_i_make_wide_string (1, &wbuf); - wbuf[0] = i; - } - else - { - /* Character is a combining character: print it connected - to a dotted circle instead of connecting it to the - backslash in '#\' */ - wstr = scm_i_make_wide_string (2, &wbuf); - wbuf[0] = SCM_CODEPOINT_DOTTED_CIRCLE; - wbuf[1] = i; - } - if (enc == NULL) - { - if (i <= 0xFF) - /* Character is graphic and Latin-1. Print it */ - scm_lfwrite_str (wstr, port); - else - /* Character is graphic but unrepresentable in - this port's encoding. */ - PRINT_CHAR_ESCAPE (i, port); - } - else - { - buf = u32_conv_to_encoding (enc, - iconveh_error, - (scm_t_uint32 *) wbuf, - 1, - NULL, - NULL, &len); - if (buf != NULL) - { - /* Character is graphic. Print it. */ - scm_lfwrite_str (wstr, port); - free (buf); - } - else - /* Character is graphic but unrepresentable in - this port's encoding. */ - PRINT_CHAR_ESCAPE (i, port); - } - } - else - /* Character is a non-graphical character. */ - PRINT_CHAR_ESCAPE (i, port); + if (!display_character (SCM_CHAR (exp), port, + scm_i_get_conversion_strategy (port))) + scm_encoding_error (__func__, errno, + "cannot convert to output locale", + "UTF-32", scm_i_get_port_encoding (port), + scm_string (scm_list_1 (exp))); } - else - scm_i_charprint (i, port); } else if (SCM_IFLAGP (exp) && ((size_t) SCM_IFLAGNUM (exp) < (sizeof iflagnames / sizeof (char *)))) @@ -597,132 +543,13 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate) case scm_tc7_string: if (SCM_WRITINGP (pstate)) { - size_t i, len; - static char const hex[] = "0123456789abcdef"; - char buf[9]; - + size_t len, i; scm_putc ('"', port); len = scm_i_string_length (exp); for (i = 0; i < len; ++i) - { - scm_t_wchar ch = scm_i_string_ref (exp, i); - int printed = 0; - - if (ch == ' ' || ch == '\n') - { - scm_putc (ch, port); - printed = 1; - } - else if (ch == '"' || ch == '\\') - { - scm_putc ('\\', port); - scm_i_charprint (ch, port); - printed = 1; - } - else - if (uc_is_general_category_withtable - (ch, - UC_CATEGORY_MASK_L | UC_CATEGORY_MASK_M | - UC_CATEGORY_MASK_N | UC_CATEGORY_MASK_P | - UC_CATEGORY_MASK_S)) - { - /* Print the character since it is a graphic - character. */ - scm_t_wchar *wbuf; - SCM wstr = scm_i_make_wide_string (1, &wbuf); - char *buf; - size_t len; - - if (scm_i_get_port_encoding (port)) - { - wstr = scm_i_make_wide_string (1, &wbuf); - wbuf[0] = ch; - buf = u32_conv_to_encoding (scm_i_get_port_encoding (port), - iconveh_error, - (scm_t_uint32 *) wbuf, - 1 , - NULL, - NULL, &len); - if (buf != NULL) - { - /* Character is graphic and representable in - this encoding. Print it. */ - scm_lfwrite_str (wstr, port); - free (buf); - printed = 1; - } - } - else - if (ch <= 0xFF) - { - scm_putc (ch, port); - printed = 1; - } - } - - if (!printed) - { - /* Character is graphic but unrepresentable in - this port's encoding or is not graphic. */ - if (!SCM_R6RS_ESCAPES_P) - { - if (ch <= 0xFF) - { - buf[0] = '\\'; - buf[1] = 'x'; - buf[2] = hex[ch / 16]; - buf[3] = hex[ch % 16]; - scm_lfwrite (buf, 4, port); - } - else if (ch <= 0xFFFF) - { - buf[0] = '\\'; - buf[1] = 'u'; - buf[2] = hex[(ch & 0xF000) >> 12]; - buf[3] = hex[(ch & 0xF00) >> 8]; - buf[4] = hex[(ch & 0xF0) >> 4]; - buf[5] = hex[(ch & 0xF)]; - scm_lfwrite (buf, 6, port); - } - else if (ch > 0xFFFF) - { - buf[0] = '\\'; - buf[1] = 'U'; - buf[2] = hex[(ch & 0xF00000) >> 20]; - buf[3] = hex[(ch & 0xF0000) >> 16]; - buf[4] = hex[(ch & 0xF000) >> 12]; - buf[5] = hex[(ch & 0xF00) >> 8]; - buf[6] = hex[(ch & 0xF0) >> 4]; - buf[7] = hex[(ch & 0xF)]; - scm_lfwrite (buf, 8, port); - } - } - else - { - scm_t_wchar ch2 = ch; - - /* Print an R6RS variable-length hex escape: "\xNNNN;" - */ - int i = 8; - buf[i] = ';'; - i --; - if (ch == 0) - buf[i--] = '0'; - else - while (ch2 > 0) - { - buf[i] = hex[ch2 & 0xF]; - ch2 >>= 4; - i --; - } - buf[i] = 'x'; - i --; - buf[i] = '\\'; - scm_lfwrite (buf + i, 9 - i, port); - } - } - } + write_character (scm_i_string_ref (exp, i), port, 1); + scm_putc ('"', port); scm_remember_upto_here_1 (exp); } @@ -917,16 +744,179 @@ scm_prin1 (SCM exp, SCM port, int writingp) } } -/* Print a character. - */ -void -scm_i_charprint (scm_t_wchar ch, SCM port) +/* Attempt to display CH to PORT according to STRATEGY. Return non-zero + if CH was successfully displayed, zero otherwise (e.g., if it was not + representable in PORT's encoding.) */ +static int +display_character (scm_t_wchar ch, SCM port, + scm_t_string_failed_conversion_handler strategy) { - scm_t_wchar *wbuf; - SCM wstr = scm_i_make_wide_string (1, &wbuf); + int printed; + const char *encoding; + + encoding = scm_i_get_port_encoding (port); + if (encoding == NULL) + { + if (ch <= 0xff) + { + scm_putc (ch, port); + printed = 1; + } + else + printed = 0; + } + else + { + size_t len; + char locale_encoded[sizeof (ch)], *result; + + len = sizeof (locale_encoded); + result = u32_conv_to_encoding (encoding, strategy, + (scm_t_uint32 *) &ch, 1, + NULL, locale_encoded, &len); + if (result != NULL) + { + /* CH is graphic; print it. */ + + if (strategy == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE) + { + /* Apply the same escaping syntax as in `write_character'. */ + if (SCM_R6RS_ESCAPES_P) + scm_i_unistring_escapes_to_r6rs_escapes (result, &len); + else + scm_i_unistring_escapes_to_guile_escapes (result, &len); + } - wbuf[0] = ch; - scm_lfwrite_str (wstr, port); + scm_lfwrite (result, len, port); + printed = 1; + + if (SCM_UNLIKELY (result != locale_encoded)) + free (result); + } + else + printed = 0; + } + + return printed; +} + +/* Write CH to PORT, escaping it if it's non-graphic or not + representable in PORT's encoding. If STRING_ESCAPES_P is true and CH + needs to be escaped, it is escaped using the in-string escape syntax; + otherwise the character escape syntax is used. */ +static void +write_character (scm_t_wchar ch, SCM port, int string_escapes_p) +{ + int printed = 0; + + if (string_escapes_p) + { + /* Check if CH deserves special treatment. */ + if (ch == '"' || ch == '\\') + { + scm_putc ('\\', port); + scm_putc (ch, port); + printed = 1; + } + else if (ch == ' ' || ch == '\n') + { + scm_putc (ch, port); + printed = 1; + } + } + else + scm_puts ("#\\", port); + + if (!printed + && uc_is_general_category_withtable (ch, + UC_CATEGORY_MASK_L | + UC_CATEGORY_MASK_M | + UC_CATEGORY_MASK_N | + UC_CATEGORY_MASK_P | + UC_CATEGORY_MASK_S)) + /* CH is graphic; attempt to display it. */ + printed = display_character (ch, port, iconveh_error); + + if (!printed) + { + /* CH isn't graphic or cannot be represented in PORT's + encoding. */ + + if (string_escapes_p) + { + /* Represent CH using the in-string escape syntax. */ + + static const char hex[] = "0123456789abcdef"; + char buf[9]; + + if (!SCM_R6RS_ESCAPES_P) + { + if (ch <= 0xFF) + { + buf[0] = '\\'; + buf[1] = 'x'; + buf[2] = hex[ch / 16]; + buf[3] = hex[ch % 16]; + scm_lfwrite (buf, 4, port); + } + else if (ch <= 0xFFFF) + { + buf[0] = '\\'; + buf[1] = 'u'; + buf[2] = hex[(ch & 0xF000) >> 12]; + buf[3] = hex[(ch & 0xF00) >> 8]; + buf[4] = hex[(ch & 0xF0) >> 4]; + buf[5] = hex[(ch & 0xF)]; + scm_lfwrite (buf, 6, port); + } + else if (ch > 0xFFFF) + { + buf[0] = '\\'; + buf[1] = 'U'; + buf[2] = hex[(ch & 0xF00000) >> 20]; + buf[3] = hex[(ch & 0xF0000) >> 16]; + buf[4] = hex[(ch & 0xF000) >> 12]; + buf[5] = hex[(ch & 0xF00) >> 8]; + buf[6] = hex[(ch & 0xF0) >> 4]; + buf[7] = hex[(ch & 0xF)]; + scm_lfwrite (buf, 8, port); + } + } + else + { + /* Print an R6RS variable-length hex escape: "\xNNNN;". */ + scm_t_wchar ch2 = ch; + + int i = 8; + buf[i] = ';'; + i --; + if (ch == 0) + buf[i--] = '0'; + else + while (ch2 > 0) + { + buf[i] = hex[ch2 & 0xF]; + ch2 >>= 4; + i --; + } + buf[i] = 'x'; + i --; + buf[i] = '\\'; + scm_lfwrite (buf + i, 9 - i, port); + } + } + else + { + /* Represent CH using the character escape syntax. */ + const char *name; + + name = scm_i_charname (SCM_MAKE_CHAR (ch)); + if (name != NULL) + scm_puts (name, port); + else + PRINT_CHAR_ESCAPE (ch, port); + } + } } /* Print an integer. @@ -1248,8 +1238,15 @@ SCM_DEFINE (scm_write_char, "write-char", 1, 1, 0, SCM_VALIDATE_CHAR (1, chr); SCM_VALIDATE_OPORT_VALUE (2, port); - - scm_i_charprint (SCM_CHAR (chr), SCM_COERCE_OUTPORT (port)); + + port = SCM_COERCE_OUTPORT (port); + if (!display_character (SCM_CHAR (chr), port, + scm_i_get_conversion_strategy (port))) + scm_encoding_error (__func__, errno, + "cannot convert to output locale", + "UTF-32", scm_i_get_port_encoding (port), + scm_string (scm_list_1 (chr))); + #if 0 #ifdef HAVE_PIPE # ifdef EPIPE diff --git a/libguile/print.h b/libguile/print.h index ae2aaef54..64d1f4bd8 100644 --- a/libguile/print.h +++ b/libguile/print.h @@ -3,7 +3,7 @@ #ifndef SCM_PRINT_H #define SCM_PRINT_H -/* Copyright (C) 1995,1996,1998,2000,2001, 2003, 2004, 2006, 2008 Free Software Foundation, Inc. +/* Copyright (C) 1995,1996,1998,2000,2001, 2003, 2004, 2006, 2008, 2010 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -78,7 +78,6 @@ SCM_API SCM scm_print_options (SCM setting); SCM_API SCM scm_make_print_state (void); SCM_API void scm_free_print_state (SCM print_state); SCM_INTERNAL SCM scm_i_port_with_print_state (SCM port, SCM print_state); -SCM_INTERNAL void scm_i_charprint (scm_t_wchar c, SCM port); SCM_API void scm_intprint (scm_t_intmax n, int radix, SCM port); SCM_API void scm_uintprint (scm_t_uintmax n, int radix, SCM port); SCM_API void scm_ipruk (char *hdr, SCM ptr, SCM port); -- 2.20.1