From fd5eec2b6e113f6d13028215a738417607432a2d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Ludovic=20Court=C3=A8s?= Date: Wed, 15 Sep 2010 18:38:57 +0200 Subject: [PATCH] Optimize `peek-char'. This makes `peek-char' 40x faster on a port whose encoding is [text missing in the original commit message] faster on a UTF-8 port containing multi-byte codepoints. The `xml->sxml' procedure is 4x faster on a 2.7 MiB XML file. * libguile/ports.c (get_codepoint): New procedure, moved here from `scm_getc', with the additional BUF and LEN parameters. (scm_getc): Use it. (scm_peek_char): Use it instead of the `scm_getc'/`scm_ungetc' sequence. * test-suite/tests/ports.test ("string ports")["peek-char [latin-1]", "peek-char [utf-8]"]: New tests. * benchmark-suite/Makefile.am (SCM_BENCHMARKS): Add `benchmarks/ports.bm'. * benchmark-suite/benchmarks/ports.bm: New file. --- benchmark-suite/Makefile.am | 1 + benchmark-suite/benchmarks/ports.bm | 67 +++++++++++++++++++++++++++++ libguile/ports.c | 58 +++++++++++++++++++------ test-suite/tests/ports.test | 20 ++++++++- 4 files changed, 132 insertions(+), 14 deletions(-) create mode 100644 benchmark-suite/benchmarks/ports.bm diff --git a/benchmark-suite/Makefile.am b/benchmark-suite/Makefile.am index 9f49f2aad..e2aad9148 100644 --- a/benchmark-suite/Makefile.am +++ b/benchmark-suite/Makefile.am @@ -4,6 +4,7 @@ SCM_BENCHMARKS = benchmarks/0-reference.bm \ benchmarks/continuations.bm \ benchmarks/if.bm \ benchmarks/logand.bm \ + benchmarks/ports.bm \ benchmarks/read.bm \ benchmarks/srfi-1.bm \ benchmarks/srfi-13.bm \ diff --git a/benchmark-suite/benchmarks/ports.bm b/benchmark-suite/benchmarks/ports.bm new file mode 100644 index 000000000..917a7ddbe --- /dev/null +++ b/benchmark-suite/benchmarks/ports.bm @@ -0,0 +1,67 @@ +;;; ports.bm --- Port I/O. -*- mode: scheme; coding: utf-8; -*- +;;; +;;; Copyright (C) 2010 Free Software Foundation, Inc. 
+;;; +;;; This program is free software; you can redistribute it and/or +;;; modify it under the terms of the GNU Lesser General Public License +;;; as published by the Free Software Foundation; either version 3, or +;;; (at your option) any later version. +;;; +;;; This program is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU Lesser General Public License for more details. +;;; +;;; You should have received a copy of the GNU Lesser General Public +;;; License along with this software; see the file COPYING.LESSER. If +;;; not, write to the Free Software Foundation, Inc., 51 Franklin +;;; Street, Fifth Floor, Boston, MA 02110-1301 USA + +(define-module (benchmarks ports) + #:use-module (benchmark-suite lib)) + +(define %latin1-port + (with-fluids ((%default-port-encoding #f)) + (open-input-string "hello, world"))) + +(define %utf8/ascii-port + (with-fluids ((%default-port-encoding "UTF-8")) + (open-input-string "hello, world"))) + +(define %utf8/wide-port + (with-fluids ((%default-port-encoding "UTF-8")) + (open-input-string "안녕하세요"))) + + +(with-benchmark-prefix "peek-char" + + (benchmark "latin-1 port" 700000 + (peek-char %latin1-port)) + + (benchmark "utf-8 port, ascii character" 700000 + (peek-char %utf8/ascii-port)) + + (benchmark "utf-8 port, Korean character" 700000 + (peek-char %utf8/wide-port))) + +(with-benchmark-prefix "read-char" + + (benchmark "latin-1 port" 10000000 + (read-char %latin1-port)) + + (benchmark "utf-8 port, ascii character" 10000000 + (read-char %utf8/ascii-port)) + + (benchmark "utf-8 port, Korean character" 10000000 + (read-char %utf8/wide-port))) + +(with-benchmark-prefix "char-ready?" + + (benchmark "latin-1 port" 10000000 + (char-ready? %latin1-port)) + + (benchmark "utf-8 port, ascii character" 10000000 + (char-ready? 
%utf8/ascii-port)) + + (benchmark "utf-8 port, Korean character" 10000000 + (char-ready? %utf8/wide-port))) diff --git a/libguile/ports.c b/libguile/ports.c index 7c3791d22..6cf0de2cc 100644 --- a/libguile/ports.c +++ b/libguile/ports.c @@ -1023,13 +1023,15 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0, #define SCM_MBCHAR_BUF_SIZE (4) -/* Get one codepoint from a file, using the port's encoding. */ -scm_t_wchar -scm_getc (SCM port) +/* Read a codepoint from PORT and return it. Fill BUF with the byte + representation of the codepoint in PORT's encoding, and set *LEN to + the length in bytes of that representation. Raise an error on + failure. */ +static scm_t_wchar +get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len) { int c; - unsigned int bufcount = 0; - char buf[SCM_MBCHAR_BUF_SIZE]; + size_t bufcount = 0; scm_t_uint32 result_buf; scm_t_wchar codepoint = 0; scm_t_uint32 *u32; @@ -1133,6 +1135,8 @@ scm_getc (SCM port) break; } + *len = bufcount; + return codepoint; failure: @@ -1155,6 +1159,15 @@ scm_getc (SCM port) return 0; } +/* Read a codepoint from PORT and return it. */ +scm_t_wchar +scm_getc (SCM port) +{ + size_t len; + char buf[SCM_MBCHAR_BUF_SIZE]; + + return get_codepoint (port, buf, &len); +} /* this should only be called when the read buffer is empty. it tries to refill the read buffer. 
it returns the first char from @@ -1635,18 +1648,37 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0, "to @code{read-char} would have hung.") #define FUNC_NAME s_scm_peek_char { - scm_t_wchar c, column; + SCM result; + scm_t_wchar c; + char bytes[SCM_MBCHAR_BUF_SIZE]; + long column, line; + size_t len; + if (SCM_UNBNDP (port)) port = scm_current_input_port (); else SCM_VALIDATE_OPINPORT (1, port); - column = SCM_COL(port); - c = scm_getc (port); - if (EOF == c) - return SCM_EOF_VAL; - scm_ungetc (c, port); - SCM_COL(port) = column; - return SCM_MAKE_CHAR (c); + + column = SCM_COL (port); + line = SCM_LINUM (port); + + c = get_codepoint (port, bytes, &len); + if (c == EOF) + result = SCM_EOF_VAL; + else + { + long i; + + result = SCM_MAKE_CHAR (c); + + for (i = len - 1; i >= 0; i--) + scm_unget_byte (bytes[i], port); + + SCM_COL (port) = column; + SCM_LINUM (port) = line; + } + + return result; } #undef FUNC_NAME diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test index bb5c17336..4edd53127 100644 --- a/test-suite/tests/ports.test +++ b/test-suite/tests/ports.test @@ -422,7 +422,25 @@ (and (eq? faulty-str str) (string=? from "UTF-32") (string=? to "ISO-8859-1") - (string? (strerror errno)))))))) + (string? (strerror errno))))))) + + (pass-if "peek-char [latin-1]" + (let ((p (with-fluids ((%default-port-encoding #f)) + (open-input-string "hello, world")))) + (and (char=? (peek-char p) #\h) + (char=? (peek-char p) #\h) + (char=? (peek-char p) #\h) + (= (port-line p) 0) + (= (port-column p) 0)))) + + (pass-if "peek-char [utf-8]" + (let ((p (with-fluids ((%default-port-encoding "UTF-8")) + (open-input-string "안녕하세요")))) + (and (char=? (peek-char p) #\안) + (char=? (peek-char p) #\안) + (char=? (peek-char p) #\안) + (= (port-line p) 0) + (= (port-column p) 0))))) (with-test-prefix "call-with-output-string" -- 2.20.1