From: Michael Gran Date: Sat, 17 Jul 2010 10:45:28 +0000 (-0700) Subject: open-file should handle binary mode and coding declarations X-Git-Url: https://git.hcoop.net/bpt/guile.git/commitdiff_plain/211683cc5c99542dfb6e2a33f7cb8c1f9abbc702 open-file should handle binary mode and coding declarations The open-file port should use the 8-bit ISO-8859-1 encoding when a file is opened using mode "b". Also, it should honor a "coding:" declaration at the top of a file when reading files where it is present. * libguile/fports.c (scm_open_file): modified * test-suite/tests/ports.test: more tests for open-file * doc/ref/api-io.texi (File Ports): more documentation for open-file --- diff --git a/doc/ref/api-io.texi b/doc/ref/api-io.texi index f09ecfbb4..83474a166 100644 --- a/doc/ref/api-io.texi +++ b/doc/ref/api-io.texi @@ -130,8 +130,8 @@ this fluid otherwise. @deffn {Scheme Procedure} port-encoding port @deffnx {C Function} scm_port_encoding -Returns, as a string, the character encoding that @var{port} uses to -interpret its input and output. +Returns, as a string, the character encoding that @var{port} uses to interpret +its input and output. The value @code{#f} is equivalent to @code{"ISO-8859-1"}. @end deffn @deffn {Scheme Procedure} set-port-conversion-strategy! port sym @@ -856,11 +856,25 @@ systems, but has no effect on Unix-like systems. (For reference, Guile leaves text versus binary up to the C library, @code{b} here just adds @code{O_BINARY} to the underlying @code{open} call, when that flag is available.) + +Also, open the file using the 8-bit character encoding "ISO-8859-1", +ignoring any coding declaration or port encoding. + +Note that, when reading or writing binary data with ports, the +bytevector ports in the @code{(rnrs io ports)} module are preferred, +as they return vectors, and not strings (@pxref{R6RS I/O Ports}). @end table If a file cannot be opened with the access requested, @code{open-file} throws an exception. +When the file is opened, this procedure will scan for a coding +declaration (@pxref{Character Encoding of Source Files}). If present +will use that encoding for interpreting the file. Otherwise, the +port's encoding will be used. To supress this behavior, open +the file in binary mode and then set the port encoding explicitly +using @code{set-port-encoding!}. + In theory we could create read/write ports which were buffered in one direction only. However this isn't included in the current interfaces. diff --git a/libguile/fports.c b/libguile/fports.c index 04f3815d0..4a24dc63f 100644 --- a/libguile/fports.c +++ b/libguile/fports.c @@ -315,7 +315,7 @@ fport_canonicalize_filename (SCM filename) * Return the new port. */ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, - (SCM filename, SCM mode), + (SCM filename, SCM mode), "Open the file whose name is @var{filename}, and return a port\n" "representing that file. The attributes of the port are\n" "determined by the @var{mode} string. The way in which this is\n" @@ -336,7 +336,10 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, "The following additional characters can be appended:\n" "@table @samp\n" "@item b\n" - "Open the underlying file in binary mode, if supported by the operating system. " + "Open the underlying file in binary mode, if supported by the system.\n" + "Also, open the file using the binary-compatible character encoding\n" + "\"ISO-8859-1\", ignoring the port's encoding and the coding declaration\n" + "at the top of the input file, if any.\n" "@item +\n" "Open the port for both input and output. E.g., @code{r+}: open\n" "an existing file for both input and output.\n" @@ -351,6 +354,11 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, "Add line-buffering to the port. The port output buffer will be\n" "automatically flushed whenever a newline character is written.\n" "@end table\n" + "When the file is opened, this procedure will scan for a coding\n" + "declaration@pxref{Character Encoding of Source Files}. If present\n" + "will use that encoding for interpreting the file. Otherwise, the\n" + "port's encoding will be used.\n" + "\n" "In theory we could create read/write ports which were buffered\n" "in one direction only. However this isn't included in the\n" "current interfaces. If a file cannot be opened with the access\n" @@ -358,7 +366,7 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, #define FUNC_NAME s_scm_open_file { SCM port; - int fdes, flags = 0; + int fdes, flags = 0, use_encoding = 1; unsigned int retries; char *file, *md, *ptr; @@ -393,6 +401,7 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, flags = (flags & ~(O_RDONLY | O_WRONLY)) | O_RDWR; break; case 'b': + use_encoding = 0; #if defined (O_BINARY) flags |= O_BINARY; #endif @@ -426,9 +435,27 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, } } + /* Create a port from this file descriptor. The port's encoding is initially + %default-port-encoding. */ port = scm_i_fdes_to_port (fdes, scm_i_mode_bits (mode), fport_canonicalize_filename (filename)); + if (use_encoding) + { + /* If this file has a coding declaration, use that as the port + encoding. */ + if (SCM_INPUT_PORT_P (port)) + { + char *enc = scm_i_scan_for_encoding (port); + if (enc != NULL) + scm_i_set_port_encoding_x (port, enc); + } + } + else + /* If this is a binary file, use the binary-friendly ISO-8859-1 + encoding. */ + scm_i_set_port_encoding_x (port, NULL); + scm_dynwind_end (); return port; diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test index 9f9985aca..bb5c17336 100644 --- a/test-suite/tests/ports.test +++ b/test-suite/tests/ports.test @@ -21,7 +21,8 @@ :use-module (test-suite lib) :use-module (test-suite guile-test) :use-module (ice-9 popen) - :use-module (ice-9 rdelim)) + :use-module (ice-9 rdelim) + :use-module (rnrs bytevectors)) (define (display-line . args) (for-each display args) @@ -184,19 +185,76 @@ ;;; read-line should use the port encoding (not the locale encoding). (let ((str "ĉu bone?")) - (with-locale "C" - (let* ((filename (test-file)) - (port (open-file filename "wl"))) - (set-port-encoding! port "UTF-8") - (write-line str port) - (let ((in-port (open-input-file filename))) - (set-port-encoding! in-port "UTF-8") - (let ((line (read-line in-port))) - (close-port in-port) - (close-port port) - (pass-if "file: read-line honors port encoding" - (string=? line str)))) - (delete-file filename)))) + (with-locale "C" + (let* ((filename (test-file)) + (port (open-file filename "wl"))) + (set-port-encoding! port "UTF-8") + (write-line str port) + (let ((in-port (open-input-file filename))) + (set-port-encoding! in-port "UTF-8") + (let ((line (read-line in-port))) + (close-port in-port) + (close-port port) + (pass-if "file: read-line honors port encoding" + (string=? line str)))) + (delete-file filename)))) + +;;; binary mode ignores port encoding +(pass-if "file: binary mode ignores port encoding" + (with-fluids ((%default-port-encoding "UTF-8")) + (let* ((filename (test-file)) + (port (open-file filename "w")) + (test-string "一二三") + (binary-test-string + (apply string + (map integer->char + (uniform-vector->list + (string->utf8 test-string)))))) + (write-line test-string port) + (close-port port) + (let* ((in-port (open-file filename "rb")) + (line (read-line in-port))) + (close-port in-port) + (delete-file filename) + (string=? line binary-test-string))))) + +;;; binary mode ignores file coding declaration +(pass-if "file: binary mode ignores file coding declaration" + (with-fluids ((%default-port-encoding "UTF-8")) + (let* ((filename (test-file)) + (port (open-file filename "w")) + (test-string "一二三") + (binary-test-string + (apply string + (map integer->char + (uniform-vector->list + (string->utf8 test-string)))))) + (write-line ";; coding: utf-8" port) + (write-line test-string port) + (close-port port) + (let* ((in-port (open-file filename "rb")) + (line1 (read-line in-port)) + (line2 (read-line in-port))) + (close-port in-port) + (delete-file filename) + (string=? line2 binary-test-string))))) + +;; open-file honors file coding declarations +(pass-if "file: open-file honors coding declarations" + (with-fluids ((%default-port-encoding "UTF-8")) + (let* ((filename (test-file)) + (port (open-output-file filename)) + (test-string "€100")) + (set-port-encoding! port "ISO-8859-15") + (write-line ";; coding: iso-8859-15" port) + (write-line test-string port) + (close-port port) + (let* ((in-port (open-input-file filename)) + (line1 (read-line in-port)) + (line2 (read-line in-port))) + (close-port in-port) + (delete-file filename) + (string=? line2 test-string))))) ;;; ungetting characters and strings. (with-input-from-string "walk on the moon\nmoon"