Rewrite %initialize-object in Scheme
[bpt/guile.git] / module / texinfo / string-utils.scm
CommitLineData
47f3ce52
AW
1;;;; (texinfo string-utils) -- text filling and wrapping
2;;;;
43c2a483 3;;;; Copyright (C) 2009, 2013 Free Software Foundation, Inc.
47f3ce52
AW
4;;;; Copyright (C) 2003 Richard Todd
5;;;;
6;;;; This library is free software; you can redistribute it and/or
7;;;; modify it under the terms of the GNU Lesser General Public
8;;;; License as published by the Free Software Foundation; either
9;;;; version 3 of the License, or (at your option) any later version.
10;;;;
11;;;; This library is distributed in the hope that it will be useful,
12;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14;;;; Lesser General Public License for more details.
15;;;;
16;;;; You should have received a copy of the GNU Lesser General Public
17;;;; License along with this library; if not, write to the Free Software
18;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19;;;;
20\f
21;;; Commentary:
22;; Module @samp{(texinfo string-utils)} provides various string-related
23;; functions useful to Guile's texinfo support.
24;;; Code:
25
26(define-module (texinfo string-utils)
27 #:use-module (srfi srfi-13)
28 #:use-module (srfi srfi-14)
29 #:use-module (oop goops)
30 #:export (escape-special-chars
31 transform-string
32 expand-tabs
33 center-string
34 left-justify-string
35 right-justify-string
36 collapse-repeated-chars
37 make-text-wrapper
38 fill-string
39 string->wrapped-lines))
40
41(define* (transform-string str match? replace #:optional (start #f) (end #f))
42"Uses @var{match?} against each character in @var{str}, and performs a
43replacement on each character for which matches are found.
44
45@var{match?} may either be a function, a character, a string, or
46@code{#t}. If @var{match?} is a function, then it takes a single
47character as input, and should return @samp{#t} for matches.
48@var{match?} is a character, it is compared to each string character
49using @code{char=?}. If @var{match?} is a string, then any character
50in that string will be considered a match. @code{#t} will cause
51every character to be a match.
52
53If @var{replace} is a function, it is called with the matched
54character as an argument, and the returned value is sent to the output
55string via @samp{display}. If @var{replace} is anything else, it is
56sent through the output string via @samp{display}.
57
58Note that te replacement for the matched characters does not need to
59be a single character. That is what differentiates this function from
60@samp{string-map}, and what makes it useful for applications such as
61converting @samp{#\\&} to @samp{\"&\"} in web page text. Some other
62functions in this module are just wrappers around common uses of
63@samp{transform-string}. Transformations not possible with this
64function should probably be done with regular expressions.
65
66If @var{start} and @var{end} are given, they control which portion
67of the string undergoes transformation. The entire input string
68is still output, though. So, if @var{start} is @samp{5}, then the
69first five characters of @var{str} will still appear in the returned
70string.
71
72@lisp
73; these two are equivalent...
74 (transform-string str #\\space #\\-) ; change all spaces to -'s
75 (transform-string str (lambda (c) (char=? #\\space c)) #\\-)
76@end lisp"
77 ;; I had implemented this with string-fold, but it was
78 ;; slower...
79 (let* ((os (open-output-string))
80 (matcher (cond ((char? match?)
81 (lambda (c) (char=? match? c)))
82 ((procedure? match?)
83 match?)
84 ((string? match?)
85 (lambda (c) (string-index match? c)))
86 ((boolean? match?)
87 (lambda (c) match?))
88 (else (throw 'bad-type "expected #t, char, string, or procedure"))))
89 (replacer (if (procedure? replace)
90 (lambda (c) (display (replace c) os))
91 (lambda (c) (display replace os)))))
92
93 ;; put the first part in, un-transformed if they asked for it...
94 (if (and start (<= start (string-length str)))
95 (display (substring str 0 start) os))
96
97 ;; process the portion they want processed....
98 (string-for-each
99 (lambda (c)
100 (if (matcher c)
101 ;; we have a match! replace the char as directed...
102 (replacer c)
103
104 ;; not a match, just insert the character itself...
105 (write-char c os)))
106 str
107 (or start 0)
108 (or end (string-length str)))
109
110 ;; if there was any at the end, tack it on...
111 (if (and end (< end (string-length str)))
112 (display (substring str end) os))
113
114 (get-output-string os)))
115
116(define* (expand-tabs str #:optional (tab-size 8))
117"Returns a copy of @var{str} with all tabs expanded to spaces. @var{tab-size} defaults to 8.
118
119Assuming tab size of 8, this is equivalent to: @lisp
120 (transform-string str #\\tab \" \")
121@end lisp"
122 (transform-string str
123 #\tab
124 (make-string tab-size #\space)))
125
126(define (escape-special-chars str special-chars escape-char)
127"Returns a copy of @var{str} with all given special characters preceded
128by the given @var{escape-char}.
129
130@var{special-chars} can either be a single character, or a string consisting
131of all the special characters.
132
133@lisp
134;; make a string regexp-safe...
135 (escape-special-chars \"***(Example String)***\"
136 \"[]()/*.\"
137 #\\\\)
138=> \"\\\\*\\\\*\\\\*\\\\(Example String\\\\)\\\\*\\\\*\\\\*\"
139
140;; also can escape a singe char...
141 (escape-special-chars \"richardt@@vzavenue.net\"
142 #\\@@
143 #\\@@)
144=> \"richardt@@@@vzavenue.net\"
145@end lisp"
146 (transform-string str
147 (if (char? special-chars)
148 ;; if they gave us a char, use char=?
149 (lambda (c) (char=? c special-chars))
150
151 ;; if they gave us a string, see if our character is in it
152 (lambda (c) (string-index special-chars c)))
153
154 ;; replace matches with the character preceded by the escape character
155 (lambda (c) (string escape-char c))))
156
157(define* (center-string str #:optional (width 80) (chr #\space) (rchr #f))
158"Returns a copy of @var{str} centered in a field of @var{width}
159characters. Any needed padding is done by character @var{chr}, which
160defaults to @samp{#\\space}. If @var{rchr} is provided, then the
161padding to the right will use it instead. See the examples below.
162left and @var{rchr} on the right. The default @var{width} is 80. The
91a214eb 163default @var{chr} and @var{rchr} is @samp{#\\space}. The string is
47f3ce52
AW
164never truncated.
165@lisp
166 (center-string \"Richard Todd\" 24)
167=> \" Richard Todd \"
168
169 (center-string \" Richard Todd \" 24 #\\=)
170=> \"===== Richard Todd =====\"
171
172 (center-string \" Richard Todd \" 24 #\\< #\\>)
173=> \"<<<<< Richard Todd >>>>>\"
174@end lisp"
175 (let* ((len (string-length str))
176 (lpad (make-string (max (quotient (- width len) 2) 0) chr))
177 ;; right-char == char unless it has been provided by the user
178 (right-chr (or rchr chr))
179 (rpad (if (char=? right-chr chr)
180 lpad
181 (make-string (max (quotient (- width len) 2) 0) right-chr))))
182 (if (>= len width)
183 str
184 (string-append lpad str rpad (if (odd? (- width len)) (string right-chr) "")))))
185
186(define* (left-justify-string str #:optional (width 80) (chr #\space))
187"@code{left-justify-string str [width chr]}.
188Returns a copy of @var{str} padded with @var{chr} such that it is left
189justified in a field of @var{width} characters. The default
190@var{width} is 80. Unlike @samp{string-pad} from srfi-13, the string
191is never truncated."
192 (let* ((len (string-length str))
193 (pad (make-string (max (- width len) 0) chr)))
194 (if (>= len width)
195 str
196 (string-append str pad))))
197
198(define* (right-justify-string str #:optional (width 80) (chr #\space))
199"Returns a copy of @var{str} padded with @var{chr} such that it is
200right justified in a field of @var{width} characters. The default
201@var{width} is 80. The default @var{chr} is @samp{#\\space}. Unlike
202@samp{string-pad} from srfi-13, the string is never truncated."
203 (let* ((len (string-length str))
204 (pad (make-string (max (- width len) 0) chr)))
205 (if (>= len width)
206 str
207 (string-append pad str))))
208
209 (define* (collapse-repeated-chars str #:optional (chr #\space) (num 1))
210"Returns a copy of @var{str} with all repeated instances of
211@var{chr} collapsed down to at most @var{num} instances.
212The default value for @var{chr} is @samp{#\\space}, and
213the default value for @var{num} is 1.
214
215@lisp
216 (collapse-repeated-chars \"H e l l o\")
217=> \"H e l l o\"
218 (collapse-repeated-chars \"H--e--l--l--o\" #\\-)
219=> \"H-e-l-l-o\"
220 (collapse-repeated-chars \"H-e--l---l----o\" #\\- 2)
221=> \"H-e--l--l--o\"
222@end lisp"
223 ;; define repeat-locator as a stateful match? function which remembers
224 ;; the last character it had seen.
225 (let ((repeat-locator
226 ;; initialize prev-chr to something other than what we're seeking...
227 (let ((prev-chr (if (char=? chr #\space) #\A #\space))
228 (match-count 0))
229 (lambda (c)
230 (if (and (char=? c prev-chr)
231 (char=? prev-chr chr))
232 ;; found enough duplicates if the match-count is high enough
233 (begin
234 (set! match-count (+ 1 match-count))
235 (>= match-count num))
236
237 ;; did not find a duplicate
238 (begin (set! match-count 0)
239 (set! prev-chr c)
240 #f))))))
241
242 ;; transform the string with our stateful matcher...
243 ;; deleting matches...
244 (transform-string str repeat-locator "")))
245
246;; split a text string into segments that have the form...
247;; <ws non-ws> <ws non-ws> etc..
248(define (split-by-single-words str)
249 (let ((non-wschars (char-set-complement char-set:whitespace)))
250 (let loop ((ans '())
251 (index 0))
252 (let ((next-non-ws (string-index str non-wschars index)))
253 (if next-non-ws
254 ;; found non-ws...look for ws following...
255 (let ((next-ws (string-index str char-set:whitespace next-non-ws)))
256 (if next-ws
257 ;; found the ws following...
258 (loop (cons (substring str index next-ws) ans)
259 next-ws)
260 ;; did not find ws...must be the end...
261 (reverse (cons (substring str index) ans))))
262 ;; did not find non-ws... only ws at end of the string...
263 (reverse ans))))))
264
43c2a483
LC
265(define (end-of-sentence? str)
266 "Return #t when STR likely denotes the end of sentence."
267 (let ((len (string-length str)))
268 (and (> len 1)
269 (eqv? #\. (string-ref str (- len 1)))
270 (not (eqv? #\. (string-ref str (- len 2)))))))
271
47f3ce52
AW
272(define* (make-text-wrapper #:key
273 (line-width 80)
274 (expand-tabs? #t)
275 (tab-width 8)
276 (collapse-whitespace? #t)
277 (subsequent-indent "")
278 (initial-indent "")
279 (break-long-words? #t))
280 "Returns a procedure that will split a string into lines according to the
281given parameters.
282
283@table @code
284@item #:line-width
285This is the target length used when deciding where to wrap lines.
286Default is 80.
287
288@item #:expand-tabs?
289Boolean describing whether tabs in the input should be expanded. Default
290is #t.
291
292@item #:tab-width
293If tabs are expanded, this will be the number of spaces to which they
294expand. Default is 8.
295
296@item #:collapse-whitespace?
297Boolean describing whether the whitespace inside the existing text
298should be removed or not. Default is #t.
299
300If text is already well-formatted, and is just being wrapped to fit in a
301different width, then set this to @samp{#f}. This way, many common text
302conventions (such as two spaces between sentences) can be preserved if
303in the original text. If the input text spacing cannot be trusted, then
304leave this setting at the default, and all repeated whitespace will be
305collapsed down to a single space.
306
307@item #:initial-indent
308Defines a string that will be put in front of the first line of wrapped
309text. Default is the empty string, ``''.
310
311@item #:subsequent-indent
312Defines a string that will be put in front of all lines of wrapped
313text, except the first one. Default is the empty string, ``''.
314
315@item #:break-long-words?
316If a single word is too big to fit on a line, this setting tells the
317wrapper what to do. Defaults to #t, which will break up long words.
318When set to #f, the line will be allowed, even though it is longer
319than the defined @code{#:line-width}.
320@end table
321
322The return value is a procedure of one argument, the input string, which
323returns a list of strings, where each element of the list is one line."
324 (lambda (str)
325 ;; replace newlines with spaces
326 (set! str (transform-string str (lambda (c) (char=? c #\nl)) #\space))
327
328 ;; expand tabs if they wanted us to...
329 (if expand-tabs?
330 (set! str (expand-tabs str tab-width)))
331
332 ;; collapse whitespace if they wanted us to...
333 (if collapse-whitespace?
334 (set! str (collapse-repeated-chars str)))
335
336 ;; drop any whitespace from the front...
337 (set! str (string-trim str))
338
339 ;; now start breaking the text into lines...
340 (let loop ((ans '())
341 (words (split-by-single-words str))
342 (line initial-indent)
343 (count 0))
344 (if (null? words)
345 ;; out of words? ...done!
346 (reverse (if (> count 0)
347 (cons line ans)
348 ans))
349
350 ;; not out of words...keep going...
351 (let ((length-left (- line-width
352 (string-length line)))
353 (next-word (if (= count 0)
354 (string-trim (car words))
355 (car words))))
356 (cond
357 ;; does the next entry fit?
358 ((<= (string-length next-word)
359 length-left)
360 (loop ans
361 (cdr words)
43c2a483
LC
362 (if (and collapse-whitespace?
363 (end-of-sentence? line))
364 ;; Add an extra space after the period.
365 (string-append line " " next-word)
366 (string-append line next-word))
47f3ce52
AW
367 (+ count 1)))
368
369 ;; ok, it didn't fit...is there already at least one word on the line?
370 ((> count 0)
371 ;; try to use it for the next line, then...
372 (loop (cons line ans)
373 words
374 subsequent-indent
375 0))
376
377 ;; ok, it didn't fit...and it's the first word.
378 ;; were we told to break up long words?
379 (break-long-words?
380 ;; break the like at the limit, since the user wants us to...
381 (loop (cons (string-append line (substring next-word 0 length-left))
382 ans)
383 (cons (substring next-word length-left)
384 (cdr words))
385 subsequent-indent
386 0))
387
388 ;; well, then is it the first word and we *shouldn't* break long words, then...
389 (else
390 (loop (cons (string-append line next-word)
391 ans)
392 (cdr words)
393 subsequent-indent
394 0))))))))
395
396(define (string->wrapped-lines str . kwargs)
397 "@code{string->wrapped-lines str keywds ...}. Wraps the text given in
398string @var{str} according to the parameters provided in @var{keywds},
399or the default setting if they are not given. Returns a list of strings
400representing the formatted lines. Valid keyword arguments are discussed
401in @code{make-text-wrapper}."
402 ((apply make-text-wrapper kwargs) str))
403
404(define (fill-string str . kwargs)
405 "Wraps the text given in string @var{str} according to the parameters
91a214eb 406provided in @var{kwargs}, or the default setting if they are not
47f3ce52
AW
407given. Returns a single string with the wrapped text. Valid keyword
408arguments are discussed in @code{make-text-wrapper}."
409 (string-join (apply string->wrapped-lines str kwargs)
410 "\n"
411 'infix))