Commit | Line | Data |
---|---|---|
47f3ce52 AW |
1 | ;;;; (texinfo string-utils) -- text filling and wrapping |
2 | ;;;; | |
43c2a483 | 3 | ;;;; Copyright (C) 2009, 2013 Free Software Foundation, Inc. |
47f3ce52 AW |
4 | ;;;; Copyright (C) 2003 Richard Todd |
5 | ;;;; | |
6 | ;;;; This library is free software; you can redistribute it and/or | |
7 | ;;;; modify it under the terms of the GNU Lesser General Public | |
8 | ;;;; License as published by the Free Software Foundation; either | |
9 | ;;;; version 3 of the License, or (at your option) any later version. | |
10 | ;;;; | |
11 | ;;;; This library is distributed in the hope that it will be useful, | |
12 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | ;;;; Lesser General Public License for more details. | |
15 | ;;;; | |
16 | ;;;; You should have received a copy of the GNU Lesser General Public | |
17 | ;;;; License along with this library; if not, write to the Free Software | |
18 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 | ;;;; | |
20 | \f | |
21 | ;;; Commentary: | |
22 | ;; Module @samp{(texinfo string-utils)} provides various string-related | |
23 | ;; functions useful to Guile's texinfo support. | |
24 | ;;; Code: | |
25 | ||
26 | (define-module (texinfo string-utils) | |
27 | #:use-module (srfi srfi-13) | |
28 | #:use-module (srfi srfi-14) | |
29 | #:use-module (oop goops) | |
30 | #:export (escape-special-chars | |
31 | transform-string | |
32 | expand-tabs | |
33 | center-string | |
34 | left-justify-string | |
35 | right-justify-string | |
36 | collapse-repeated-chars | |
37 | make-text-wrapper | |
38 | fill-string | |
39 | string->wrapped-lines)) | |
40 | ||
41 | (define* (transform-string str match? replace #:optional (start #f) (end #f)) | |
42 | "Uses @var{match?} against each character in @var{str}, and performs a | |
43 | replacement on each character for which matches are found. | |
44 | ||
45 | @var{match?} may either be a function, a character, a string, or | |
46 | @code{#t}. If @var{match?} is a function, then it takes a single | |
47 | character as input, and should return @samp{#t} for matches. | |
48 | @var{match?} is a character, it is compared to each string character | |
49 | using @code{char=?}. If @var{match?} is a string, then any character | |
50 | in that string will be considered a match. @code{#t} will cause | |
51 | every character to be a match. | |
52 | ||
53 | If @var{replace} is a function, it is called with the matched | |
54 | character as an argument, and the returned value is sent to the output | |
55 | string via @samp{display}. If @var{replace} is anything else, it is | |
56 | sent through the output string via @samp{display}. | |
57 | ||
58 | Note that te replacement for the matched characters does not need to | |
59 | be a single character. That is what differentiates this function from | |
60 | @samp{string-map}, and what makes it useful for applications such as | |
61 | converting @samp{#\\&} to @samp{\"&\"} in web page text. Some other | |
62 | functions in this module are just wrappers around common uses of | |
63 | @samp{transform-string}. Transformations not possible with this | |
64 | function should probably be done with regular expressions. | |
65 | ||
66 | If @var{start} and @var{end} are given, they control which portion | |
67 | of the string undergoes transformation. The entire input string | |
68 | is still output, though. So, if @var{start} is @samp{5}, then the | |
69 | first five characters of @var{str} will still appear in the returned | |
70 | string. | |
71 | ||
72 | @lisp | |
73 | ; these two are equivalent... | |
74 | (transform-string str #\\space #\\-) ; change all spaces to -'s | |
75 | (transform-string str (lambda (c) (char=? #\\space c)) #\\-) | |
76 | @end lisp" | |
77 | ;; I had implemented this with string-fold, but it was | |
78 | ;; slower... | |
79 | (let* ((os (open-output-string)) | |
80 | (matcher (cond ((char? match?) | |
81 | (lambda (c) (char=? match? c))) | |
82 | ((procedure? match?) | |
83 | match?) | |
84 | ((string? match?) | |
85 | (lambda (c) (string-index match? c))) | |
86 | ((boolean? match?) | |
87 | (lambda (c) match?)) | |
88 | (else (throw 'bad-type "expected #t, char, string, or procedure")))) | |
89 | (replacer (if (procedure? replace) | |
90 | (lambda (c) (display (replace c) os)) | |
91 | (lambda (c) (display replace os))))) | |
92 | ||
93 | ;; put the first part in, un-transformed if they asked for it... | |
94 | (if (and start (<= start (string-length str))) | |
95 | (display (substring str 0 start) os)) | |
96 | ||
97 | ;; process the portion they want processed.... | |
98 | (string-for-each | |
99 | (lambda (c) | |
100 | (if (matcher c) | |
101 | ;; we have a match! replace the char as directed... | |
102 | (replacer c) | |
103 | ||
104 | ;; not a match, just insert the character itself... | |
105 | (write-char c os))) | |
106 | str | |
107 | (or start 0) | |
108 | (or end (string-length str))) | |
109 | ||
110 | ;; if there was any at the end, tack it on... | |
111 | (if (and end (< end (string-length str))) | |
112 | (display (substring str end) os)) | |
113 | ||
114 | (get-output-string os))) | |
115 | ||
116 | (define* (expand-tabs str #:optional (tab-size 8)) | |
117 | "Returns a copy of @var{str} with all tabs expanded to spaces. @var{tab-size} defaults to 8. | |
118 | ||
119 | Assuming tab size of 8, this is equivalent to: @lisp | |
120 | (transform-string str #\\tab \" \") | |
121 | @end lisp" | |
122 | (transform-string str | |
123 | #\tab | |
124 | (make-string tab-size #\space))) | |
125 | ||
126 | (define (escape-special-chars str special-chars escape-char) | |
127 | "Returns a copy of @var{str} with all given special characters preceded | |
128 | by the given @var{escape-char}. | |
129 | ||
130 | @var{special-chars} can either be a single character, or a string consisting | |
131 | of all the special characters. | |
132 | ||
133 | @lisp | |
134 | ;; make a string regexp-safe... | |
135 | (escape-special-chars \"***(Example String)***\" | |
136 | \"[]()/*.\" | |
137 | #\\\\) | |
138 | => \"\\\\*\\\\*\\\\*\\\\(Example String\\\\)\\\\*\\\\*\\\\*\" | |
139 | ||
140 | ;; also can escape a singe char... | |
141 | (escape-special-chars \"richardt@@vzavenue.net\" | |
142 | #\\@@ | |
143 | #\\@@) | |
144 | => \"richardt@@@@vzavenue.net\" | |
145 | @end lisp" | |
146 | (transform-string str | |
147 | (if (char? special-chars) | |
148 | ;; if they gave us a char, use char=? | |
149 | (lambda (c) (char=? c special-chars)) | |
150 | ||
151 | ;; if they gave us a string, see if our character is in it | |
152 | (lambda (c) (string-index special-chars c))) | |
153 | ||
154 | ;; replace matches with the character preceded by the escape character | |
155 | (lambda (c) (string escape-char c)))) | |
156 | ||
157 | (define* (center-string str #:optional (width 80) (chr #\space) (rchr #f)) | |
158 | "Returns a copy of @var{str} centered in a field of @var{width} | |
159 | characters. Any needed padding is done by character @var{chr}, which | |
160 | defaults to @samp{#\\space}. If @var{rchr} is provided, then the | |
161 | padding to the right will use it instead. See the examples below. | |
162 | left and @var{rchr} on the right. The default @var{width} is 80. The | |
91a214eb | 163 | default @var{chr} and @var{rchr} is @samp{#\\space}. The string is |
47f3ce52 AW |
164 | never truncated. |
165 | @lisp | |
166 | (center-string \"Richard Todd\" 24) | |
167 | => \" Richard Todd \" | |
168 | ||
169 | (center-string \" Richard Todd \" 24 #\\=) | |
170 | => \"===== Richard Todd =====\" | |
171 | ||
172 | (center-string \" Richard Todd \" 24 #\\< #\\>) | |
173 | => \"<<<<< Richard Todd >>>>>\" | |
174 | @end lisp" | |
175 | (let* ((len (string-length str)) | |
176 | (lpad (make-string (max (quotient (- width len) 2) 0) chr)) | |
177 | ;; right-char == char unless it has been provided by the user | |
178 | (right-chr (or rchr chr)) | |
179 | (rpad (if (char=? right-chr chr) | |
180 | lpad | |
181 | (make-string (max (quotient (- width len) 2) 0) right-chr)))) | |
182 | (if (>= len width) | |
183 | str | |
184 | (string-append lpad str rpad (if (odd? (- width len)) (string right-chr) ""))))) | |
185 | ||
186 | (define* (left-justify-string str #:optional (width 80) (chr #\space)) | |
187 | "@code{left-justify-string str [width chr]}. | |
188 | Returns a copy of @var{str} padded with @var{chr} such that it is left | |
189 | justified in a field of @var{width} characters. The default | |
190 | @var{width} is 80. Unlike @samp{string-pad} from srfi-13, the string | |
191 | is never truncated." | |
192 | (let* ((len (string-length str)) | |
193 | (pad (make-string (max (- width len) 0) chr))) | |
194 | (if (>= len width) | |
195 | str | |
196 | (string-append str pad)))) | |
197 | ||
198 | (define* (right-justify-string str #:optional (width 80) (chr #\space)) | |
199 | "Returns a copy of @var{str} padded with @var{chr} such that it is | |
200 | right justified in a field of @var{width} characters. The default | |
201 | @var{width} is 80. The default @var{chr} is @samp{#\\space}. Unlike | |
202 | @samp{string-pad} from srfi-13, the string is never truncated." | |
203 | (let* ((len (string-length str)) | |
204 | (pad (make-string (max (- width len) 0) chr))) | |
205 | (if (>= len width) | |
206 | str | |
207 | (string-append pad str)))) | |
208 | ||
209 | (define* (collapse-repeated-chars str #:optional (chr #\space) (num 1)) | |
210 | "Returns a copy of @var{str} with all repeated instances of | |
211 | @var{chr} collapsed down to at most @var{num} instances. | |
212 | The default value for @var{chr} is @samp{#\\space}, and | |
213 | the default value for @var{num} is 1. | |
214 | ||
215 | @lisp | |
216 | (collapse-repeated-chars \"H e l l o\") | |
217 | => \"H e l l o\" | |
218 | (collapse-repeated-chars \"H--e--l--l--o\" #\\-) | |
219 | => \"H-e-l-l-o\" | |
220 | (collapse-repeated-chars \"H-e--l---l----o\" #\\- 2) | |
221 | => \"H-e--l--l--o\" | |
222 | @end lisp" | |
223 | ;; define repeat-locator as a stateful match? function which remembers | |
224 | ;; the last character it had seen. | |
225 | (let ((repeat-locator | |
226 | ;; initialize prev-chr to something other than what we're seeking... | |
227 | (let ((prev-chr (if (char=? chr #\space) #\A #\space)) | |
228 | (match-count 0)) | |
229 | (lambda (c) | |
230 | (if (and (char=? c prev-chr) | |
231 | (char=? prev-chr chr)) | |
232 | ;; found enough duplicates if the match-count is high enough | |
233 | (begin | |
234 | (set! match-count (+ 1 match-count)) | |
235 | (>= match-count num)) | |
236 | ||
237 | ;; did not find a duplicate | |
238 | (begin (set! match-count 0) | |
239 | (set! prev-chr c) | |
240 | #f)))))) | |
241 | ||
242 | ;; transform the string with our stateful matcher... | |
243 | ;; deleting matches... | |
244 | (transform-string str repeat-locator ""))) | |
245 | ||
246 | ;; split a text string into segments that have the form... | |
247 | ;; <ws non-ws> <ws non-ws> etc.. | |
248 | (define (split-by-single-words str) | |
249 | (let ((non-wschars (char-set-complement char-set:whitespace))) | |
250 | (let loop ((ans '()) | |
251 | (index 0)) | |
252 | (let ((next-non-ws (string-index str non-wschars index))) | |
253 | (if next-non-ws | |
254 | ;; found non-ws...look for ws following... | |
255 | (let ((next-ws (string-index str char-set:whitespace next-non-ws))) | |
256 | (if next-ws | |
257 | ;; found the ws following... | |
258 | (loop (cons (substring str index next-ws) ans) | |
259 | next-ws) | |
260 | ;; did not find ws...must be the end... | |
261 | (reverse (cons (substring str index) ans)))) | |
262 | ;; did not find non-ws... only ws at end of the string... | |
263 | (reverse ans)))))) | |
264 | ||
43c2a483 LC |
265 | (define (end-of-sentence? str) |
266 | "Return #t when STR likely denotes the end of sentence." | |
267 | (let ((len (string-length str))) | |
268 | (and (> len 1) | |
269 | (eqv? #\. (string-ref str (- len 1))) | |
270 | (not (eqv? #\. (string-ref str (- len 2))))))) | |
271 | ||
47f3ce52 AW |
272 | (define* (make-text-wrapper #:key |
273 | (line-width 80) | |
274 | (expand-tabs? #t) | |
275 | (tab-width 8) | |
276 | (collapse-whitespace? #t) | |
277 | (subsequent-indent "") | |
278 | (initial-indent "") | |
279 | (break-long-words? #t)) | |
280 | "Returns a procedure that will split a string into lines according to the | |
281 | given parameters. | |
282 | ||
283 | @table @code | |
284 | @item #:line-width | |
285 | This is the target length used when deciding where to wrap lines. | |
286 | Default is 80. | |
287 | ||
288 | @item #:expand-tabs? | |
289 | Boolean describing whether tabs in the input should be expanded. Default | |
290 | is #t. | |
291 | ||
292 | @item #:tab-width | |
293 | If tabs are expanded, this will be the number of spaces to which they | |
294 | expand. Default is 8. | |
295 | ||
296 | @item #:collapse-whitespace? | |
297 | Boolean describing whether the whitespace inside the existing text | |
298 | should be removed or not. Default is #t. | |
299 | ||
300 | If text is already well-formatted, and is just being wrapped to fit in a | |
301 | different width, then set this to @samp{#f}. This way, many common text | |
302 | conventions (such as two spaces between sentences) can be preserved if | |
303 | in the original text. If the input text spacing cannot be trusted, then | |
304 | leave this setting at the default, and all repeated whitespace will be | |
305 | collapsed down to a single space. | |
306 | ||
307 | @item #:initial-indent | |
308 | Defines a string that will be put in front of the first line of wrapped | |
309 | text. Default is the empty string, ``''. | |
310 | ||
311 | @item #:subsequent-indent | |
312 | Defines a string that will be put in front of all lines of wrapped | |
313 | text, except the first one. Default is the empty string, ``''. | |
314 | ||
315 | @item #:break-long-words? | |
316 | If a single word is too big to fit on a line, this setting tells the | |
317 | wrapper what to do. Defaults to #t, which will break up long words. | |
318 | When set to #f, the line will be allowed, even though it is longer | |
319 | than the defined @code{#:line-width}. | |
320 | @end table | |
321 | ||
322 | The return value is a procedure of one argument, the input string, which | |
323 | returns a list of strings, where each element of the list is one line." | |
324 | (lambda (str) | |
325 | ;; replace newlines with spaces | |
326 | (set! str (transform-string str (lambda (c) (char=? c #\nl)) #\space)) | |
327 | ||
328 | ;; expand tabs if they wanted us to... | |
329 | (if expand-tabs? | |
330 | (set! str (expand-tabs str tab-width))) | |
331 | ||
332 | ;; collapse whitespace if they wanted us to... | |
333 | (if collapse-whitespace? | |
334 | (set! str (collapse-repeated-chars str))) | |
335 | ||
336 | ;; drop any whitespace from the front... | |
337 | (set! str (string-trim str)) | |
338 | ||
339 | ;; now start breaking the text into lines... | |
340 | (let loop ((ans '()) | |
341 | (words (split-by-single-words str)) | |
342 | (line initial-indent) | |
343 | (count 0)) | |
344 | (if (null? words) | |
345 | ;; out of words? ...done! | |
346 | (reverse (if (> count 0) | |
347 | (cons line ans) | |
348 | ans)) | |
349 | ||
350 | ;; not out of words...keep going... | |
351 | (let ((length-left (- line-width | |
352 | (string-length line))) | |
353 | (next-word (if (= count 0) | |
354 | (string-trim (car words)) | |
355 | (car words)))) | |
356 | (cond | |
357 | ;; does the next entry fit? | |
358 | ((<= (string-length next-word) | |
359 | length-left) | |
360 | (loop ans | |
361 | (cdr words) | |
43c2a483 LC |
362 | (if (and collapse-whitespace? |
363 | (end-of-sentence? line)) | |
364 | ;; Add an extra space after the period. | |
365 | (string-append line " " next-word) | |
366 | (string-append line next-word)) | |
47f3ce52 AW |
367 | (+ count 1))) |
368 | ||
369 | ;; ok, it didn't fit...is there already at least one word on the line? | |
370 | ((> count 0) | |
371 | ;; try to use it for the next line, then... | |
372 | (loop (cons line ans) | |
373 | words | |
374 | subsequent-indent | |
375 | 0)) | |
376 | ||
377 | ;; ok, it didn't fit...and it's the first word. | |
378 | ;; were we told to break up long words? | |
379 | (break-long-words? | |
380 | ;; break the like at the limit, since the user wants us to... | |
381 | (loop (cons (string-append line (substring next-word 0 length-left)) | |
382 | ans) | |
383 | (cons (substring next-word length-left) | |
384 | (cdr words)) | |
385 | subsequent-indent | |
386 | 0)) | |
387 | ||
388 | ;; well, then is it the first word and we *shouldn't* break long words, then... | |
389 | (else | |
390 | (loop (cons (string-append line next-word) | |
391 | ans) | |
392 | (cdr words) | |
393 | subsequent-indent | |
394 | 0)))))))) | |
395 | ||
396 | (define (string->wrapped-lines str . kwargs) | |
397 | "@code{string->wrapped-lines str keywds ...}. Wraps the text given in | |
398 | string @var{str} according to the parameters provided in @var{keywds}, | |
399 | or the default setting if they are not given. Returns a list of strings | |
400 | representing the formatted lines. Valid keyword arguments are discussed | |
401 | in @code{make-text-wrapper}." | |
402 | ((apply make-text-wrapper kwargs) str)) | |
403 | ||
404 | (define (fill-string str . kwargs) | |
405 | "Wraps the text given in string @var{str} according to the parameters | |
91a214eb | 406 | provided in @var{kwargs}, or the default setting if they are not |
47f3ce52 AW |
407 | given. Returns a single string with the wrapped text. Valid keyword |
408 | arguments are discussed in @code{make-text-wrapper}." | |
409 | (string-join (apply string->wrapped-lines str kwargs) | |
410 | "\n" | |
411 | 'infix)) |