gnu: sbcl-cl-cffi-gtk: Update to 20200417.
[jackhill/guix/guix.git] / gnu / packages / textutils.scm
CommitLineData
7939e70a
TUBK
1;;; GNU Guix --- Functional package management for GNU
2;;; Copyright © 2015 Taylan Ulrich Bayırlı/Kammer <taylanbayirli@gmail.com>
0ad0ecee 3;;; Copyright © 2015, 2016, 2017, 2018, 2019 Ricardo Wurmus <rekado@elephly.net>
a64a8c46 4;;; Copyright © 2015, 2016 Ben Woodcroft <donttrustben@gmail.com>
e522d840 5;;; Copyright © 2015 Roel Janssen <roel@gnu.org>
2d8cf0b3 6;;; Copyright © 2016 Jelle Licht <jlicht@fsfe.org>
f17a5447 7;;; Copyright © 2016 Alex Griffin <a@ajgrf.com>
30024a1e 8;;; Copyright © 2016, 2018, 2019 Efraim Flashner <efraim@flashner.co.il>
47956fa0 9;;; Copyright © 2016 ng0 <ng0@n0.is>
0905048a 10;;; Copyright © 2016 Marius Bakke <mbakke@fastmail.com>
5353cea0 11;;; Copyright © 2017 Eric Bavier <bavier@member.fsf.org>
7fdca77e 12;;; Copyright © 2017 Rene Saavedra <rennes@openmailbox.org>
7577ab55 13;;; Copyright © 2017,2019 Hartmut Goebel <h.goebel@crazy-compilers.com>
3c8ba11a 14;;; Copyright © 2017 Kei Kebreau <kkebreau@posteo.net>
bfcdf887 15;;; Copyright © 2017 Alex Vong <alexvong1995@gmail.com>
9a71213e 16;;; Copyright © 2018, 2019, 2020 Tobias Geerinckx-Rice <me@tobias.gr>
4715f92e 17;;; Copyright © 2018 Pierre Neidhardt <mail@ambrevar.xyz>
1a0363cf 18;;; Copyright © 2018 Meiyo Peng <meiyo.peng@gmail.com>
933ac939 19;;; Copyright © 2019 Yoshinori Arai <kumagusu08@gmail.com>
f9488b70 20;;; Copyright © 2019 Mădălin Ionel Patrașcu <madalinionel.patrascu@mdc-berlin.de>
24719e8a 21;;; Copyright © 2019 Wiktor Żelazny <wzelazny@vurv.cz>
7939e70a
TUBK
22;;;
23;;; This file is part of GNU Guix.
24;;;
25;;; GNU Guix is free software; you can redistribute it and/or modify it
26;;; under the terms of the GNU General Public License as published by
27;;; the Free Software Foundation; either version 3 of the License, or (at
28;;; your option) any later version.
29;;;
30;;; GNU Guix is distributed in the hope that it will be useful, but
31;;; WITHOUT ANY WARRANTY; without even the implied warranty of
32;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33;;; GNU General Public License for more details.
34;;;
35;;; You should have received a copy of the GNU General Public License
36;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
37
38(define-module (gnu packages textutils)
39 #:use-module ((guix licenses) #:prefix license:)
40 #:use-module (guix packages)
41 #:use-module (guix download)
42 #:use-module (guix git-download)
ff3f6766 43 #:use-module (guix build-system ant)
7939e70a 44 #:use-module (guix build-system gnu)
03f801aa 45 #:use-module (guix build-system go)
c8d969b5 46 #:use-module (guix build-system cmake)
03639d03 47 #:use-module (guix build-system python)
9116f126 48 #:use-module (gnu packages)
f571e1c3 49 #:use-module (gnu packages autotools)
a6baae74 50 #:use-module (gnu packages base)
148585c2 51 #:use-module (gnu packages compression)
f9488b70 52 #:use-module (gnu packages gcc)
2f6e988d 53 #:use-module (gnu packages gettext)
ff3f6766 54 #:use-module (gnu packages java)
8888fe82 55 #:use-module (gnu packages ncurses)
56 #:use-module (gnu packages perl)
57 #:use-module (gnu packages pkg-config)
f17a5447 58 #:use-module (gnu packages python)
44d10b1f 59 #:use-module (gnu packages python-xyz)
8888fe82 60 #:use-module (gnu packages readline)
1506d491
EF
61 #:use-module (gnu packages slang)
62 #:use-module (gnu packages web))
7939e70a 63
2f6e988d
KK
(define-public dos2unix
  ;; Line-break converter.  It is built with plain `make': the 'configure
  ;; phase is removed and the installation prefix is handed over as a make
  ;; flag instead.
  (package
    (name "dos2unix")
    (version "7.4.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://waterlan.home.xs4all.nl/dos2unix/"
                           "dos2unix-" version ".tar.gz"))
       (sha256
        (base32 "08w6yywzirsxq8bh87jycvvw922ybhc2l426j2iqzliyn1h8mm8w"))))
    (home-page "https://waterlan.home.xs4all.nl/dos2unix.html")
    (synopsis "DOS/Mac to Unix and vice versa text file format converter")
    (description
     "dos2unix is a tool to convert line breaks in a text file from Unix format
to DOS format and vice versa.")
    (license license:bsd-2)
    (build-system gnu-build-system)
    (native-inputs
     (list (list "gettext" gettext-minimal)
           (list "perl" perl)))
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         ;; There is no configure script to run.
         (delete 'configure))
       #:make-flags
       (let ((out (assoc-ref %outputs "out")))
         (list "CC=gcc"
               (string-append "prefix=" out)))))))
91
7939e70a
TUBK
(define-public recode
  ;; Character set conversion library together with its `recode' front-end,
  ;; built from the upstream release tarball.
  (package
    (name "recode")
    (version "3.7.6")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/rrthomas/recode/releases/"
                           "download/v" version "/recode-" version ".tar.gz"))
       (sha256
        (base32 "0m59sd1ca0zw1aydpc3m8sw03nc885knmccqryg7byzmqs585ia6"))))
    (build-system gnu-build-system)
    ;; Both are needed at build time only.
    (native-inputs
     `(("python" ,python)
       ("python-cython" ,python-cython)))
    (home-page "https://github.com/rrthomas/recode")
    (synopsis "Text encoding converter")
    ;; The user-visible text below used to read "When exact transliteration
    ;; are not possible"; the verb now agrees with its subject.
    (description "The Recode library converts files between character sets and
usages. It recognises or produces over 200 different character sets (or about
300 if combined with an iconv library) and transliterates files between almost
any pair. When exact transliteration is not possible, it gets rid of
offending characters or falls back on approximations. The recode program is a
handy front-end to the library.")
    (license license:gpl3+)))
688fe865
TUBK
116
(define-public enca
  ;; Charset detection library (libenca) and its command-line front-end,
  ;; fetched from the upstream Git tag named after the version.
  (package
    (name "enca")
    (version "1.19")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/nijel/enca")
             (commit version)))
       (file-name (git-file-name name version))
       (sha256
        (base32 "19q7cwwxmmk5j9438bsqdpjvdjawsd3zmw1zyqgi7s4m0rasr3ah"))))
    (home-page "https://github.com/nijel/enca")
    (synopsis "Text encoding detection tool")
    (description "Enca (Extremely Naive Charset Analyser) consists of libenca,
an encoding detection library, and enca, a command line frontend, integrating
libenca and several charset conversion libraries and tools.")
    (license license:gpl2)
    (build-system gnu-build-system)
    ;; The recode input stays disabled: the enca-1.19 tests fail with recent
    ;; recode.
    ;(inputs `(("recode" ,recode)))
    ))
cd15ad82
RW
139
(define-public utf8proc
  ;; C library for UTF-8 processing.  The test suite wants two Unicode data
  ;; files that upstream would fetch with curl; they are provided as
  ;; fixed-output origins and copied into place before 'check.
  (package
    (name "utf8proc")
    (version "2.4.0")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/JuliaStrings/utf8proc")
             (commit (string-append "v" version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "1i42hqwc8znqii9brangwkxk5cyc2lk95ip405fg88zr7z2ncr34"))))
    (build-system gnu-build-system)
    (native-inputs                 ;test data that is otherwise downloaded with curl
     `(("NormalizationTest.txt"
        ,(origin
           (method url-fetch)
           (uri (string-append "https://www.unicode.org/Public/12.1.0/ucd/"
                               "NormalizationTest.txt"))
           (sha256
            (base32 "0hb97k9xv1lr847hwz0719ksqy39s47xw6k01dgs1368jdibvawc"))))
       ("GraphemeBreakTest.txt"
        ,(origin
           (method url-fetch)
           (uri (string-append "https://www.unicode.org/Public/12.1.0/ucd/"
                               "auxiliary/GraphemeBreakTest.txt"))
           (sha256
            (base32 "0qc90ppmrwfn3y9cdn8jcjrn7qpdf0fhxkwh945yp4rvh37mbgcm"))))

       ;; For tests.
       ("perl" ,perl)))
    (arguments
     '(#:make-flags (list "CC=gcc"
                          (string-append "prefix=" (assoc-ref %outputs "out")))
       #:phases
       (modify-phases %standard-phases
         ;; There is no configure script.
         (delete 'configure)
         (add-before 'check 'check-data
           (lambda* (#:key inputs #:allow-other-keys)
             ;; Each input label doubles as the file name expected under
             ;; "data/".
             (for-each (lambda (i)
                         (copy-file (assoc-ref inputs i)
                                    (string-append "data/" i)))
                       '("NormalizationTest.txt" "GraphemeBreakTest.txt"))
             ;; Rewrite the "÷" and "×" markers of the grapheme break test
             ;; data as "/" and "+".
             (substitute* "data/GraphemeBreakTest.txt"
               (("÷") "/")
               (("×") "+"))
             #t)))))
    (home-page "https://juliastrings.github.io/utf8proc/")
    (synopsis "C library for processing UTF-8 Unicode data")
    ;; The advertised Unicode version follows the 12.1.0 test data used above;
    ;; the text previously still said 9.0.0.
    (description "utf8proc is a small C library that provides Unicode
normalization, case-folding, and other operations for data in the UTF-8
encoding, supporting Unicode version 12.1.0.")
    (license license:expat)))
f571e1c3 194
205df739
TGR
(define-public libconfuse
  ;; Configuration file parser library; a stock GNU build of the release
  ;; tarball with no extra inputs or arguments.
  (package
    (name "libconfuse")
    (version "3.2.2")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/martinh/libconfuse/releases/"
                           "download/v" version "/confuse-" version
                           ".tar.xz"))
       (sha256
        (base32 "02r1mmzik2m0iigbc2da3y754vj24i18r3ml5p2wzs027mjhn959"))))
    (home-page "https://github.com/martinh/libconfuse")
    (synopsis "Configuration file parser library")
    (description "libconfuse is a configuration file parser library. It
supports sections and (lists of) values (strings, integers, floats, booleans
or other sections), as well as some other features (such as
single/double-quoted strings, environment variable expansion, functions and
nested include statements).")
    (license license:isc)
    (build-system gnu-build-system)))
216
f571e1c3
RW
(define-public libgtextutils
  ;; Text utilities library.  The build system is regenerated with the
  ;; "reconf" script from the source tree, and the package is compiled with
  ;; GCC 5.
  (package
    (name "libgtextutils")
    (version "0.7")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/agordon/libgtextutils/"
                           "releases/download/" version
                           "/libgtextutils-" version ".tar.gz"))
       (sha256
        (base32 "0jiybkb2z58wa2msvllnphr4js2hvjvh988pavb3mzkgr6ihwbkr"))))
    (home-page "https://github.com/agordon/libgtextutils")
    (synopsis "Gordon's text utils library")
    (description
     "libgtextutils is a text utilities library used by the fastx toolkit from
the Hannon Lab.")
    (license license:agpl3+)
    (build-system gnu-build-system)
    (native-inputs
     `(("autoconf" ,autoconf)
       ("automake" ,automake)
       ;; Later GCC versions fail to build this package.
       ("gcc" ,gcc-5)
       ("libtool" ,libtool)))
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         ;; Run the "reconf" script in place of the default bootstrap phase.
         (replace 'bootstrap
           (lambda _
             (invoke "sh" "reconf"))))))))
aae2b445
BW
246
(define-public cityhash
  ;; Packaged from a pinned Git commit; the abbreviated commit ID is part of
  ;; the version string.
  (let ((commit "8af9b8c"))
    (package
      (name "cityhash")
      (version (string-append "1.1-2." commit))
      (source (origin
                (method git-fetch)
                (uri (git-reference
                      (url "https://github.com/google/cityhash.git")
                      (commit commit)))
                ;; 'git-fetch' produces a checkout directory, not a tarball,
                ;; so name it with 'git-file-name' like the other Git origins
                ;; in this file instead of giving it a ".tar.gz" suffix.
                (file-name (git-file-name name version))
                (sha256
                 (base32
                  "0n6skf5dv8yfl1ckax8dqhvsbslkwc9158zf2ims0xqdvzsahbi6"))))
      (build-system gnu-build-system)
      (arguments
       '(#:make-flags (list "CXXFLAGS=-g -O3")
         #:phases
         (modify-phases %standard-phases
           ;; citycrc is not installed by default but is used by some
           ;; programs.
           (add-after 'install 'install-citycrc
             (lambda* (#:key outputs #:allow-other-keys)
               (let* ((out (assoc-ref outputs "out"))
                      (include (string-append out "/include")))
                 (install-file "src/citycrc.h" include))
               #t)))))
      (home-page "https://github.com/google/cityhash")
      (synopsis "C++ hash functions for strings")
      (description
       "CityHash provides hash functions for strings. The functions mix the
input bits thoroughly but are not suitable for cryptography.")
      (license license:expat))))
e522d840 280
9128db21
RW
(define-public ustr
  ;; Low-overhead C string library.  Everything is driven through make
  ;; flags because the package has no configure script.
  (package
    (name "ustr")
    (version "1.0.4")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://www.and.org/ustr/" version
                           "/ustr-" version ".tar.bz2"))
       (sha256
        (base32 "1i623ygdj7rkizj7985q9d6vj5amwg686aqb5j3ixpkqkyp6xbrx"))
       (patches (search-patches "ustr-fix-build-with-gcc-5.patch"))))
    (home-page "http://www.and.org/ustr/")
    (synopsis "String library with very low memory overhead")
    (description
     "Ustr is a string library for C with very low memory overhead.")
    ;; The home page states: "The License for the code is MIT, new-BSD, LGPL,
    ;; etc. ... if you need another license to help compatibility, just ask
    ;; for it.  It's basically public domain, without all the legal problems
    ;; for everyone that trying to make something public domain entails."
    (license license:public-domain)
    (build-system gnu-build-system)
    (arguments
     '(#:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'disable-check-for-stdint
           (lambda _
             ;; stdint.h is available, only not under /usr/include, so turn
             ;; the file test into one that always succeeds.
             (substitute* '("Makefile" "ustr-import.in")
               (("-f \"/usr/include/stdint.h\"") "-z \"\""))
             #t))
         ;; There is no configure script.
         (delete 'configure))
       #:make-flags
       (list "CC=gcc"
             "HIDE="
             ;; ldconfig is not needed: substitute "echo" for
             ;; "/sbin/ldconfig".
             "LDCONFIG=echo"
             (string-append "prefix=" (assoc-ref %outputs "out"))
             "all-shared")))))
324
7577ab55
HG
(define-public ascii2binary
  ;; Provides the 'ascii2binary' and 'binary2ascii' converters described
  ;; below; a stock GNU build that needs gettext at build time.
  (package
    (name "ascii2binary")
    (version "2.14")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://billposer.org/Software/Downloads/"
                           "ascii2binary-" version ".tar.bz2"))
       (sha256
        (base32 "0dc9fxcdmppbs9s06jvq61zbk552laxps0xyk098gj41697ihd96"))))
    (build-system gnu-build-system)
    (native-inputs
     `(("gettext" ,gettext-minimal)))
    (home-page "https://billposer.org/Software/a2b.html")
    (synopsis "Convert between ASCII, hexadecimal and binary representations")
    ;; Wording fix: floating point numbers are written out "only in decimal"
    ;; (the preposition was missing).
    (description "The two programs are useful for generating test data, for
inspecting binary files, and for interfacing programs that generate textual
output to programs that require binary input and conversely. They can also be
useful when it is desired to reformat numbers.

@itemize

@item @command{ascii2binary} reads input consisting of ascii or hexadecimal
 representation numbers separated by whitespace and produces as output
 the binary equivalents. The type and precision of the binary output
 is selected using command line flags.

@item @command{binary2ascii} reads input consisting of binary numbers
 and converts them to their ascii or hexadecimal representation.
 Command line flags specify the type and size of the binary numbers
 and provide control over the format of the output.
 Unsigned integers may be written out in binary, octal, decimal,
 or hexadecimal.

 Signed integers may be written out only in binary or decimal. Floating
 point numbers may be written out only in decimal, either in standard or
 scientific notation. (If you want to examine the binary representation
 of floating point numbers, just treat the input as a sequence of unsigned
 characters.)

@end itemize")
    (license license:gpl3)))
368
(define-public uniutils
  ;; Unicode inspection tools.  The 'utf8lookup' script calls three helper
  ;; programs by bare name; the 'fix-paths phase rewrites those calls to
  ;; absolute file names.
  (package
    (name "uniutils")
    (version "2.27")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://billposer.org/Software/Downloads/"
                           "uniutils-" version ".tar.bz2"))
       (sha256
        (base32 "19w1510w87gx7n4qy3zsb0m467a4rn5scvh4ajajg7jh6x5xri08"))))
    (build-system gnu-build-system)
    (arguments
     '(#:configure-flags '("--disable-dependency-tracking")
       #:phases
       (modify-phases %standard-phases
         (add-after 'build 'fix-paths
           (lambda* (#:key outputs inputs #:allow-other-keys)
             (let ((out (assoc-ref outputs "out"))
                   (a2b (assoc-ref inputs "ascii2binary"))
                   (iconv (assoc-ref inputs "libiconv")))
               ;; Only commands at the very start of a line are rewritten;
               ;; 'uniname' is the one installed by this package itself.
               (substitute* "utf8lookup"
                 (("^ascii2binary ") (string-append a2b "/bin/ascii2binary "))
                 (("^uniname ") (string-append out "/bin/uniname "))
                 (("^iconv ") (string-append iconv "/bin/iconv ")))
               #t))))))
    (inputs
     `(("ascii2binary" ,ascii2binary)
       ("libiconv" ,libiconv)))
    (home-page "https://billposer.org/Software/unidesc.html")
    (synopsis "Find out what is in a Unicode file")
    ;; Wording fix: "unireverse reverses" (the verb was missing its "s").
    (description "Useful tools when working with Unicode files when one
doesn't know the writing system, doesn't have the necessary font, needs to
inspect invisible characters, needs to find out whether characters have been
combined or in what order they occur, or needs statistics on which characters
occur.

@itemize

@item @command{uniname} defaults to printing the character offset of each
character, its byte offset, its hex code value, its encoding, the glyph
itself, and its name. It may also be used to validate UTF-8 input.

@item @command{unidesc} reports the character ranges to which different
portions of the text belong. It can also be used to identify Unicode encodings
(e.g. UTF-16be) flagged by magic numbers.

@item @command{unihist} generates a histogram of the characters in its input.

@item @command{ExplicateUTF8} is intended for debugging or for learning about
Unicode. It determines and explains the validity of a sequence of bytes as a
UTF8 encoding.

@item @command{utf8lookup} provides a handy way to look up Unicode characters
from the command line.

@item @command{unireverse} reverses each line of UTF-8 input
character-by-character.

@end itemize")
    (license license:gpl3)))
430
e522d840
RJ
(define-public libconfig
  ;; Structured configuration file library.  HOME-PAGE is declared before
  ;; SOURCE on purpose: the download URI is derived from it.
  (package
    (name "libconfig")
    (version "1.7.2")
    (home-page "https://hyperrealm.github.io/libconfig/")
    (source (origin
              (method url-fetch)
              ;; HOME-PAGE already ends in "/", so do not add another slash;
              ;; the URI used to contain "libconfig//dist/".
              (uri (string-append home-page "dist/libconfig-"
                                  version ".tar.gz"))
              (sha256
               (base32
                "1ngs2qx3cx5cbwinc5mvadly0b5n7s86zsc68c404czzfff7lg3w"))))
    (build-system gnu-build-system)
    (synopsis "C/C++ configuration file library")
    (description
     "Libconfig is a simple library for manipulating structured configuration
files. This file format is more compact and more readable than XML. And
unlike XML, it is type-aware, so it is not necessary to do string parsing in
application code.")
    (license license:lgpl2.1+)))
c8d969b5
PP
451
(define-public pfff
  ;; File fingerprinting tool, built with CMake from the upstream Git tag
  ;; "v" + VERSION.
  (package
    (name "pfff")
    (version "1.0")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/pfff/pfff")
             (commit (string-append "v" version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "1nxkfm7zliq3rmr7yp871sppwfnz71iz364m2sgazny71pzykggc"))))
    (home-page "https://biit.cs.ut.ee/pfff/")
    (synopsis "Probabilistic fast file fingerprinting tool")
    (description
     "pfff is a tool for calculating a compact digital fingerprint of a file
by sampling randomly from the file instead of reading it in full.
Consequently, the computation has a flat performance characteristic,
correlated with data variation rather than file size. pfff can be as reliable
as existing hashing techniques, with provably negligible risk of collisions.")
    (license license:bsd-3)
    (build-system cmake-build-system)))
2d8cf0b3
JL
475
(define-public oniguruma
  ;; Regular expression library; a stock GNU build of the "onig" release
  ;; tarball.
  (package
    (name "oniguruma")
    (version "6.9.4")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/kkos/oniguruma/releases/download/v"
             version "/onig-" version ".tar.gz"))
       (sha256
        (base32 "0lvd1rpp49i0k1icblb0i76lj2cwmhf1c5p1jdz2m6g0ywpx4sa6"))))
    (home-page "https://github.com/kkos/oniguruma")
    (synopsis "Regular expression library")
    (description "Oniguruma is a regular expressions library. The special
characteristic of this library is that different character encoding for every
regular expression object can be specified.")
    (license license:bsd-2)
    (build-system gnu-build-system)))
1b90e57e
RW
495
(define-public antiword
  ;; Microsoft Word document reader.  It is built from "Makefile.Linux";
  ;; the 'configure and 'install phases are replaced so that everything
  ;; lands under, and is looked up in, the package's own output.
  (package
    (name "antiword")
    (version "0.37")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://www.winfield.demon.nl/linux/antiword-"
                           version ".tar.gz"))
       (sha256
        (base32 "1b7mi1l20jhj09kyh0bq14qzz8vdhhyf35gzwsq43mn6rc7h0b4f"))
       (patches (search-patches "antiword-CVE-2014-8123.patch"))))
    (home-page "http://www.winfield.demon.nl/")
    (synopsis "Microsoft Word document reader")
    (description "Antiword is an application for displaying Microsoft Word
documents. It can also convert the document to PostScript or XML. Only
documents made by MS Word version 2 and version 6 or later are supported. The
name comes from: \"The antidote against people who send Microsoft Word files
to everybody, because they believe that everybody runs Windows and therefore
runs Word\".")
    (license license:gpl2+)
    (build-system gnu-build-system)
    (arguments
     '(#:tests? #f                      ; There are no tests
       #:make-flags
       (let ((out (assoc-ref %outputs "out")))
         (list "-f" "Makefile.Linux"
               (string-append "GLOBAL_INSTALL_DIR=" out "/bin")
               (string-append "GLOBAL_RESOURCES_DIR=" out "/share/antiword")))
       #:phases
       (modify-phases %standard-phases
         (replace 'configure
           (lambda* (#:key outputs #:allow-other-keys)
             ;; Point the hard-coded resource directory at the package's own
             ;; data directory so that the mapping files can be found.
             (let ((resources (string-append (assoc-ref outputs "out")
                                             "/share/antiword")))
               (substitute* "antiword.h"
                 (("/usr/share/antiword") resources)))
             #t))
         (replace 'install
           (lambda* (#:key make-flags #:allow-other-keys)
             ;; Use the "global_install" target, passing along the same make
             ;; flags as the build.
             (apply invoke "make" "global_install" make-flags))))))))
f17a5447 539
d0abaf89
HG
(define-public catdoc
  ;; MS-Word text extractor, built without the Tk-based 'wordview' viewer.
  (package
    (name "catdoc")
    (version "0.95")
    (source (origin
              (method url-fetch)
              (uri (string-append "http://ftp.wagner.pp.ru/pub/catdoc/"
                                  "catdoc-" version ".tar.gz"))
              (patches (search-patches "catdoc-CVE-2017-11110.patch"))
              (sha256
               (base32
                "15h7v3bmwfk4z8r78xs5ih6vd0pskn0rj90xghvbzdjj0cc88jji"))))
    (build-system gnu-build-system)
    ;; TODO: Also build `wordview` which requires `tk` – make a separate
    ;; package for this.
    (arguments
     '(#:tests? #f                      ; There are no tests
       #:configure-flags '("--disable-wordview")
       #:phases
       (modify-phases %standard-phases
         (add-before 'install 'fix-install
           (lambda* (#:key outputs #:allow-other-keys)
             ;; Create the man page directory ahead of 'install
             ;; (presumably the install target does not create it itself --
             ;; TODO confirm).
             (let ((out (assoc-ref outputs "out")))
               (mkdir-p (string-append out "/share/man/man1"))
               ;; Signal success explicitly, like the other custom phases in
               ;; this file, rather than relying on the value of 'mkdir-p'.
               #t))))))
    (home-page "https://www.wagner.pp.ru/~vitus/software/catdoc/")
    (synopsis "MS-Word to TeX or plain text converter")
    ;; Typo fix in the text below: possessive "Its goal", not "It's goal".
    (description "@command{catdoc} extracts text from MS-Word files, trying to
preserve as many special printable characters as possible. It supports
everything up to Word-97. Also supported are MS Write documents and RTF files.

@command{catdoc} does not preserve complex word formatting, but it can
translate some non-ASCII characters into TeX escape codes. Its goal is to
extract plain text and allow you to read it and, probably, reformat with TeX,
according to TeXnical rules.

This package also provides @command{xls2csv}, which extracts data from Excel
spreadsheets and outputs it in comma-separated-value format, and
@command{catppt}, which extracts data from PowerPoint presentations.")
    (license license:gpl2+)))
579
f17a5447
AG
(define-public utfcpp
  ;; C++ UTF-8 library.  CMake runs in the source tree, and the files are
  ;; installed by a hand-written phase because there is no install target.
  (package
    (name "utfcpp")
    (version "2.3.5")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/nemtrif/utfcpp")
             (commit (string-append "v" version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "1gr98d826z6wa58r1s5i7rz7q2x3r31v7zj0pjjlrc7gfxwklr4s"))))
    (home-page "https://github.com/nemtrif/utfcpp")
    (synopsis "Portable C++ library for handling UTF-8")
    (description "UTF8-CPP is a C++ library for handling UTF-8 encoded text
in a portable way.")
    (license license:boost1.0)
    (build-system cmake-build-system)
    (arguments
     `(#:out-of-source? #f
       #:phases
       (modify-phases %standard-phases
         (replace 'install              ; no install target
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((prefix (assoc-ref outputs "out")))
               ;; The contents of "source" go to include/, the README to the
               ;; package's documentation directory.
               (copy-recursively "source"
                                 (string-append prefix "/include"))
               (install-file "README.md"
                             (string-append prefix "/share/doc/" ,name))
               #t))))))))
8888fe82 611
(define-public dbacl
  ;; Bayesian text classifier.  Files whose licensing is unclear or
  ;; unwanted are removed right after unpacking (see the license comment at
  ;; the end), the test scripts are adjusted, and the build system is
  ;; regenerated with autoreconf.
  (package
    (name "dbacl")
    (version "1.14.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "mirror://sourceforge/dbacl/dbacl/" version "/"
                           "dbacl-" version ".tar.gz"))
       (sha256
        (base32 "1gas0112wqjvwn9qg3hxnawk7h3prr0w9b2h68f3p1ifd1kzn3gz"))
       (patches (search-patches "dbacl-include-locale.h.patch"))))
    (build-system gnu-build-system)
    (arguments
     `(#:make-flags
       (list
        (string-append "-I" (assoc-ref %build-inputs "slang")
                       "/include/slang")
        (string-append "-I" (assoc-ref %build-inputs "ncurses")
                       "/include/ncurses"))
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'delete-sample6-and-japanese
           (lambda _
             ;; Drop the two excluded documents together with every
             ;; reference to them and to the Japanese test.
             (substitute* "doc/Makefile.am"
               (("sample6.txt") "")
               (("japanese.txt") ""))
             (delete-file "doc/sample6.txt")
             (delete-file "doc/japanese.txt")
             (substitute* (list "src/tests/Makefile.am"
                                "src/tests/Makefile.in")
               (("dbacl-jap.shin") "")
               (("dbacl-jap.sh") ""))
             #t))
         (add-after 'unpack 'delete-test
           ;; See comments about the license.
           (lambda _
             (delete-file "src/tests/dbacl-jap.shin")
             #t))
         (add-after 'unpack 'fix-test-files
           ;; The phase needs neither INPUTS nor OUTPUTS; the unused "out"
           ;; and "bin" bindings and the one-argument 'string-append' calls
           ;; were dropped.
           (lambda _
             (substitute* (find-files "src/tests/" "\\.shin$")
               ;; Comment out the hard-coded PATH.
               (("PATH=/bin:/usr/bin")
                "#PATH=/bin:/usr/bin")
               ;; NOTE(review): these patterns are unanchored, so every
               ;; occurrence of "diff" and "tr" in the test scripts is
               ;; replaced, including inside longer words.
               (("diff") (which "diff"))
               (("tr") (which "tr")))
             #t))
         (replace 'bootstrap
           (lambda _
             (invoke "autoreconf" "-vif")
             #t)))))
    (inputs
     `(("ncurses" ,ncurses)
       ("perl" ,perl)
       ("readline" ,readline)
       ("slang" ,slang)))
    (native-inputs
     `(("libtool" ,libtool)
       ("autoconf" ,autoconf)
       ("automake" ,automake)
       ("pkg-config" ,pkg-config)))
    (home-page "https://www.lbreyer.com/dbacl.html")
    (synopsis "Bayesian text and email classifier")
    (description
     "dbacl is a fast Bayesian text and email classifier. It builds a variety
of language models using maximum entropy (minimum divergence) principles, and
these can then be used to categorize input data automatically among multiple
categories.")
    ;; The software is licensed as GPLv3 or later, but
    ;; includes various sample texts in the doc dir:
    ;; - sample1.txt, sample3.txt and sample5.txt are in the public domain,
    ;;   by Mark Twain.
    ;; - sample2.txt, sample4.txt are in the public domain, by Aristotle.
    ;; - sample6.txt is a forwarded email, copyright unknown.
    ;;   Guix does exclude sample6.txt.
    ;; - japanese.txt is a Japanese unofficial translation of the
    ;;   GNU General Public License, (c) by the Free Software Foundation.
    ;;   Guix excludes this file.
    (license (list license:gpl3+ license:public-domain))))
0905048a
MB
693
(define-public dotconf
  ;; Configuration file parser library, built from the upstream Git tag
  ;; "v" + VERSION; the autotools are needed to build from the checkout.
  (package
    (name "dotconf")
    (version "1.3")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/williamh/dotconf.git")
             (commit (string-append "v" version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "1sc95hw5k2xagpafny0v35filmcn05k1ds5ghkldfpf6xw4hakp7"))))
    (home-page "https://github.com/williamh/dotconf")
    (synopsis "Configuration file parser library")
    (description
     "C library for creating and parsing configuration files.")
    (license (list license:lgpl2.1      ; Main distribution.
                   license:asl1.1))     ; src/readdir.{c,h}
    (build-system gnu-build-system)
    ;; FIXME: maketest.sh does not work, so the tests are disabled.
    (arguments '(#:tests? #f))
    (native-inputs
     (list (list "autoconf" autoconf)
           (list "automake" automake)
           (list "libtool" libtool)))))
ff3f6766
RW
719
(define-public java-rsyntaxtextarea
  ;; Swing syntax highlighting component, built with Ant from the GitHub
  ;; archive of the release.
  (package
    (name "java-rsyntaxtextarea")
    (version "2.6.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/bobbylight/RSyntaxTextArea/archive/"
             version ".tar.gz"))
       ;; The archive is only named after the version; give it a
       ;; recognizable file name.
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "0c5mqg2klj5rvf8fhycrli8rf6s37l9p7a8knw9gpp65r1c120q2"))))
    (home-page "https://bobbylight.github.io/RSyntaxTextArea/")
    (synopsis "Syntax highlighting text component for Java Swing")
    (description "RSyntaxTextArea is a syntax highlighting, code folding text
component for Java Swing. It extends @code{JTextComponent} so it integrates
completely with the standard @code{javax.swing.text} package. It is fast and
efficient, and can be used in any application that needs to edit or view
source code.")
    (license license:bsd-3)
    (build-system ant-build-system)
    (native-inputs
     `(("java-junit" ,java-junit)
       ("java-hamcrest-core" ,java-hamcrest-core)))
    (arguments
     '(#:jar-name "rsyntaxtextarea.jar"
       ;; FIXME: some tests fail because locale resources cannot be found.
       ;; Even with them on the class path,
       ;; RSyntaxTextAreaEditorKitDumbCompleteWordActionTest fails.
       #:tests? #f))))
03639d03
RW
751
752;; We use the sources from git instead of the tarball from pypi, because the
753;; latter does not include the Cython source file from which bycython.cpp is
754;; generated.
(define-public python-editdistance
  ;; Packaged from a pinned Git commit so that the Cython source is
  ;; available; the bundled generated C++ file is deleted and regenerated
  ;; before the regular build.
  (let ((commit "3ea84a7dd3258c76aa3be851ef3d50e59c886846")
        (revision "1"))
    (package
      (name "python-editdistance")
      ;; 'git-version' produces the same "0.3.1-1.3ea84a7" string that was
      ;; previously assembled by hand with 'string-take'.
      (version (git-version "0.3.1" revision commit))
      (source
       (origin
         (method git-fetch)
         (uri (git-reference
               (url "https://github.com/aflc/editdistance.git")
               (commit commit)))
         (file-name (git-file-name name version))
         (sha256
          (base32
           "1l43svsv12crvzphrgi6x435z6xg8m086c64armp8wzb4l8ccm7g"))))
      (build-system python-build-system)
      (arguments
       `(#:phases
         (modify-phases %standard-phases
           (add-after 'unpack 'build-cython-code
             (lambda _
               (with-directory-excursion "editdistance"
                 ;; Throw away the pre-generated file and rebuild it from
                 ;; the .pyx source.
                 (delete-file "bycython.cpp")
                 (invoke "cython" "--cplus" "bycython.pyx")))))))
      (native-inputs
       `(("python-cython" ,python-cython)))
      ;; Use the canonical host name, matching the source URL, instead of
      ;; "www.github.com".
      (home-page "https://github.com/aflc/editdistance")
      (synopsis "Fast implementation of the edit distance (Levenshtein distance)")
      (description
       "This library simply implements Levenshtein distance algorithm with C++
and Cython.")
      (license license:expat))))
03f801aa
CB
788
(define-public go-github.com-mattn-go-runewidth
  ;; Packaged from a pinned Git commit.  The outer VERSION binding holds the
  ;; base version that 'git-version' combines with REVISION and COMMIT.
  (let ((commit "703b5e6b11ae25aeb2af9ebb5d5fdf8fa2575211")
        (version "0.0.4")
        (revision "1"))
    (package
      (name "go-github.com-mattn-go-runewidth")
      (version (git-version version revision commit))
      (source
       (origin
         (method git-fetch)
         (uri (git-reference
               ;; Name the repository the same way as the Go import path
               ;; below ("go-runewidth"); the URL used to say "runewidth".
               (url "https://github.com/mattn/go-runewidth")
               (commit commit)))
         (file-name (git-file-name name version))
         (sha256
          (base32
           "0znpyz71gajx3g0j2zp63nhjj2c07g16885vxv4ykwnrfmzbgk4w"))))
      (build-system go-build-system)
      (arguments
       '(#:import-path "github.com/mattn/go-runewidth"))
      (synopsis "@code{runewidth} provides Go functions to work with string widths")
      (description
       "The @code{runewidth} library provides Go functions for padding,
measuring and checking the width of strings, with support for East Asian
text.")
      (home-page "https://github.com/mattn/go-runewidth")
      (license license:expat))))
74fa77e9
PN
816
(define-public docx2txt
  (package
    (name "docx2txt")
    (version "1.4")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "mirror://sourceforge/docx2txt/docx2txt/v"
                    version "/docx2txt-" version ".tgz"))
              (sha256
               (base32
                "06vdikjvpj6qdb41d8wzfnyj44jpnknmlgbhbr1w215420lpb5xj"))))
    (build-system gnu-build-system)
    (inputs
     `(("unzip" ,unzip)
       ("perl" ,perl)))
    (arguments
     `(#:tests? #f                      ; No tests.
       #:make-flags (list (string-append "BINDIR="
                                         (assoc-ref %outputs "out") "/bin")
                          (string-append "CONFIGDIR="
                                         (assoc-ref %outputs "out") "/etc")
                          ;; Makefile seems to be a bit dumb at guessing.
                          "INSTALL=install"
                          "PERL=perl")
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         ;; Post-install clean-up: keep only the Perl script (renamed to
         ;; 'docx2txt'), point the installed configuration at the store's
         ;; 'unzip', and fix the configuration file's mode.
         (add-after 'install 'fix-install
           (lambda* (#:key outputs inputs #:allow-other-keys)
             (let* ((out (assoc-ref outputs "out"))
                    (bin (string-append out "/bin"))
                    (config (string-append out "/etc/docx2txt.config"))
                    (unzip (assoc-ref inputs "unzip")))
               ;; According to INSTALL, the .sh wrapper can be skipped.
               (delete-file (string-append bin "/docx2txt.sh"))
               (rename-file (string-append bin "/docx2txt.pl")
                            (string-append bin "/docx2txt"))
               (substitute* config
                 (("config_unzip => '/usr/bin/unzip',")
                  (string-append "config_unzip => '"
                                 unzip
                                 "/bin/unzip',")))
               ;; Makefile is wrong.
               (chmod config #o644)
               #t))))))
    (synopsis "Recover text from @file{.docx} files, with good formatting")
    (description
     "@command{docx2txt} is a Perl based command line utility to convert
Microsoft Office @file{.docx} documents to equivalent text documents. Latest
version supports following features during text extraction.

@itemize
@item Character conversions; currency characters are converted to respective
names like Euro.
@item Capitalisation of text blocks.
@item Center and right justification of text fitting in a line of
(configurable) 80 columns.
@item Horizontal ruler, line breaks, paragraphs separation, tabs.
@item Indicating hyperlinked text along with the hyperlink (configurable).
@item Handling (bullet, decimal, letter, roman) lists along with (attempt at)
indentation.
@end itemize")
    (home-page "http://docx2txt.sourceforge.net")
    (license license:gpl3+)))
1a0363cf 882
bc5aa386
VC
(define-public odt2txt
  (package
    (name "odt2txt")
    (version "0.5")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/dstosberg/odt2txt/")
             (commit (string-append "v" version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "0im3kzvhxkjlx57w6h13mc9584c74ma1dyymgvpq2y61av3gc35v"))))
    (build-system gnu-build-system)
    (inputs
     `(("zlib" ,zlib)))
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         (delete 'configure))           ; no configure script
       #:make-flags
       (let ((out (assoc-ref %outputs "out")))
         (list "CC=gcc"
               (string-append "DESTDIR=" out)))
       ;; There is no 'make check'.
       #:tests? #f))
    (home-page "https://github.com/dstosberg/odt2txt/")
    (synopsis "Converter from OpenDocument Text to plain text")
    (description "odt2txt is a command-line tool which extracts the text out
of OpenDocument Texts, as produced by OpenOffice.org, KOffice, StarOffice and
others.

odt2txt can also extract text from some file formats similar to OpenDocument
Text, such as OpenOffice.org XML (*.sxw), which was used by OpenOffice.org
version 1.x and older StarOffice versions. To a lesser extent, odt2txt may be
useful to extract content from OpenDocument spreadsheets (*.ods) and
OpenDocument presentations (*.odp).")
    (license license:gpl2)))
920
1a0363cf
MP
(define-public opencc
  (package
    (name "opencc")
    (version "1.0.5")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/BYVoid/OpenCC")
             (commit (string-append "ver." version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "1pv5md225qwhbn8ql932zdg6gh1qlx3paiajaks8gfsa07yzvhr4"))
       (modules '((guix build utils)))
       ;; Remove the bundled rapidjson copy from the source tree.
       (snippet
        '(begin
           ;; TODO: Unbundle tclap, darts-clone, gtest
           (delete-file-recursively "deps/rapidjson-0.11")
           #t))))
    (build-system cmake-build-system)
    (native-inputs
     `(("python" ,python-wrapper)
       ("rapidjson" ,rapidjson)))
    (arguments
     '(#:phases
       (modify-phases %standard-phases
         ;; Redirect the reference to the removed bundled rapidjson
         ;; directory to the "rapidjson" input.
         (add-after 'unpack 'patch-3rd-party-references
           (lambda* (#:key inputs #:allow-other-keys)
             (let ((json-root (assoc-ref inputs "rapidjson")))
               (substitute* "src/CMakeLists.txt"
                 (("../deps/rapidjson-0.11")
                  (string-append json-root "/include/rapidjson")))
               #t))))))
    (home-page "https://github.com/BYVoid/OpenCC")
    (synopsis "Convert between Traditional Chinese and Simplified Chinese")
    (description "Open Chinese Convert (OpenCC) converts between Traditional
Chinese and Simplified Chinese, supporting character-level conversion,
phrase-level conversion, variant conversion, and regional idioms among
Mainland China, Taiwan, and Hong-Kong.")
    (license license:asl2.0)))
933ac939
YA
961
(define-public nkf
  ;; The source is fetched at a pinned commit; the package version itself is
  ;; the plain release number, so no revision counter is needed.
  (let ((commit "08043eadf4abdddcf277842217e3c77a24740dc2"))
    (package
      (name "nkf")
      ;; The commits corresponding to specific versions are published
      ;; here:
      ;; https://ja.osdn.net/projects/nkf/scm/git/nkf/
      (version "2.1.5")
      (source (origin
                (method git-fetch)
                (uri (git-reference
                      (url "https://github.com/nurse/nkf.git")
                      (commit commit)))
                (file-name (git-file-name name version))
                (sha256
                 (base32
                  "0anw0knr1iy4p9w3d3b3pbwzh1c43p1i2q4c28kw9zviw8kx2rly"))))
      (build-system gnu-build-system)
      (arguments
       `(#:tests? #f                    ; test for perl module
         #:make-flags (list "CC=gcc" "CFLAGS=-O2 -Wall -pedantic"
                            (string-append "prefix=" %output)
                            "MKDIR=mkdir -p")
         #:phases
         (modify-phases %standard-phases
           (delete 'configure))))       ; No ./configure script
      (home-page "https://ja.osdn.net/projects/nkf/")
      (synopsis "Network Kanji Filter")
      (description "Nkf is yet another kanji code converter among networks,
hosts and terminals. It converts input kanji code to designated kanji code
such as ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8, UTF-16 or UTF-32.")
      (license license:zlib))))
74247b80
NG
995
(define-public python-pandocfilters
  (package
    (name "python-pandocfilters")
    (version "1.4.2")
    ;; Source archive fetched from PyPI.
    (source (origin
              (method url-fetch)
              (uri (pypi-uri "pandocfilters" version))
              (sha256
               (base32
                "1a8d9b7s48gmq9zj0pmbyv2sivn5i7m6mybgpkk4jm5vd7hp1pdk"))))
    (build-system python-build-system)
    (synopsis "Python module for writing Pandoc filters")
    (description "Pandoc is a powerful utility to transform various
input formats into a wide range of output formats. To alter the
exported output document, Pandoc allows the usage of filters, which
are pipes that read a JSON serialization of the Pandoc AST from stdin,
transform it in some way, and write it to stdout. It allows therefore
to alter the processing of Pandoc's supported input formats, for
instance one can add new syntax elements to markdown, etc.

This package provides Python bindings.")
    (home-page "https://github.com/jgm/pandocfilters")
    (license license:bsd-3)))
24719e8a
1020
(define-public aha
  (package
    (name "aha")
    (version "0.5")
    (source
     (origin
       (method git-fetch)
       ;; The version string is used directly as the commit to fetch.
       (uri (git-reference
             (url "https://github.com/theZiz/aha")
             (commit version)))
       (file-name (git-file-name name version))
       (sha256
        (base32 "0byml4rmpiaalwx69jcixl3yvpvwmwiss1jzgsqwshilb2p4qnmz"))))
    (build-system gnu-build-system)
    (arguments
     '(#:tests? #f                      ; no check target
       #:make-flags
       (let ((out (assoc-ref %outputs "out")))
         (list "CC=gcc"
               (string-append "PREFIX=" out)))
       #:phases
       (modify-phases %standard-phases
         (delete 'configure))))
    (home-page "https://github.com/theZiz/aha")
    (synopsis "Converts terminal escape sequences to HTML")
    (description "@command{aha} (Ansi Html Adapter) converts ANSI escape sequences
of a Unix terminal to HTML code.")
    (license (list license:lgpl2.0+ license:mpl1.1))))