mailmap: Update entries for Nikita.
[jackhill/guix/guix.git] / gnu / packages / textutils.scm
CommitLineData
7939e70a
TUBK
1;;; GNU Guix --- Functional package management for GNU
2;;; Copyright © 2015 Taylan Ulrich Bayırlı/Kammer <taylanbayirli@gmail.com>
0ad0ecee 3;;; Copyright © 2015, 2016, 2017, 2018, 2019 Ricardo Wurmus <rekado@elephly.net>
a64a8c46 4;;; Copyright © 2015, 2016 Ben Woodcroft <donttrustben@gmail.com>
e522d840 5;;; Copyright © 2015 Roel Janssen <roel@gnu.org>
2d8cf0b3 6;;; Copyright © 2016 Jelle Licht <jlicht@fsfe.org>
f17a5447 7;;; Copyright © 2016 Alex Griffin <a@ajgrf.com>
30024a1e 8;;; Copyright © 2016, 2018, 2019 Efraim Flashner <efraim@flashner.co.il>
3c986a7d 9;;; Copyright © 2016 Nikita <nikita@n0.is>
0905048a 10;;; Copyright © 2016 Marius Bakke <mbakke@fastmail.com>
5353cea0 11;;; Copyright © 2017 Eric Bavier <bavier@member.fsf.org>
7fdca77e 12;;; Copyright © 2017 Rene Saavedra <rennes@openmailbox.org>
7577ab55 13;;; Copyright © 2017,2019 Hartmut Goebel <h.goebel@crazy-compilers.com>
3c8ba11a 14;;; Copyright © 2017 Kei Kebreau <kkebreau@posteo.net>
bfcdf887 15;;; Copyright © 2017 Alex Vong <alexvong1995@gmail.com>
9a71213e 16;;; Copyright © 2018, 2019, 2020 Tobias Geerinckx-Rice <me@tobias.gr>
4715f92e 17;;; Copyright © 2018 Pierre Neidhardt <mail@ambrevar.xyz>
1a0363cf 18;;; Copyright © 2018 Meiyo Peng <meiyo.peng@gmail.com>
933ac939 19;;; Copyright © 2019 Yoshinori Arai <kumagusu08@gmail.com>
f9488b70 20;;; Copyright © 2019 Mădălin Ionel Patrașcu <madalinionel.patrascu@mdc-berlin.de>
24719e8a 21;;; Copyright © 2019 Wiktor Żelazny <wzelazny@vurv.cz>
7939e70a
TUBK
22;;;
23;;; This file is part of GNU Guix.
24;;;
25;;; GNU Guix is free software; you can redistribute it and/or modify it
26;;; under the terms of the GNU General Public License as published by
27;;; the Free Software Foundation; either version 3 of the License, or (at
28;;; your option) any later version.
29;;;
30;;; GNU Guix is distributed in the hope that it will be useful, but
31;;; WITHOUT ANY WARRANTY; without even the implied warranty of
32;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33;;; GNU General Public License for more details.
34;;;
35;;; You should have received a copy of the GNU General Public License
36;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
37
38(define-module (gnu packages textutils)
39 #:use-module ((guix licenses) #:prefix license:)
40 #:use-module (guix packages)
41 #:use-module (guix download)
42 #:use-module (guix git-download)
ff3f6766 43 #:use-module (guix build-system ant)
7939e70a 44 #:use-module (guix build-system gnu)
03f801aa 45 #:use-module (guix build-system go)
c8d969b5 46 #:use-module (guix build-system cmake)
03639d03 47 #:use-module (guix build-system python)
9116f126 48 #:use-module (gnu packages)
f571e1c3 49 #:use-module (gnu packages autotools)
a6baae74 50 #:use-module (gnu packages base)
148585c2 51 #:use-module (gnu packages compression)
f9488b70 52 #:use-module (gnu packages gcc)
2f6e988d 53 #:use-module (gnu packages gettext)
ff3f6766 54 #:use-module (gnu packages java)
8888fe82 55 #:use-module (gnu packages ncurses)
56 #:use-module (gnu packages perl)
57 #:use-module (gnu packages pkg-config)
f17a5447 58 #:use-module (gnu packages python)
44d10b1f 59 #:use-module (gnu packages python-xyz)
8888fe82 60 #:use-module (gnu packages readline)
1506d491
EF
61 #:use-module (gnu packages slang)
62 #:use-module (gnu packages web))
7939e70a 63
2f6e988d
KK
;; dos2unix: line-break converter.  The source has no configure script, so
;; the compiler and the installation prefix are handed to make directly.
(define-public dos2unix
  (package
    (name "dos2unix")
    (version "7.4.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://waterlan.home.xs4all.nl/dos2unix/dos2unix-"
                           version ".tar.gz"))
       (sha256
        (base32 "08w6yywzirsxq8bh87jycvvw922ybhc2l426j2iqzliyn1h8mm8w"))))
    (build-system gnu-build-system)
    (native-inputs
     (list (list "gettext" gettext-minimal)
           (list "perl" perl)))
    (arguments
     '(#:phases
       (modify-phases %standard-phases
         ;; Nothing to configure: there is no configure script.
         (delete 'configure))
       #:make-flags
       (list "CC=gcc"
             (string-append "prefix=" (assoc-ref %outputs "out")))))
    (synopsis "DOS/Mac to Unix and vice versa text file format converter")
    (description
     "dos2unix is a tool to convert line breaks in a text file from Unix format
to DOS format and vice versa.")
    (home-page "https://waterlan.home.xs4all.nl/dos2unix.html")
    (license license:bsd-2)))
91
7939e70a
TUBK
;; recode: character-set conversion library and command-line front-end,
;; built from the upstream GitHub release tarball.
(define-public recode
  (package
    (name "recode")
    (version "3.7.6")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/rrthomas/recode/releases/download/v"
             version "/recode-" version ".tar.gz"))
       (sha256
        (base32 "0m59sd1ca0zw1aydpc3m8sw03nc885knmccqryg7byzmqs585ia6"))))
    (build-system gnu-build-system)
    ;; Python and Cython are needed at build time only.
    (native-inputs
     (list (list "python" python)
           (list "python-cython" python-cython)))
    (synopsis "Text encoding converter")
    (description "The Recode library converts files between character sets and
usages. It recognises or produces over 200 different character sets (or about
300 if combined with an iconv library) and transliterates files between almost
any pair. When exact transliteration are not possible, it gets rid of
offending characters or falls back on approximations. The recode program is a
handy front-end to the library.")
    (home-page "https://github.com/rrthomas/recode")
    (license license:gpl3+)))
688fe865
TUBK
116
;; enca: charset detection library (libenca) plus command-line front-end,
;; fetched from the upstream git repository at the tag named after VERSION.
(define-public enca
  (package
    (name "enca")
    (version "1.19")
    (home-page "https://github.com/nijel/enca")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/nijel/enca")
             (commit version)))
       (file-name (git-file-name name version))
       (sha256
        (base32 "19q7cwwxmmk5j9438bsqdpjvdjawsd3zmw1zyqgi7s4m0rasr3ah"))))
    (build-system gnu-build-system)
    ;; The recode input stays disabled: enca-1.19 tests fail with recent
    ;; recode.
    ;(inputs `(("recode" ,recode)))
    (synopsis "Text encoding detection tool")
    (description "Enca (Extremely Naive Charset Analyser) consists of libenca,
an encoding detection library, and enca, a command line frontend, integrating
libenca and several charset conversion libraries and tools.")
    (license license:gpl2)))
cd15ad82
RW
139
;; utf8proc: C library for Unicode normalization and case-folding of UTF-8
;; data.  The test suite wants Unicode data files that upstream's Makefile
;; would fetch with curl; they are provided as native inputs instead and
;; copied into "data/" before the check phase.
(define-public utf8proc
  (package
    (name "utf8proc")
    (version "2.4.0")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/JuliaStrings/utf8proc")
             (commit (string-append "v" version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "1i42hqwc8znqii9brangwkxk5cyc2lk95ip405fg88zr7z2ncr34"))))
    (build-system gnu-build-system)
    (native-inputs                 ;test data that is otherwise downloaded with curl
     `(("NormalizationTest.txt"
        ,(origin
           (method url-fetch)
           (uri (string-append "https://www.unicode.org/Public/12.1.0/ucd/"
                               "NormalizationTest.txt"))
           (sha256
            (base32 "0hb97k9xv1lr847hwz0719ksqy39s47xw6k01dgs1368jdibvawc"))))
       ("GraphemeBreakTest.txt"
        ,(origin
           (method url-fetch)
           (uri (string-append "https://www.unicode.org/Public/12.1.0/ucd/"
                               "auxiliary/GraphemeBreakTest.txt"))
           (sha256
            (base32 "0qc90ppmrwfn3y9cdn8jcjrn7qpdf0fhxkwh945yp4rvh37mbgcm"))))

       ;; For tests.
       ("perl" ,perl)))
    (arguments
     '(#:make-flags (list "CC=gcc"
                          (string-append "prefix=" (assoc-ref %outputs "out")))
       #:phases
       (modify-phases %standard-phases
         ;; The package is built straight from its Makefile.
         (delete 'configure)
         (add-before 'check 'check-data
           (lambda* (#:key inputs #:allow-other-keys)
             ;; Put the pre-fetched Unicode test files where the test suite
             ;; looks for them.
             (for-each (lambda (i)
                         (copy-file (assoc-ref inputs i)
                                    (string-append "data/" i)))
                       '("NormalizationTest.txt" "GraphemeBreakTest.txt"))
             ;; Rewrite the break markers of the grapheme test file to the
             ;; ASCII characters "/" and "+".
             (substitute* "data/GraphemeBreakTest.txt"
               (("÷") "/")
               (("×") "+"))
             #t)))))
    (home-page "https://juliastrings.github.io/utf8proc/")
    (synopsis "C library for processing UTF-8 Unicode data")
    ;; The Unicode version named here matches the 12.1.0 data files the test
    ;; suite is run against; keep both in sync when updating the package.
    (description "utf8proc is a small C library that provides Unicode
normalization, case-folding, and other operations for data in the UTF-8
encoding, supporting Unicode version 12.1.0.")
    (license license:expat)))
f571e1c3 194
205df739
TGR
;; libconfuse: configuration file parser library, built from the upstream
;; release tarball with the default GNU build phases.
(define-public libconfuse
  (package
    (name "libconfuse")
    (version "3.2.2")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/martinh/libconfuse/releases/download/v"
             version "/confuse-" version ".tar.xz"))
       (sha256
        (base32 "02r1mmzik2m0iigbc2da3y754vj24i18r3ml5p2wzs027mjhn959"))))
    (build-system gnu-build-system)
    (synopsis "Configuration file parser library")
    (description "libconfuse is a configuration file parser library. It
supports sections and (lists of) values (strings, integers, floats, booleans
or other sections), as well as some other features (such as
single/double-quoted strings, environment variable expansion, functions and
nested include statements).")
    (home-page "https://github.com/martinh/libconfuse")
    (license license:isc)))
216
f571e1c3
RW
;; libgtextutils: text utilities library used by the fastx toolkit.  The
;; build system is regenerated by running the "reconf" script shipped in the
;; source tree in place of the standard bootstrap phase.
(define-public libgtextutils
  (package
    (name "libgtextutils")
    (version "0.7")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/agordon/libgtextutils/"
                           "releases/download/"
                           version "/libgtextutils-" version ".tar.gz"))
       (sha256
        (base32 "0jiybkb2z58wa2msvllnphr4js2hvjvh988pavb3mzkgr6ihwbkr"))))
    (build-system gnu-build-system)
    (native-inputs
     (list (list "autoconf" autoconf)
           (list "automake" automake)
           (list "gcc" gcc-5)           ;; doesn't build with later versions
           (list "libtool" libtool)))
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         (replace 'bootstrap
           (lambda _
             (invoke "sh" "reconf"))))))
    (synopsis "Gordon's text utils library")
    (description
     "libgtextutils is a text utilities library used by the fastx toolkit from
the Hannon Lab.")
    (home-page "https://github.com/agordon/libgtextutils")
    (license license:agpl3+)))
aae2b445
BW
246
;; cityhash: C++ string hash functions, built from a pinned upstream git
;; commit.  The "citycrc.h" header is installed by an extra phase because
;; the default install target leaves it out.
(define-public cityhash
  (let ((commit "8af9b8c"))
    (package
      (name "cityhash")
      (version (string-append "1.1-2." commit))
      (source (origin
                (method git-fetch)
                (uri (git-reference
                      (url "https://github.com/google/cityhash.git")
                      (commit commit)))
                ;; git-fetch produces a checkout directory, not a tarball,
                ;; so name it with git-file-name like the other git-fetch
                ;; origins in this file rather than with a ".tar.gz" suffix.
                (file-name (git-file-name name version))
                (sha256
                 (base32
                  "0n6skf5dv8yfl1ckax8dqhvsbslkwc9158zf2ims0xqdvzsahbi6"))))
      (build-system gnu-build-system)
      (arguments
       '(#:make-flags (list "CXXFLAGS=-g -O3")
         #:phases
         (modify-phases %standard-phases
           ;; citycrc is not installed by default but is used by some
           ;; programs.
           (add-after 'install 'install-citycrc
             (lambda* (#:key outputs #:allow-other-keys)
               (let* ((out (assoc-ref outputs "out"))
                      (include (string-append out "/include")))
                 (install-file "src/citycrc.h" include))
               #t)))))
      (home-page "https://github.com/google/cityhash")
      (synopsis "C++ hash functions for strings")
      (description
       "CityHash provides hash functions for strings. The functions mix the
input bits thoroughly but are not suitable for cryptography.")
      (license license:expat))))
e522d840 280
9128db21
RW
;; ustr: low-overhead C string library.  It is built straight from its
;; Makefile (no configure script), with the Makefile's hard-coded check for
;; /usr/include/stdint.h neutralised.
(define-public ustr
  (package
    (name "ustr")
    (version "1.0.4")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://www.and.org/ustr/" version
                           "/ustr-" version ".tar.bz2"))
       (sha256
        (base32 "1i623ygdj7rkizj7985q9d6vj5amwg686aqb5j3ixpkqkyp6xbrx"))
       (patches (search-patches "ustr-fix-build-with-gcc-5.patch"))))
    (build-system gnu-build-system)
    (arguments
     '(#:make-flags
       (list "CC=gcc"
             "HIDE="
             ;; "ldconfig" is not needed, so "/sbin/ldconfig" is overridden
             ;; with a harmless "echo".
             "LDCONFIG=echo"
             (string-append "prefix=" (assoc-ref %outputs "out"))
             "all-shared")
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'disable-check-for-stdint
           (lambda _
             ;; stdint.h exists, just not under /usr/include, so replace the
             ;; file test with one that does not look there.
             (substitute* (list "Makefile" "ustr-import.in")
               (("-f \"/usr/include/stdint.h\"") "-z \"\""))
             #t))
         ;; There is no configure script.
         (delete 'configure))))
    (synopsis "String library with very low memory overhead")
    (description
     "Ustr is a string library for C with very low memory overhead.")
    (home-page "http://www.and.org/ustr/")
    ;; Quoted from the home page: "The License for the code is MIT, new-BSD,
    ;; LGPL, etc. ... if you need another license to help compatibility, just
    ;; ask for it.  It's basically public domain, without all the legal
    ;; problems for everyone that trying to make something public domain
    ;; entails."
    (license license:public-domain)))
324
7577ab55
HG
;; ascii2binary: the ascii2binary and binary2ascii converters, built from
;; the upstream tarball with the default GNU build phases.
(define-public ascii2binary
  (package
    (name "ascii2binary")
    (version "2.14")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "http://billposer.org/Software/Downloads/ascii2binary-"
             version ".tar.bz2"))
       (sha256
        (base32 "0dc9fxcdmppbs9s06jvq61zbk552laxps0xyk098gj41697ihd96"))))
    (build-system gnu-build-system)
    (native-inputs
     (list (list "gettext" gettext-minimal)))
    (synopsis "Convert between ASCII, hexadecimal and binary representations")
    (description "The two programs are useful for generating test data, for
inspecting binary files, and for interfacing programs that generate textual
output to programs that require binary input and conversely. They can also be
useful when it is desired to reformat numbers.

@itemize

@item @command{ascii2binary} reads input consisting of ascii or hexadecimal
 representation numbers separated by whitespace and produces as output
 the binary equivalents. The type and precision of the binary output
 is selected using command line flags.

@item @command{binary2ascii} reads input consisting of binary numbers
 and converts them to their ascii or hexadecimal representation.
 Command line flags specify the type and size of the binary numbers
 and provide control over the format of the output.
 Unsigned integers may be written out in binary, octal, decimal,
 or hexadecimal.

 Signed integers may be written out only in binary or decimal. Floating
 point numbers may be written out only decimal, either in standard or
 scientific notation. (If you want to examine the binary representation
 of floating point numbers, just treat the input as a sequence of unsigned
 characters.)

@end itemize")
    (home-page "https://billposer.org/Software/a2b.html")
    (license license:gpl3)))
368
;; uniutils: tools for inspecting Unicode text.  After the build, the
;; "utf8lookup" script is patched so the helper programs it starts lines with
;; are referenced by absolute file name.
(define-public uniutils
  (package
    (name "uniutils")
    (version "2.27")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://billposer.org/Software/Downloads/uniutils-"
                           version ".tar.bz2"))
       (sha256
        (base32 "19w1510w87gx7n4qy3zsb0m467a4rn5scvh4ajajg7jh6x5xri08"))))
    (build-system gnu-build-system)
    (inputs
     (list (list "ascii2binary" ascii2binary)
           (list "libiconv" libiconv)))
    (arguments
     '(#:configure-flags '("--disable-dependency-tracking")
       #:phases
       (modify-phases %standard-phases
         (add-after 'build 'fix-paths
           (lambda* (#:key outputs inputs #:allow-other-keys)
             (let ((own-bin   (string-append (assoc-ref outputs "out") "/bin/"))
                   (a2b-bin   (string-append (assoc-ref inputs "ascii2binary")
                                             "/bin/"))
                   (iconv-bin (string-append (assoc-ref inputs "libiconv")
                                             "/bin/")))
               ;; Only commands at the very start of a line are rewritten.
               (substitute* "utf8lookup"
                 (("^ascii2binary ")
                  (string-append a2b-bin "ascii2binary "))
                 (("^uniname ")
                  (string-append own-bin "uniname "))
                 (("^iconv ")
                  (string-append iconv-bin "iconv ")))
               #t))))))
    (synopsis "Find out what is in a Unicode file")
    (description "Useful tools when working with Unicode files when one
doesn't know the writing system, doesn't have the necessary font, needs to
inspect invisible characters, needs to find out whether characters have been
combined or in what order they occur, or needs statistics on which characters
occur.

@itemize

@item @command{uniname} defaults to printing the character offset of each
character, its byte offset, its hex code value, its encoding, the glyph
itself, and its name. It may also be used to validate UTF-8 input.

@item @command{unidesc} reports the character ranges to which different
portions of the text belong. It can also be used to identify Unicode encodings
(e.g. UTF-16be) flagged by magic numbers.

@item @command{unihist} generates a histogram of the characters in its input.

@item @command{ExplicateUTF8} is intended for debugging or for learning about
Unicode. It determines and explains the validity of a sequence of bytes as a
UTF8 encoding.

@item @command{utf8lookup} provides a handy way to look up Unicode characters
from the command line.

@item @command{unireverse} reverse each line of UTF-8 input
character-by-character.

@end itemize")
    (home-page "https://billposer.org/Software/unidesc.html")
    (license license:gpl3)))
430
e522d840
RJ
;; libconfig: C/C++ library for structured configuration files.  The
;; download URL is derived from HOME-PAGE, so HOME-PAGE must stay defined
;; before SOURCE.
(define-public libconfig
  (package
    (name "libconfig")
    (version "1.7.2")
    (home-page "https://hyperrealm.github.io/libconfig/")
    (source (origin
              (method url-fetch)
              ;; HOME-PAGE already ends with a slash; do not add another one
              ;; or the resulting URL contains "//dist/".
              (uri (string-append home-page "dist/libconfig-"
                                  version ".tar.gz"))
              (sha256
               (base32
                "1ngs2qx3cx5cbwinc5mvadly0b5n7s86zsc68c404czzfff7lg3w"))))
    (build-system gnu-build-system)
    (synopsis "C/C++ configuration file library")
    (description
     "Libconfig is a simple library for manipulating structured configuration
files. This file format is more compact and more readable than XML. And
unlike XML, it is type-aware, so it is not necessary to do string parsing in
application code.")
    (license license:lgpl2.1+)))
c8d969b5
PP
451
;; pfff: probabilistic fast file fingerprinting tool, built with CMake from
;; the upstream git tag "v" followed by VERSION.
(define-public pfff
  (package
    (name "pfff")
    (version "1.0")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/pfff/pfff")
             (commit (string-append "v" version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "1nxkfm7zliq3rmr7yp871sppwfnz71iz364m2sgazny71pzykggc"))))
    (build-system cmake-build-system)
    (synopsis "Probabilistic fast file fingerprinting tool")
    (description
     "pfff is a tool for calculating a compact digital fingerprint of a file
by sampling randomly from the file instead of reading it in full.
Consequently, the computation has a flat performance characteristic,
correlated with data variation rather than file size. pfff can be as reliable
as existing hashing techniques, with provably negligible risk of collisions.")
    (home-page "https://biit.cs.ut.ee/pfff/")
    (license license:bsd-3)))
2d8cf0b3
JL
475
;; oniguruma: regular expression library.  The release directory on GitHub
;; uses "_" where the version string has "-", so the version is converted
;; for that part of the download URL only.
(define-public oniguruma
  (package
    (name "oniguruma")
    (version "6.9.5-rev1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/kkos/oniguruma/releases/download/v"
             ;; Release tag: every "-" in the version becomes "_".
             (string-map (lambda (ch)
                           (case ch
                             ((#\-) #\_)
                             (else ch)))
                         version)
             "/onig-" version ".tar.gz"))
       (sha256
        (base32 "17m92k1n6bvza6m35fpd5g36zwpwm3hfz3478iwj5bvj2sfq8g6k"))))
    (build-system gnu-build-system)
    (arguments
     (list #:configure-flags ''("--disable-static")))
    (synopsis "Regular expression library")
    (description "Oniguruma is a regular expressions library. The special
characteristic of this library is that different character encoding for every
regular expression object can be specified.")
    (home-page "https://github.com/kkos/oniguruma")
    (license license:bsd-2)))
1b90e57e
RW
500
;; antiword: Microsoft Word document reader.  It is built with the bundled
;; "Makefile.Linux"; the configure phase is replaced by a source edit and the
;; install phase by the Makefile's "global_install" target.
(define-public antiword
  (package
    (name "antiword")
    (version "0.37")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://www.winfield.demon.nl/linux/antiword-"
                           version ".tar.gz"))
       (sha256
        (base32 "1b7mi1l20jhj09kyh0bq14qzz8vdhhyf35gzwsq43mn6rc7h0b4f"))
       (patches (search-patches "antiword-CVE-2014-8123.patch"))))
    (build-system gnu-build-system)
    (arguments
     '(#:tests? #f                      ; There are no tests
       #:make-flags
       (list "-f" "Makefile.Linux"
             (string-append "GLOBAL_INSTALL_DIR="
                            (assoc-ref %outputs "out") "/bin")
             (string-append "GLOBAL_RESOURCES_DIR="
                            (assoc-ref %outputs "out") "/share/antiword"))
       #:phases
       (modify-phases %standard-phases
         (replace 'configure
           (lambda* (#:key outputs #:allow-other-keys)
             ;; Point the program at the package's own data directory so the
             ;; mapping files are found at run time.
             (let ((resources (string-append (assoc-ref outputs "out")
                                             "/share/antiword")))
               (substitute* "antiword.h"
                 (("/usr/share/antiword") resources)))
             #t))
         (replace 'install
           (lambda* (#:key make-flags #:allow-other-keys)
             ;; Run "make global_install" with the same flags as the build.
             (apply invoke "make" "global_install" make-flags))))))
    (synopsis "Microsoft Word document reader")
    (description "Antiword is an application for displaying Microsoft Word
documents. It can also convert the document to PostScript or XML. Only
documents made by MS Word version 2 and version 6 or later are supported. The
name comes from: \"The antidote against people who send Microsoft Word files
to everybody, because they believe that everybody runs Windows and therefore
runs Word\".")
    (home-page "http://www.winfield.demon.nl/")
    (license license:gpl2+)))
f17a5447 544
d0abaf89
HG
;; catdoc: text extractor for MS-Word, Excel and PowerPoint files.  The
;; Tk-based "wordview" front-end is disabled, and the man page directory is
;; created before the install phase runs.
(define-public catdoc
  (package
    (name "catdoc")
    (version "0.95")
    (source (origin
              (method url-fetch)
              (uri (string-append "http://ftp.wagner.pp.ru/pub/catdoc/"
                                  "catdoc-" version ".tar.gz"))
              (patches (search-patches "catdoc-CVE-2017-11110.patch"))
              (sha256
               (base32
                "15h7v3bmwfk4z8r78xs5ih6vd0pskn0rj90xghvbzdjj0cc88jji"))))
    (build-system gnu-build-system)
    ;; TODO: Also build `wordview` which requires `tk` – make a separate
    ;; package for this.
    (arguments
     '(#:tests? #f                      ; There are no tests
       #:configure-flags '("--disable-wordview")
       #:phases
       (modify-phases %standard-phases
         (add-before 'install 'fix-install
           (lambda* (#:key outputs #:allow-other-keys)
             ;; Create the man page directory ahead of the install phase.
             (let ((out (assoc-ref outputs "out")))
               (mkdir-p (string-append out "/share/man/man1"))
               ;; Return #t explicitly, like the other phases in this file.
               #t))))))
    (home-page "https://www.wagner.pp.ru/~vitus/software/catdoc/")
    (synopsis "MS-Word to TeX or plain text converter")
    (description "@command{catdoc} extracts text from MS-Word files, trying to
preserve as many special printable characters as possible. It supports
everything up to Word-97. Also supported are MS Write documents and RTF files.

@command{catdoc} does not preserve complex word formatting, but it can
translate some non-ASCII characters into TeX escape codes. Its goal is to
extract plain text and allow you to read it and, probably, reformat with TeX,
according to TeXnical rules.

This package also provides @command{xls2csv}, which extracts data from Excel
spreadsheets and outputs it in comma-separated-value format, and
@command{catppt}, which extracts data from PowerPoint presentations.")
    (license license:gpl2+)))
584
f17a5447
AG
;; utfcpp: C++ library for UTF-8 handling.  It is built in the source tree
;; with CMake; since there is no install target, the install phase copies
;; the "source" directory and the README by hand.
(define-public utfcpp
  (package
    (name "utfcpp")
    (version "2.3.5")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/nemtrif/utfcpp")
             (commit (string-append "v" version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "1gr98d826z6wa58r1s5i7rz7q2x3r31v7zj0pjjlrc7gfxwklr4s"))))
    (build-system cmake-build-system)
    (arguments
     `(#:out-of-source? #f
       #:phases
       (modify-phases %standard-phases
         (replace 'install              ; no install target
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((prefix      (assoc-ref outputs "out"))
                    (include-dir (string-append prefix "/include"))
                    (doc-dir     (string-append prefix "/share/doc/" ,name)))
               ;; The contents of "source" go to include/, the README to
               ;; the documentation directory.
               (copy-recursively "source" include-dir)
               (install-file "README.md" doc-dir)
               #t))))))
    (synopsis "Portable C++ library for handling UTF-8")
    (description "UTF8-CPP is a C++ library for handling UTF-8 encoded text
in a portable way.")
    (home-page "https://github.com/nemtrif/utfcpp")
    (license license:boost1.0)))
8888fe82 616
;; dbacl: Bayesian text and email classifier.  Sample files whose licensing
;; is unclear or unsuitable are removed after unpacking (see the license
;; comment at the end), the test scripts are adjusted to use the tools found
;; in PATH at build time, and the build system is regenerated with
;; autoreconf.
(define-public dbacl
  (package
    (name "dbacl")
    (version "1.14.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "mirror://sourceforge/dbacl/dbacl/" version "/"
                           "dbacl-" version ".tar.gz"))
       (sha256
        (base32 "1gas0112wqjvwn9qg3hxnawk7h3prr0w9b2h68f3p1ifd1kzn3gz"))
       (patches (search-patches "dbacl-include-locale.h.patch"))))
    (build-system gnu-build-system)
    (arguments
     `(#:make-flags
       ;; NOTE(review): these "-I" options are handed to make itself, not to
       ;; the compiler -- confirm that this is intended.
       (list
        (string-append "-I" (assoc-ref %build-inputs "slang")
                       "/include/slang")
        (string-append "-I" (assoc-ref %build-inputs "ncurses")
                       "/include/ncurses"))
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'delete-sample6-and-japanese
           (lambda _
             ;; Drop the two excluded sample files and every reference to
             ;; them and to the Japanese test from the automake files.
             (substitute* "doc/Makefile.am"
               (("sample6.txt") "")
               (("japanese.txt") ""))
             (delete-file "doc/sample6.txt")
             (delete-file "doc/japanese.txt")
             (substitute* (list "src/tests/Makefile.am"
                                "src/tests/Makefile.in")
               (("dbacl-jap.shin") "")
               (("dbacl-jap.sh") ""))
             #t))
         (add-after 'unpack 'delete-test
           ;; See comments about the license.
           (lambda _
             (delete-file "src/tests/dbacl-jap.shin")
             #t))
         (add-after 'unpack 'fix-test-files
           (lambda _
             ;; Comment out the hard-coded PATH and refer to "diff" and "tr"
             ;; by the absolute file names found at build time.
             ;; NOTE(review): the "diff" and "tr" patterns are unanchored
             ;; and match those letters anywhere in a line -- confirm that
             ;; this is intended.
             (substitute* (find-files "src/tests/" "\\.shin$")
               (("PATH=/bin:/usr/bin")
                "#PATH=/bin:/usr/bin")
               (("diff") (which "diff"))
               (("tr") (which "tr")))
             #t))
         (replace 'bootstrap
           (lambda _
             (invoke "autoreconf" "-vif")
             #t)))))
    (inputs
     `(("ncurses" ,ncurses)
       ("perl" ,perl)
       ("readline" ,readline)
       ("slang" ,slang)))
    (native-inputs
     `(("libtool" ,libtool)
       ("autoconf" ,autoconf)
       ("automake" ,automake)
       ("pkg-config" ,pkg-config)))
    (home-page "https://www.lbreyer.com/dbacl.html")
    (synopsis "Bayesian text and email classifier")
    (description
     "dbacl is a fast Bayesian text and email classifier. It builds a variety
of language models using maximum entropy (minimum divergence) principles, and
these can then be used to categorize input data automatically among multiple
categories.")
    ;; The software is licensed as GPLv3 or later, but
    ;; includes various sample texts in the doc dir:
    ;; - sample1.txt, sample3 and sample5.txt are in the public domain,
    ;;   by Mark Twain.
    ;; - sample2.txt, sample4.txt are in the public domain, by Aristotle.
    ;; - sample6.txt is a forwarded email, copyright unknown.
    ;;   Guix does exclude sample6.txt.
    ;; - japanese.txt is a Japanese unofficial translation of the
    ;;   GNU General Public License, (c) by the Free Software Foundation.
    ;;   Guix excludes this file.
    (license (list license:gpl3+ license:public-domain))))
0905048a
MB
698
;; dotconf: C library for creating and parsing configuration files, built
;; from the upstream git tag "v" followed by VERSION.  The test suite is
;; disabled.
(define-public dotconf
  (package
    (name "dotconf")
    (version "1.3")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/williamh/dotconf.git")
             (commit (string-append "v" version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "1sc95hw5k2xagpafny0v35filmcn05k1ds5ghkldfpf6xw4hakp7"))))
    (build-system gnu-build-system)
    (native-inputs
     (list (list "autoconf" autoconf)
           (list "automake" automake)
           (list "libtool" libtool)))
    (arguments
     '(#:tests? #f))                    ; FIXME maketest.sh does not work.
    (synopsis "Configuration file parser library")
    (description
     "C library for creating and parsing configuration files.")
    (home-page "https://github.com/williamh/dotconf")
    (license (list license:lgpl2.1      ; Main distribution.
                   license:asl1.1))))   ; src/readdir.{c,h}
ff3f6766
RW
724
;; java-rsyntaxtextarea: syntax highlighting text component for Java Swing,
;; built with Ant into "rsyntaxtextarea.jar".  The test suite is disabled.
(define-public java-rsyntaxtextarea
  (package
    (name "java-rsyntaxtextarea")
    (version "2.6.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/bobbylight/RSyntaxTextArea/archive/"
             version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "0c5mqg2klj5rvf8fhycrli8rf6s37l9p7a8knw9gpp65r1c120q2"))))
    (build-system ant-build-system)
    (native-inputs
     (list (list "java-junit" java-junit)
           (list "java-hamcrest-core" java-hamcrest-core)))
    (arguments
     '(#:jar-name "rsyntaxtextarea.jar"
       ;; FIXME: some tests fail because locale resources cannot be found.
       ;; Even when I add them to the class path,
       ;; RSyntaxTextAreaEditorKitDumbCompleteWordActionTest fails.
       #:tests? #f))
    (synopsis "Syntax highlighting text component for Java Swing")
    (description "RSyntaxTextArea is a syntax highlighting, code folding text
component for Java Swing. It extends @code{JTextComponent} so it integrates
completely with the standard @code{javax.swing.text} package. It is fast and
efficient, and can be used in any application that needs to edit or view
source code.")
    (home-page "https://bobbylight.github.io/RSyntaxTextArea/")
    (license license:bsd-3)))
03639d03
RW
756
;; The sources come from git rather than from the PyPI tarball: the tarball
;; lacks the Cython source file that bycython.cpp is generated from.  The
;; bundled bycython.cpp is deleted and regenerated during the build.
(define-public python-editdistance
  (let ((revision "1")
        (commit "3ea84a7dd3258c76aa3be851ef3d50e59c886846"))
    (package
      (name "python-editdistance")
      ;; Expands to "0.3.1-" REVISION "." followed by the first seven
      ;; characters of COMMIT.
      (version (git-version "0.3.1" revision commit))
      (source
       (origin
         (method git-fetch)
         (uri (git-reference
               (url "https://github.com/aflc/editdistance.git")
               (commit commit)))
         (file-name (git-file-name name version))
         (sha256
          (base32 "1l43svsv12crvzphrgi6x435z6xg8m086c64armp8wzb4l8ccm7g"))))
      (build-system python-build-system)
      (native-inputs
       (list (list "python-cython" python-cython)))
      (arguments
       '(#:phases
         (modify-phases %standard-phases
           (add-after 'unpack 'build-cython-code
             (lambda _
               ;; Regenerate the C++ file from its Cython source.
               (with-directory-excursion "editdistance"
                 (delete-file "bycython.cpp")
                 (invoke "cython" "--cplus" "bycython.pyx")))))))
      (synopsis "Fast implementation of the edit distance (Levenshtein distance)")
      (description
       "This library simply implements Levenshtein distance algorithm with C++
and Cython.")
      (home-page "https://www.github.com/aflc/editdistance")
      (license license:expat))))
03f801aa
CB
793
;; go-runewidth: Go functions for measuring and padding strings, built from
;; a pinned upstream commit.  The package version combines the base version,
;; the revision and the commit through git-version.
(define-public go-github.com-mattn-go-runewidth
  (let ((version "0.0.4")
        (revision "1")
        (commit "703b5e6b11ae25aeb2af9ebb5d5fdf8fa2575211"))
    (package
      (name "go-github.com-mattn-go-runewidth")
      (version (git-version version revision commit))
      (source
       (origin
         (method git-fetch)
         (uri (git-reference
               (url "https://github.com/mattn/runewidth")
               (commit commit)))
         (file-name (git-file-name name version))
         (sha256
          (base32 "0znpyz71gajx3g0j2zp63nhjj2c07g16885vxv4ykwnrfmzbgk4w"))))
      (build-system go-build-system)
      (arguments
       (list #:import-path "github.com/mattn/go-runewidth"))
      (home-page "https://github.com/mattn/runewidth")
      (synopsis "@code{runewidth} provides Go functions to work with string widths")
      (description
       "The @code{runewidth} library provides Go functions for padding,
measuring and checking the width of strings, with support for East Asian
text.")
      (license license:expat))))
74fa77e9
PN
821
(define-public docx2txt
  (package
    (name "docx2txt")
    (version "1.4")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "mirror://sourceforge/docx2txt/docx2txt/v"
                    version "/docx2txt-" version ".tgz"))
              (sha256
               (base32
                "06vdikjvpj6qdb41d8wzfnyj44jpnknmlgbhbr1w215420lpb5xj"))))
    (build-system gnu-build-system)
    (inputs
     `(("unzip" ,unzip)
       ("perl" ,perl)))
    (arguments
     `(#:tests? #f                      ; No tests.
       #:make-flags (list (string-append "BINDIR="
                                         (assoc-ref %outputs "out") "/bin")
                          (string-append "CONFIGDIR="
                                         (assoc-ref %outputs "out") "/etc")
                          ;; Makefile seems to be a bit dumb at guessing.
                          "INSTALL=install"
                          "PERL=perl")
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         ;; Tidy up what 'make install' produced: drop the shell wrapper,
         ;; give the Perl script its final name, point the configuration at
         ;; the store's unzip, and fix the configuration file's mode.
         (add-after 'install 'fix-install
           (lambda* (#:key outputs inputs #:allow-other-keys)
             (let* ((out    (assoc-ref outputs "out"))
                    (bin    (string-append out "/bin"))
                    (config (string-append out "/etc/docx2txt.config"))
                    (unzip  (assoc-ref inputs "unzip")))
               ;; According to INSTALL, the .sh wrapper can be skipped.
               (delete-file (string-append bin "/docx2txt.sh"))
               (rename-file (string-append bin "/docx2txt.pl")
                            (string-append bin "/docx2txt"))
               (substitute* config
                 (("config_unzip => '/usr/bin/unzip',")
                  (string-append "config_unzip => '"
                                 unzip
                                 "/bin/unzip',")))
               ;; Makefile is wrong.
               (chmod config #o644)
               #t))))))
    (synopsis "Recover text from @file{.docx} files, with good formatting")
    (description
     "@command{docx2txt} is a Perl based command line utility to convert
Microsoft Office @file{.docx} documents to equivalent text documents. Latest
version supports following features during text extraction.

@itemize
@item Character conversions; currency characters are converted to respective
names like Euro.
@item Capitalisation of text blocks.
@item Center and right justification of text fitting in a line of
(configurable) 80 columns.
@item Horizontal ruler, line breaks, paragraphs separation, tabs.
@item Indicating hyperlinked text along with the hyperlink (configurable).
@item Handling (bullet, decimal, letter, roman) lists along with (attempt at)
indentation.
@end itemize\n")
    (home-page "http://docx2txt.sourceforge.net")
    (license license:gpl3+)))
1a0363cf 887
bc5aa386
VC
(define-public odt2txt
  (package
    (name "odt2txt")
    (version "0.5")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/dstosberg/odt2txt/")
             ;; Upstream tags are of the form "vX.Y".
             (commit (string-append "v" version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "0im3kzvhxkjlx57w6h13mc9584c74ma1dyymgvpq2y61av3gc35v"))))
    (build-system gnu-build-system)
    (inputs
     `(("zlib" ,zlib)))
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         ;; no configure script
         (delete 'configure))
       #:make-flags
       (list "CC=gcc"
             (string-append "DESTDIR=" (assoc-ref %outputs "out")))
       #:tests? #f))                    ; no make check
    (synopsis "Converter from OpenDocument Text to plain text")
    (description "odt2txt is a command-line tool which extracts the text out
of OpenDocument Texts, as produced by OpenOffice.org, KOffice, StarOffice and
others.

odt2txt can also extract text from some file formats similar to OpenDocument
Text, such as OpenOffice.org XML (*.sxw), which was used by OpenOffice.org
version 1.x and older StarOffice versions. To a lesser extent, odt2txt may be
useful to extract content from OpenDocument spreadsheets (*.ods) and
OpenDocument presentations (*.odp).")
    (home-page "https://github.com/dstosberg/odt2txt/")
    (license license:gpl2)))
925
1a0363cf
MP
(define-public opencc
  (package
    (name "opencc")
    (version "1.0.5")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/BYVoid/OpenCC")
             (commit (string-append "ver." version))))
       (file-name (git-file-name name version))
       (sha256
        (base32
         "1pv5md225qwhbn8ql932zdg6gh1qlx3paiajaks8gfsa07yzvhr4"))
       (modules '((guix build utils)))
       (snippet
        '(begin
           ;; TODO: Unbundle tclap, darts-clone, gtest
           (delete-file-recursively "deps/rapidjson-0.11") #t))))
    (build-system cmake-build-system)
    (arguments
     '(#:phases
       (modify-phases %standard-phases
         ;; The bundled rapidjson is removed by the source snippet; make the
         ;; build look at the "rapidjson" input instead.
         (add-after 'unpack 'patch-3rd-party-references
           (lambda* (#:key inputs #:allow-other-keys)
             (let ((rapidjson (assoc-ref inputs "rapidjson")))
               (substitute* "src/CMakeLists.txt"
                 ;; The pattern is a regular expression: escape the dots so
                 ;; that only the literal relative path is matched.
                 (("\\.\\./deps/rapidjson-0\\.11")
                  (string-append rapidjson "/include/rapidjson")))
               #t))))))
    (native-inputs
     `(("python" ,python-wrapper)
       ("rapidjson" ,rapidjson)))
    (home-page "https://github.com/BYVoid/OpenCC")
    (synopsis "Convert between Traditional Chinese and Simplified Chinese")
    (description "Open Chinese Convert (OpenCC) converts between Traditional
Chinese and Simplified Chinese, supporting character-level conversion,
phrase-level conversion, variant conversion, and regional idioms among
Mainland China, Taiwan, and Hong-Kong.")
    (license license:asl2.0)))
933ac939
YA
966
(define-public nkf
  ;; The source is pinned by commit, but the package keeps the plain upstream
  ;; version string, so no revision counter is needed here.
  (let ((commit "08043eadf4abdddcf277842217e3c77a24740dc2"))
    (package
      (name "nkf")
      ;; The commits corresponding to specific versions are published
      ;; here:
      ;; https://ja.osdn.net/projects/nkf/scm/git/nkf/
      (version "2.1.5")
      (source (origin
                (method git-fetch)
                (uri (git-reference
                      (url "https://github.com/nurse/nkf.git")
                      (commit commit)))
                (file-name (git-file-name name version))
                (sha256
                 (base32
                  "0anw0knr1iy4p9w3d3b3pbwzh1c43p1i2q4c28kw9zviw8kx2rly"))))
      (build-system gnu-build-system)
      (arguments
       `(#:tests? #f                    ; test for perl module
         #:make-flags (list "CC=gcc" "CFLAGS=-O2 -Wall -pedantic"
                            (string-append "prefix=" %output)
                            "MKDIR=mkdir -p")
         #:phases
         (modify-phases %standard-phases
           (delete 'configure))))       ; No ./configure script
      (home-page "https://ja.osdn.net/projects/nkf/")
      (synopsis "Network Kanji Filter")
      (description "Nkf is yet another kanji code converter among networks,
hosts and terminals. It converts input kanji code to designated kanji code
such as ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8, UTF-16 or UTF-32.")
      (license license:zlib))))
74247b80
NG
1000
(define-public python-pandocfilters
  (package
    (name "python-pandocfilters")
    (version "1.4.2")
    (source
     (origin
       (method url-fetch)
       ;; Release tarball published on PyPI under the name "pandocfilters".
       (uri (pypi-uri "pandocfilters" version))
       (sha256
        (base32 "1a8d9b7s48gmq9zj0pmbyv2sivn5i7m6mybgpkk4jm5vd7hp1pdk"))))
    (build-system python-build-system)
    (synopsis "Python module for writing Pandoc filters")
    (description "Pandoc is a powerful utility to transform various
input formats into a wide range of output formats. To alter the
exported output document, Pandoc allows the usage of filters, which
are pipes that read a JSON serialization of the Pandoc AST from stdin,
transform it in some way, and write it to stdout. It allows therefore
to alter the processing of Pandoc's supported input formats, for
instance one can add new syntax elements to markdown, etc.

This package provides Python bindings.")
    (home-page "https://github.com/jgm/pandocfilters")
    (license license:bsd-3)))
24719e8a
1025
(define-public aha
  (package
    (name "aha")
    (version "0.5")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/theZiz/aha")
             ;; Upstream tags are the bare version number.
             (commit version)))
       (file-name (git-file-name name version))
       (sha256
        (base32 "0byml4rmpiaalwx69jcixl3yvpvwmwiss1jzgsqwshilb2p4qnmz"))))
    (build-system gnu-build-system)
    (arguments
     '(#:tests? #f                      ; no check target
       #:make-flags
       (list "CC=gcc"
             (string-append "PREFIX=" (assoc-ref %outputs "out")))
       #:phases
       (modify-phases %standard-phases
         ;; The 'configure' phase is not wanted for this package.
         (delete 'configure))))
    (synopsis "Converts terminal escape sequences to HTML")
    (description "@command{aha} (Ansi Html Adapter) converts ANSI escape sequences
of a Unix terminal to HTML code.")
    (home-page "https://github.com/theZiz/aha")
    (license (list license:lgpl2.0+ license:mpl1.1))))