gnu: python-pandas: Fix build on 32-bit.
[jackhill/guix/guix.git] / gnu / packages / textutils.scm
1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2015 Taylan Ulrich Bayırlı/Kammer <taylanbayirli@gmail.com>
3 ;;; Copyright © 2015, 2016 Ricardo Wurmus <rekado@elephly.net>
4 ;;; Copyright © 2015, 2016 Ben Woodcroft <donttrustben@gmail.com>
5 ;;; Copyright © 2015 Roel Janssen <roel@gnu.org>
6 ;;; Copyright © 2016 Jelle Licht <jlicht@fsfe.org>
7 ;;; Copyright © 2016 Alex Griffin <a@ajgrf.com>
8 ;;; Copyright © 2016 Efraim Flashner <efraim@flashner.co.il>
9 ;;; Copyright © 2016 ng0 <ng0@we.make.ritual.n0.is>
10 ;;; Copyright © 2016 Marius Bakke <mbakke@fastmail.com>
11 ;;; Copyright © 2017 Eric Bavier <bavier@member.fsf.org>
12 ;;;
13 ;;; This file is part of GNU Guix.
14 ;;;
15 ;;; GNU Guix is free software; you can redistribute it and/or modify it
16 ;;; under the terms of the GNU General Public License as published by
17 ;;; the Free Software Foundation; either version 3 of the License, or (at
18 ;;; your option) any later version.
19 ;;;
20 ;;; GNU Guix is distributed in the hope that it will be useful, but
21 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
22 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 ;;; GNU General Public License for more details.
24 ;;;
25 ;;; You should have received a copy of the GNU General Public License
26 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
27
28 (define-module (gnu packages textutils)
29 #:use-module ((guix licenses) #:prefix license:)
30 #:use-module (guix packages)
31 #:use-module (guix download)
32 #:use-module (guix git-download)
33 #:use-module (guix build-system gnu)
34 #:use-module (guix build-system cmake)
35 #:use-module (guix build-system trivial)
36 #:use-module (gnu packages)
37 #:use-module (gnu packages autotools)
38 #:use-module (gnu packages ncurses)
39 #:use-module (gnu packages perl)
40 #:use-module (gnu packages pkg-config)
41 #:use-module (gnu packages python)
42 #:use-module (gnu packages readline)
43 #:use-module (gnu packages slang)
44 #:use-module (gnu packages zip))
45
;; Recode: character-set conversion library and command-line front-end,
;; built from an upstream Git snapshot.
(define-public recode
  (package
    (name "recode")
    ;; Last beta release (3.7-beta2) is from 2008; last commit from Feb 2014.
    ;; So we use that commit instead.
    (version "3.7.0.201402")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference
             (url "https://github.com/pinard/Recode.git")
             (commit "2d7092a9999194fc0e9449717a8048c8d8e26c18")))
       (sha256
        (base32 "1wssv8z6g3ryrw33sksz4rjhlnhgvvdqszw1ggl4rcwks34n86zm"))
       (file-name (string-append name "-" version "-checkout"))))
    (build-system gnu-build-system)
    (native-inputs `(("python" ,python-2)))
    (arguments
     '(#:phases
       (modify-phases %standard-phases
         (add-before 'check 'pre-check
           (lambda _
             ;; Append a 'library_dirs' entry after the 'include_dirs' line of
             ;; the test extension's setup script so that it is linked against
             ;; the library that was just built.
             (substitute* "tests/setup.py"
               (("([[:space:]]*)include_dirs=.*" all space)
                (string-append all space "library_dirs=['../src/.libs'],\n")))
             ;; The test extension 'Recode.so' lacks RUNPATH for 'librecode.so'.
             (setenv "LD_LIBRARY_PATH" (string-append (getcwd) "/src/.libs"))
             ;; Report success explicitly rather than relying on the
             ;; unspecified return value of 'setenv'.
             #t)))))
    (home-page "https://github.com/pinard/Recode")
    (synopsis "Text encoding converter")
    (description "The Recode library converts files between character sets and
usages. It recognises or produces over 200 different character sets (or about
300 if combined with an iconv library) and transliterates files between almost
any pair. When exact transliterations are not possible, it gets rid of
offending characters or falls back on approximations. The recode program is a
handy front-end to the library.")
    (license license:gpl2+)))
83
;; Enca: charset detection library (libenca) plus its command-line tool.
(define-public enca
  (package
    (name "enca")
    (version "1.16")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/nijel/enca/archive/"
                           version
                           ".tar.gz"))
       ;; The upstream archive is named after the version only, so give the
       ;; downloaded file a name that includes the package name.
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "1xik00x0yvhswsw2isnclabhv536xk1s42cf5z54gfbpbhc7ni8l"))))
    (build-system gnu-build-system)
    ;; 'test-convert-64.sh' and 'test-convert-filter.sh' both work on the same
    ;; 'test.tmp' file; running them concurrently would make them interfere,
    ;; hence the sequential test run.
    (arguments (list #:parallel-tests? #f))
    (inputs (list (list "recode" recode)))
    (synopsis "Text encoding detection tool")
    (description "Enca (Extremely Naive Charset Analyser) consists of libenca,
an encoding detection library, and enca, a command line frontend, integrating
libenca and several charset conversion libraries and tools.")
    (home-page "https://github.com/nijel/enca")
    (license license:gpl2)))
109
;; utf8proc: small C library for UTF-8 normalization and case folding.
;; The Unicode test data files are provided as inputs so the test suite does
;; not have to download them.
(define-public utf8proc
  (package
    (name "utf8proc")
    (version "2.1.0")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/JuliaLang/utf8proc/archive/v"
             version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "0q1jhdkk4f9b0zb8s2ql3sba3br5nvjsmbsaybmgj064k9hwbk15"))))
    (build-system gnu-build-system)
    (inputs                 ;test data that is otherwise downloaded with curl
     `(("NormalizationTest.txt"
        ,(origin
           (method url-fetch)
           (uri (string-append "http://www.unicode.org/Public/9.0.0/ucd/"
                               "NormalizationTest.txt"))
           (sha256
            (base32 "1fxrz0bilsbwl685336aqi88k62i6nqhm62rvy4zhg3bcm4dhj1d"))))
       ("GraphemeBreakTest.txt"
        ,(origin
           (method url-fetch)
           (uri (string-append "http://www.unicode.org/Public/9.0.0/ucd/"
                               "auxiliary/GraphemeBreakTest.txt"))
           (sha256
            (base32 "0qbhyhmf0778lc2hcwlpizrvmdxwpk959v2q2wb8abv09ba7wvn7"))))))
    (arguments
     '(#:make-flags (list "CC=gcc"
                          (string-append "prefix=" (assoc-ref %outputs "out")))
       #:phases
       (modify-phases %standard-phases
         ;; The package is built by invoking 'make' directly.
         (delete 'configure)
         (add-before 'check 'check-data
           (lambda* (#:key inputs #:allow-other-keys)
             ;; Copy each test data input into "data/" under its own name.
             (for-each (lambda (i)
                         (copy-file (assoc-ref inputs i)
                                    (string-append "data/" i)))
                       '("NormalizationTest.txt" "GraphemeBreakTest.txt"))
             ;; Replace the non-ASCII break markers with ASCII ones.
             (substitute* "data/GraphemeBreakTest.txt"
               (("÷") "/")
               (("×") "+"))
             ;; Return success explicitly instead of the unspecified value
             ;; of 'substitute*'.
             #t)))))
    (home-page "http://julialang.org/utf8proc/")
    (synopsis "C library for processing UTF-8 Unicode data")
    (description "utf8proc is a small C library that provides Unicode
normalization, case-folding, and other operations for data in the UTF-8
encoding, supporting Unicode version 9.0.0.")
    (license license:expat)))
160
;; libgtextutils: text utilities library used by the fastx toolkit.
(define-public libgtextutils
  (package
    (name "libgtextutils")
    (version "0.7")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/agordon/libgtextutils/releases/download/"
             version "/libgtextutils-" version ".tar.gz"))
       (sha256
        (base32 "0jiybkb2z58wa2msvllnphr4js2hvjvh988pavb3mzkgr6ihwbkr"))))
    (build-system gnu-build-system)
    (arguments
     '(#:phases
       ;; Use 'modify-phases' (as the other packages in this module do)
       ;; instead of the older 'alist-cons-after' procedure.
       (modify-phases %standard-phases
         (add-after 'unpack 'autoreconf
           ;; Regenerate the build system; the phase succeeds only when
           ;; 'autoreconf' exits with status 0.
           (lambda _ (zero? (system* "autoreconf" "-vif")))))))
    ;; Tools needed by the 'autoreconf' phase.
    (native-inputs
     `(("autoconf" ,autoconf)
       ("automake" ,automake)
       ("libtool" ,libtool)))
    (home-page "https://github.com/agordon/libgtextutils")
    (synopsis "Gordon's text utils library")
    (description
     "libgtextutils is a text utilities library used by the fastx toolkit from
the Hannon Lab.")
    (license license:agpl3+)))
190
;; CityHash: non-cryptographic string hash functions, built from a Git
;; snapshot identified by COMMIT.
(define-public cityhash
  (let ((commit "8af9b8c"))
    (package
      (name "cityhash")
      (version (string-append "1.1-2." commit))
      (source (origin
                (method git-fetch)
                (uri (git-reference
                      (url "https://github.com/google/cityhash.git")
                      (commit commit)))
                ;; A 'git-fetch' origin yields a directory checkout, not a
                ;; tarball, so name it accordingly.
                (file-name (string-append name "-" version "-checkout"))
                (sha256
                 (base32
                  "0n6skf5dv8yfl1ckax8dqhvsbslkwc9158zf2ims0xqdvzsahbi6"))))
      (build-system gnu-build-system)
      (arguments
       '(#:make-flags (list "CXXFLAGS=-g -O3")
         #:phases
         (modify-phases %standard-phases
           ;; citycrc is not installed by default but is used by some
           ;; programs.
           (add-after 'install 'install-citycrc
             (lambda* (#:key outputs #:allow-other-keys)
               (let* ((out (assoc-ref outputs "out"))
                      (include (string-append out "/include")))
                 ;; Install the header next to the other public headers.
                 (install-file "src/citycrc.h" include))
               #t)))))
      (home-page "https://github.com/google/cityhash")
      (synopsis "C++ hash functions for strings")
      (description
       "CityHash provides hash functions for strings. The functions mix the
input bits thoroughly but are not suitable for cryptography.")
      (license license:expat))))
224
;; Ustr: C string library designed for very low memory overhead.
(define-public ustr
  (package
    (name "ustr")
    (version "1.0.4")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://www.and.org/ustr/"
                           version "/ustr-" version ".tar.bz2"))
       (sha256
        (base32 "1i623ygdj7rkizj7985q9d6vj5amwg686aqb5j3ixpkqkyp6xbrx"))))
    (build-system gnu-build-system)
    (arguments
     '(#:phases
       (modify-phases %standard-phases
         ;; There is no configure script to run.
         (delete 'configure)
         (add-after 'unpack 'disable-check-for-stdint
           (lambda _
             ;; stdint.h is available, only not under /usr/include, so turn
             ;; the file-existence test into one that always succeeds.
             (substitute* '("Makefile" "ustr-import.in")
               (("-f \"/usr/include/stdint.h\"") "-z \"\""))
             #t)))
       #:make-flags
       (list "CC=gcc"
             "HIDE="
             ;; "ldconfig" is not needed here; substitute "echo" for
             ;; "/sbin/ldconfig".
             "LDCONFIG=echo"
             (string-append "prefix=" (assoc-ref %outputs "out"))
             "all-shared")))
    (synopsis "String library with very low memory overhead")
    (description
     "Ustr is a string library for C with very low memory overhead.")
    (home-page "http://www.and.org/ustr/")
    ;; The home page states: "The License for the code is MIT, new-BSD,
    ;; LGPL, etc. ... if you need another license to help compatibility, just
    ;; ask for it. It's basically public domain, without all the legal
    ;; problems for everyone that trying to make something public domain
    ;; entails."
    (license license:public-domain)))
267
;; Libconfig: library for reading and writing structured configuration files.
(define-public libconfig
  (package
    (name "libconfig")
    (version "1.5")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://www.hyperrealm.com/libconfig/libconfig-"
                           version
                           ".tar.gz"))
       (sha256
        (base32 "1xh3hzk63v4y8815lc5209m3s6ms2cpgw4h5hg462i4f1lwsl7g3"))))
    ;; Built with the standard GNU build system; no extra arguments.
    (build-system gnu-build-system)
    (synopsis "C/C++ configuration file library")
    (description
     "Libconfig is a simple library for manipulating structured configuration
files. This file format is more compact and more readable than XML. And
unlike XML, it is type-aware, so it is not necessary to do string parsing in
application code.")
    (home-page "http://www.hyperrealm.com/libconfig/")
    (license license:lgpl2.1+)))
288
;; pfff: file fingerprinting based on random sampling of the file contents.
(define-public pfff
  (package
    (name "pfff")
    (version "1.0")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/pfff/pfff/archive/v" version
                           ".tar.gz"))
       (sha256
        (base32 "00m553aa277iarxj6dalmklyb64r7ias49bfwzbacsfg8h3kar8m"))
       ;; The upstream archive name carries only the version; store it under
       ;; a name that also includes the package name.
       (file-name (string-append name "-" version ".tar.gz"))))
    (build-system cmake-build-system)
    (synopsis "Probabilistic fast file fingerprinting tool")
    (description
     "pfff is a tool for calculating a compact digital fingerprint of a file
by sampling randomly from the file instead of reading it in full.
Consequently, the computation has a flat performance characteristic,
correlated with data variation rather than file size. pfff can be as reliable
as existing hashing techniques, with provably negligible risk of collisions.")
    (home-page "http://biit.cs.ut.ee/pfff/")
    (license license:bsd-3)))
311
;; Oniguruma: regular expression library with per-regex character encodings.
(define-public oniguruma
  (package
    (name "oniguruma")
    (version "5.9.6")
    (source
     (origin
       (method url-fetch)
       ;; Release tarballs live under the "v<version>" download directory.
       (uri (string-append
             "https://github.com/kkos/oniguruma/releases/download/v"
             version "/onig-" version ".tar.gz"))
       (sha256
        (base32 "19s79vsclqn170mw0ajwv7j37qsbn4f1yjz3yavnhvva6c820r6m"))))
    (build-system gnu-build-system)
    (synopsis "Regular expression library")
    (description "Oniguruma is a regular expressions library. The special
characteristic of this library is that different character encoding for every
regular expression object can be specified.")
    (home-page "https://github.com/kkos/oniguruma")
    (license license:bsd-2)))
331
;; Antiword: reader and converter for Microsoft Word documents.
(define-public antiword
  (package
    (name "antiword")
    (version "0.37")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://www.winfield.demon.nl/linux/antiword-"
                           version ".tar.gz"))
       (patches (search-patches "antiword-CVE-2014-8123.patch"))
       (sha256
        (base32 "1b7mi1l20jhj09kyh0bq14qzz8vdhhyf35gzwsq43mn6rc7h0b4f"))))
    (build-system gnu-build-system)
    (arguments
     '(#:phases
       (modify-phases %standard-phases
         (delete 'configure)
         ;; Run the Makefile's "global_install" target, passing along the
         ;; same flags that were used for the build.
         (replace 'install
           (lambda* (#:key make-flags #:allow-other-keys)
             (zero? (apply system* "make" "global_install" make-flags)))))
       #:make-flags
       (let ((out (assoc-ref %outputs "out")))
         ;; Build with the Linux Makefile and install into the output
         ;; directory.
         (list "-f" "Makefile.Linux"
               (string-append "GLOBAL_INSTALL_DIR=" out "/bin")
               (string-append "GLOBAL_RESOURCES_DIR=" out "/share/antiword")))
       ;; The package has no test suite.
       #:tests? #f))
    (synopsis "Microsoft Word document reader")
    (description "Antiword is an application for displaying Microsoft Word
documents. It can also convert the document to PostScript or XML. Only
documents made by MS Word version 2 and version 6 or later are supported. The
name comes from: \"The antidote against people who send Microsoft Word files
to everybody, because they believe that everybody runs Windows and therefore
runs Word\".")
    (home-page "http://www.winfield.demon.nl/")
    (license license:gpl2+)))
368
;; UTF8-CPP: C++ library for handling UTF-8 text.  There is nothing to
;; compile: the builder unpacks the zip archive into the output and moves
;; the documentation and sources to their conventional locations.
(define-public utfcpp
  (package
    (name "utfcpp")
    (version "2.3.4")
    (source (origin
              (method url-fetch)
              (uri
               (string-append
                "mirror://sourceforge/utfcpp/utf8cpp_2x/Release%20"
                version "/utf8_v"
                ;; The archive name uses underscores where the version has
                ;; dots, e.g. "2_3_4".
                (string-map (lambda (x) (if (char=? x #\.) #\_ x)) version)
                ".zip"))
              (file-name (string-append name "-" version ".zip"))
              (sha256
               (base32
                "1vqhs0aipcvvdrwcs7h3jsryg6mgbmc4s34n5cm6d36q4nxwwwrk"))))
    (build-system trivial-build-system)
    (arguments
     `(#:modules ((guix build utils))
       #:builder
       (begin
         (use-modules (guix build utils))
         (let ((source (assoc-ref %build-inputs "source"))
               (out (assoc-ref %outputs "out"))
               (unzip (string-append (assoc-ref %build-inputs "unzip")
                                     "/bin/unzip")))
           (mkdir-p out)
           (with-directory-excursion out
             ;; Fail right away, with a clear message, when extraction does
             ;; not succeed instead of ignoring the exit status.
             (unless (zero? (system* unzip source))
               (error "failed to extract archive" source))
             (mkdir-p "share/doc")
             (rename-file "doc" "share/doc/utfcpp")
             (rename-file "source" "include")
             ;; Report success explicitly.
             #t)))))
    (native-inputs `(("unzip" ,unzip)))
    (home-page "https://github.com/nemtrif/utfcpp")
    (synopsis "Portable C++ library for handling UTF-8")
    (description "UTF8-CPP is a C++ library for handling UTF-8 encoded text
in a portable way.")
    (license license:boost1.0)))
407
;; dbacl: Bayesian text and email classifier.  Two documentation files and
;; one test are removed before building; see the license comment below.
(define-public dbacl
  (package
    (name "dbacl")
    (version "1.14")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://www.lbreyer.com/gpl/"
                           name "-" version ".tar.gz"))
       (sha256
        (base32
         "0224g6x71hyvy7jikfxmgcwww1r5lvk0jx36cva319cb9nmrbrq7"))))
    (build-system gnu-build-system)
    (arguments
     `(#:make-flags
       (list
        (string-append "-I" (assoc-ref %build-inputs "slang")
                       "/include/slang")
        (string-append "-I" (assoc-ref %build-inputs "ncurses")
                       "/include/ncurses"))
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'delete-sample6-and-japanese
           (lambda _
             ;; Drop the two excluded files from the documentation Makefile
             ;; before deleting them, and remove the references to the
             ;; Japanese test from the test Makefiles.
             (substitute* "doc/Makefile.am"
               (("sample6.txt") "")
               (("japanese.txt") ""))
             (delete-file "doc/sample6.txt")
             (delete-file "doc/japanese.txt")
             (substitute* (list "src/tests/Makefile.am"
                                "src/tests/Makefile.in")
               (("dbacl-jap.shin") "")
               (("dbacl-jap.sh") ""))
             #t))
         (add-after 'unpack 'delete-test
           ;; See comments about the license.
           (lambda _
             (delete-file "src/tests/dbacl-jap.shin")
             #t))
         (add-after 'delete-sample6-and-japanese 'autoreconf
           ;; The Makefile.am files were edited, so regenerate the build
           ;; system.
           (lambda _
             (zero? (system* "autoreconf" "-vif"))))
         (add-after 'unpack 'fix-test-files
           (lambda _
             ;; Comment out the hard-coded PATH in the test templates and
             ;; refer to 'diff' and 'tr' by the file names found in PATH.
             ;; NOTE(review): the "diff" and "tr" patterns are unanchored and
             ;; therefore also match inside longer words -- confirm that this
             ;; is harmless for the test scripts.
             (substitute* (find-files "src/tests/" "\\.shin$")
               (("PATH=/bin:/usr/bin")
                "#PATH=/bin:/usr/bin")
               (("diff") (which "diff"))
               (("tr") (which "tr")))
             #t)))))
    (inputs
     `(("ncurses" ,ncurses)
       ("perl" ,perl)
       ("readline" ,readline)
       ("slang" ,slang)))
    (native-inputs
     `(("libtool" ,libtool)
       ("autoconf" ,autoconf)
       ("automake" ,automake)
       ("pkg-config" ,pkg-config)))
    (home-page "http://www.lbreyer.com/dbacl.html")
    (synopsis "Bayesian text and email classifier")
    (description
     "dbacl is a fast Bayesian text and email classifier. It builds a variety
of language models using maximum entropy (minimum divergence) principles, and
these can then be used to categorize input data automatically among multiple
categories.")
    ;; The software is licensed as GPLv3 or later, but
    ;; includes various sample texts in the doc dir:
    ;; - sample1.txt, sample3 and sample5.txt are in the public domain,
    ;;   by Mark Twain.
    ;; - sample2.txt, sample4.txt are in the public domain, by Aristotle.
    ;; - sample6.txt is a forwarded email, copyright unknown.
    ;;   Guix does exclude sample6.txt.
    ;; - japanese.txt is a Japanese unofficial translation of the
    ;;   GNU General Public License, (c) by the Free Software Foundation.
    ;;   Guix excludes this file.
    (license (list license:gpl3+ license:public-domain))))
487
;; dotconf: C library for creating and parsing configuration files.
(define-public dotconf
  (package
    (name "dotconf")
    (version "1.3")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/williamh/dotconf/archive/v"
                           version
                           ".tar.gz"))
       (sha256
        (base32 "0lsnh0yaw44psmx59hq94cj1932gscp5h8d3cnh05l0svr0cy7kz"))
       ;; Store the archive under a name that includes the package name.
       (file-name (string-append name "-" version ".tar.gz"))))
    (build-system gnu-build-system)
    ;; Tools needed by the 'autoreconf' phase.
    (native-inputs
     `(("autoconf" ,autoconf)
       ("automake" ,automake)
       ("libtool" ,libtool)))
    (arguments
     '(#:phases
       (modify-phases %standard-phases
         ;; Generate the build system before 'configure' runs; the phase
         ;; succeeds only when 'autoreconf' exits with status 0.
         (add-before 'configure 'autoreconf
           (lambda _
             (zero? (system* "autoreconf" "-vif")))))
       #:tests? #f))                    ; FIXME maketest.sh does not work.
    (synopsis "Configuration file parser library")
    (description
     "C library for creating and parsing configuration files.")
    (home-page "https://github.com/williamh/dotconf")
    (license (list license:lgpl2.1      ; Main distribution.
                   license:asl1.1))))   ; src/readdir.{c,h}