37bd3339d316b4d87a9a507320978359cf68edf3
[jackhill/guix/guix.git] / gnu / packages / bioinformatics.scm
1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2014, 2015 Ricardo Wurmus <rekado@elephly.net>
3 ;;; Copyright © 2015 Ben Woodcroft <donttrustben@gmail.com>
4 ;;;
5 ;;; This file is part of GNU Guix.
6 ;;;
7 ;;; GNU Guix is free software; you can redistribute it and/or modify it
8 ;;; under the terms of the GNU General Public License as published by
9 ;;; the Free Software Foundation; either version 3 of the License, or (at
10 ;;; your option) any later version.
11 ;;;
12 ;;; GNU Guix is distributed in the hope that it will be useful, but
13 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;;; GNU General Public License for more details.
16 ;;;
17 ;;; You should have received a copy of the GNU General Public License
18 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19
20 (define-module (gnu packages bioinformatics)
21 #:use-module ((guix licenses) #:prefix license:)
22 #:use-module (guix packages)
23 #:use-module (guix utils)
24 #:use-module (guix download)
25 #:use-module (guix git-download)
26 #:use-module (guix build-system gnu)
27 #:use-module (guix build-system cmake)
28 #:use-module (guix build-system perl)
29 #:use-module (guix build-system python)
30 #:use-module (guix build-system ruby)
31 #:use-module (guix build-system trivial)
32 #:use-module (gnu packages)
33 #:use-module (gnu packages algebra)
34 #:use-module (gnu packages base)
35 #:use-module (gnu packages boost)
36 #:use-module (gnu packages compression)
37 #:use-module (gnu packages cpio)
38 #:use-module (gnu packages file)
39 #:use-module (gnu packages java)
40 #:use-module (gnu packages linux)
41 #:use-module (gnu packages machine-learning)
42 #:use-module (gnu packages maths)
43 #:use-module (gnu packages ncurses)
44 #:use-module (gnu packages perl)
45 #:use-module (gnu packages pkg-config)
46 #:use-module (gnu packages popt)
47 #:use-module (gnu packages protobuf)
48 #:use-module (gnu packages python)
49 #:use-module (gnu packages ruby)
50 #:use-module (gnu packages statistics)
51 #:use-module (gnu packages tbb)
52 #:use-module (gnu packages textutils)
53 #:use-module (gnu packages vim)
54 #:use-module (gnu packages web)
55 #:use-module (gnu packages xml)
56 #:use-module (gnu packages zip)
57 #:use-module (srfi srfi-1))
58
(define-public aragorn
  ;; Package definition for ARAGORN.  The standard 'configure', 'build' and
  ;; 'install' phases are bypassed: the program is compiled with a single
  ;; direct gcc invocation and the results are copied into place by hand.
  (package
    (name "aragorn")
    (version "1.2.36")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "http://mbio-serv2.mbioekol.lu.se/ARAGORN/Downloads/aragorn"
                    version ".tgz"))
              (sha256
               (base32
                "1dg7jlz1qpqy88igjxd6ncs11ccsirb36qv1z01a0np4i4jh61mb"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ; there are no tests
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (replace 'build
           ;; Compile the versioned C source file straight into the
           ;; "aragorn" executable; the phase succeeds iff gcc exits with 0.
           (lambda _
             (let ((status (system* "gcc"
                                    "-O3"
                                    "-ffast-math"
                                    "-finline-functions"
                                    "-o"
                                    "aragorn"
                                    (string-append "aragorn" ,version ".c"))))
               (zero? status))))
         (replace 'install
           ;; Copy the executable and its man page into the "out" output.
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((prefix (assoc-ref outputs "out"))
                    (bindir (string-append prefix "/bin"))
                    (mandir (string-append prefix "/share/man/man1")))
               (mkdir-p bindir)
               (copy-file "aragorn"
                          (string-append bindir "/aragorn"))
               (mkdir-p mandir)
               (copy-file "aragorn.1"
                          (string-append mandir "/aragorn.1"))
               #t))))))
    (home-page "http://mbio-serv2.mbioekol.lu.se/ARAGORN")
    (synopsis "Detect tRNA, mtRNA and tmRNA genes in nucleotide sequences")
    (description
     "Aragorn identifies transfer RNA, mitochondrial RNA and
transfer-messenger RNA from nucleotide sequences, based on homology to known
tRNA consensus sequences and RNA structure. It also outputs the secondary
structure of the predicted RNA.")
    (license license:gpl2)))
106
(define-public bamtools
  ;; Package definition for BamTools, built with CMake against zlib.
  (package
    (name "bamtools")
    (version "2.3.0")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://github.com/pezmaster31/bamtools/archive/v"
                    version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "1brry29bw2xr2l9pqn240rkqwayg85b8qq78zk2zs6nlspk4d018"))))
    (build-system cmake-build-system)
    (inputs `(("zlib" ,zlib)))
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         (add-before 'configure 'set-ldflags
           ;; Export LDFLAGS with an rpath entry pointing at the
           ;; "lib/bamtools" directory of this package's own output.
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((prefix (assoc-ref outputs "out"))
                    (libdir (string-append prefix "/lib/bamtools")))
               (setenv "LDFLAGS"
                       (string-append "-Wl,-rpath=" libdir))))))))
    (home-page "https://github.com/pezmaster31/bamtools")
    (synopsis "C++ API and command-line toolkit for working with BAM data")
    (description
     "BamTools provides both a C++ API and a command-line toolkit for handling
BAM files.")
    (license license:expat)))
139
(define-public bedops
  ;; Package definition for BEDOPS.  The build uses the bundled copies of
  ;; jansson, zlib and bzip2 (see the FIXME below); there is no configure
  ;; step and the test suite is not run.
  (package
    (name "bedops")
    (version "2.4.14")
    (source (origin
              (method url-fetch)
              (uri (string-append "https://github.com/bedops/bedops/archive/v"
                                  version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "1kqbac547wyqma81cyky9n7mkgikjpsfd3nnmcm6hpqwanqgh10v"))))
    (build-system gnu-build-system)
    (arguments
     '(#:tests? #f
       #:make-flags (list (string-append "BINDIR=" %output "/bin"))
       #:phases
       (alist-cons-after
        'unpack 'unpack-tarballs
        (lambda _
          ;; FIXME: Bedops includes tarballs of minimally patched upstream
          ;; libraries jansson, zlib, and bzip2.  We cannot just use stock
          ;; libraries because at least one of the libraries (zlib) is
          ;; patched to add a C++ function definition (deflateInit2cpp).
          ;; Until the Bedops developers offer a way to link against system
          ;; libraries we have to build the in-tree copies of these three
          ;; libraries.

          ;; See upstream discussion:
          ;; https://github.com/bedops/bedops/issues/124

          ;; The phase result is the conjunction of the three 'tar' exit
          ;; checks, so a failed extraction now fails the phase instead of
          ;; being silently discarded.
          (and
           ;; Unpack the tarballs to benefit from shebang patching.
           (with-directory-excursion "third-party"
             (and (zero? (system* "tar" "xvf" "jansson-2.6.tar.bz2"))
                  (zero? (system* "tar" "xvf" "zlib-1.2.7.tar.bz2"))
                  (zero? (system* "tar" "xvf" "bzip2-1.0.6.tar.bz2"))))
           (begin
             ;; Disable unpacking of tarballs in Makefile.
             (substitute* "system.mk/Makefile.linux"
               (("^\tbzcat .*") "\t@echo \"not unpacking\"\n")
               (("\\./configure") "CONFIG_SHELL=bash ./configure"))
             (substitute* "third-party/zlib-1.2.7/Makefile.in"
               (("^SHELL=.*$") "SHELL=bash\n"))
             #t)))
        (alist-delete 'configure %standard-phases))))
    (home-page "https://github.com/bedops/bedops")
    (synopsis "Tools for high-performance genomic feature operations")
    (description
     "BEDOPS is a suite of tools to address common questions raised in genomic
studies---mostly with regard to overlap and proximity relationships between
data sets. It aims to be scalable and flexible, facilitating the efficient
and accurate analysis and management of large-scale genomic data.

BEDOPS provides tools that perform highly efficient and scalable Boolean and
other set operations, statistical calculations, archiving, conversion and
other management of genomic data of arbitrary scale. Tasks can be easily
split by chromosome for distributing whole-genome analyses across a
computational cluster.")
    (license license:gpl2+)))
197
(define-public bedtools
  ;; Package definition for bedtools.  There is no configure step; the
  ;; Makefile's SHELL definition is rewritten by hand and the files found
  ;; under "bin" are copied into the output manually.
  (package
    (name "bedtools")
    (version "2.24.0")
    (source (origin
              (method url-fetch)
              (uri (string-append "https://github.com/arq5x/bedtools2/archive/v"
                                  version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "0lnxrjvs3nnmb4bmskag1wg3h2hd80przz5q3xd0bvs7vyxrvpbl"))
              (patches (list (search-patch "bedtools-32bit-compilation.patch")))))
    (build-system gnu-build-system)
    (arguments
     '(#:test-target "test"
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (add-after 'unpack 'patch-makefile-SHELL-definition
           (lambda _
             ;; patch-makefile-SHELL cannot be used here as it does not
             ;; yet patch definitions with `:='.  Since changes to
             ;; patch-makefile-SHELL result in a full rebuild, features
             ;; of patch-makefile-SHELL are reimplemented here.
             (let ((shell-line (string-append "SHELL := " (which "bash")
                                              " -e \n")))
               (substitute* "Makefile"
                 (("^SHELL := .*$") shell-line)))))
         (replace 'install
           ;; Copy every file found under "bin" into the output's "bin"
           ;; directory, keeping only the base name.
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((bindir  (string-append (assoc-ref outputs "out") "/bin/"))
                    (install (lambda (file)
                               (copy-file file
                                          (string-append bindir
                                                         (basename file))))))
               (mkdir-p bindir)
               (for-each install (find-files "bin" ".*"))))))))
    (native-inputs `(("python" ,python-2)))
    (inputs `(("samtools" ,samtools)
              ("zlib" ,zlib)))
    (home-page "https://github.com/arq5x/bedtools2")
    (synopsis "Tools for genome analysis and arithmetic")
    (description
     "Collectively, the bedtools utilities are a swiss-army knife of tools for
a wide-range of genomics analysis tasks. The most widely-used tools enable
genome arithmetic: that is, set theory on the genome. For example, bedtools
allows one to intersect, merge, count, complement, and shuffle genomic
intervals from multiple files in widely-used genomic file formats such as BAM,
BED, GFF/GTF, VCF.")
    (license license:gpl2)))
248
(define-public python2-pybedtools
  ;; Package definition for pybedtools, built with Python 2 only.  bedtools
  ;; and samtools are propagated so that they are installed alongside it.
  (package
    (name "python2-pybedtools")
    (version "0.6.9")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://pypi.python.org/packages/source/p/pybedtools/pybedtools-"
                    version ".tar.gz"))
              (sha256
               (base32
                "1ldzdxw1p4y3g2ignmggsdypvqkcwqwzhdha4rbgpih048z5p4an"))))
    (build-system python-build-system)
    (arguments `(#:python ,python-2)) ; no Python 3 support
    (inputs
     `(("python-cython" ,python2-cython)
       ("python-matplotlib" ,python2-matplotlib)))
    (propagated-inputs
     `(("bedtools" ,bedtools)
       ("samtools" ,samtools)))
    (native-inputs
     `(("python-pyyaml" ,python2-pyyaml)
       ("python-nose" ,python2-nose)
       ("python-setuptools" ,python2-setuptools)))
    (home-page "https://pythonhosted.org/pybedtools/")
    (synopsis "Python wrapper for BEDtools programs")
    ;; The last sentence used to read "from with Python"; the missing "in"
    ;; has been restored.
    (description
     "pybedtools is a Python wrapper for Aaron Quinlan's BEDtools programs,
which are widely used for genomic interval manipulation or \"genome algebra\".
pybedtools extends BEDTools by offering feature-level manipulations from within
Python.")
    (license license:gpl2+)))
281
(define-public bioperl-minimal
  ;; Package definition for BioPerl.  The list of Perl inputs is bound
  ;; outside the package record so that the names of their transitive
  ;; target inputs can be computed on the host side and spliced into the
  ;; 'wrap-programs' phase.
  (let* ((perl-inputs `(("perl-module-build" ,perl-module-build)
                        ("perl-data-stag" ,perl-data-stag)
                        ("perl-libwww" ,perl-libwww)
                        ("perl-uri" ,perl-uri)))
         (transitive-inputs
          (map (compose package-name cadr)
               (delete-duplicates
                (append-map (compose package-transitive-target-inputs cadr)
                            perl-inputs)))))
    (package
      (name "bioperl-minimal")
      (version "1.6.924")
      (source
       (origin
         (method url-fetch)
         (uri (string-append "mirror://cpan/authors/id/C/CJ/CJFIELDS/BioPerl-"
                             version ".tar.gz"))
         (sha256
          (base32
           "1l3npcvvvwjlhkna9dndpfv1hklhrgva013kw96m0n1wpd37ask1"))))
      (build-system perl-build-system)
      (arguments
       `(#:phases
         (modify-phases %standard-phases
           (add-after 'install 'wrap-programs
             (lambda* (#:key outputs #:allow-other-keys)
               ;; Make sure all executables in "bin" find the required Perl
               ;; modules at runtime.  As the PERL5LIB variable contains also
               ;; the paths of native inputs, we pick the transitive target
               ;; inputs from %build-inputs.
               (let* ((out      (assoc-ref outputs "out"))
                      (scripts  (find-files (string-append out "/bin/")
                                            "\\.pl$"))
                      (own-lib  (string-append out "/lib/perl5/site_perl"))
                      (dep-dirs (map (lambda (name)
                                       (assoc-ref %build-inputs name))
                                     ',transitive-inputs))
                      (perl5lib (string-join (cons own-lib dep-dirs) ":")))
                 (for-each (lambda (script)
                             (wrap-program script
                               `("PERL5LIB" ":" prefix (,perl5lib))))
                           scripts)
                 #t))))))
      (inputs perl-inputs)
      (native-inputs
       `(("perl-test-most" ,perl-test-most)))
      (home-page "http://search.cpan.org/dist/BioPerl")
      (synopsis "Bioinformatics toolkit")
      (description
       "BioPerl is the product of a community effort to produce Perl code which
is useful in biology. Examples include Sequence objects, Alignment objects
and database searching objects. These objects not only do what they are
advertised to do in the documentation, but they also interact - Alignment
objects are made from the Sequence objects, Sequence objects have access to
Annotation and SeqFeature objects and databases, Blast objects can be
converted to Alignment objects, and so on. This means that the objects
provide a coordinated and extensible framework to do computational biology.")
      (license (package-license perl)))))
342
(define-public python-biopython
  ;; Package definition for Biopython, built with the default Python of
  ;; python-build-system.  The Python 2 variant is derived from this
  ;; definition with 'package-with-python2'.
  (package
    (name "python-biopython")
    (version "1.65")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "http://biopython.org/DIST/biopython-"
                    version ".tar.gz"))
              (sha256
               (base32
                "13m8s9jkrw40zvdp1rl709n6lmgdh4f52aann7gzr6sfp0fwhg26"))))
    (build-system python-build-system)
    (inputs
     `(("python-numpy" ,python-numpy)))
    (native-inputs
     ;; Use the setuptools that matches this package's interpreter.  This
     ;; used to reference python2-setuptools, pulling a Python 2 package
     ;; into a build that does not use Python 2.
     `(("python-setuptools" ,python-setuptools)))
    (home-page "http://biopython.org/")
    (synopsis "Tools for biological computation in Python")
    (description
     "Biopython is a set of tools for biological computation including parsers
for bioinformatics files into Python data structures; interfaces to common
bioinformatics programs; a standard sequence class and tools for performing
common operations on them; code to perform data classification; code for
dealing with alignments; code making it easy to split up parallelizable tasks
into separate processes; and more.")
    (license (license:non-copyleft "http://www.biopython.org/DIST/LICENSE"))))
370
(define-public python2-biopython
  ;; Python 2 variant of python-biopython.  The 'inputs' field is replaced
  ;; wholesale with an explicit reference to python2-numpy.
  (let ((base (package-with-python2 python-biopython)))
    (package
      (inherit base)
      (inputs
       `(("python2-numpy" ,python2-numpy))))))
375
(define-public blast+
  ;; Package definition for NCBI BLAST+.  The bundled bzip2 and zlib are
  ;; removed from the source in favour of the packaged inputs, hard-coded
  ;; tool paths in the build system are rewritten, and the result is split
  ;; over the "out", "lib" and "include" outputs.
  (package
    (name "blast+")
    (version "2.2.31")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/"
                    version "/ncbi-blast-" version "+-src.tar.gz"))
              (sha256
               (base32
                "19gq6as4k1jrgsd26158ads6h7v4jca3h4r5dzg1y0m6ya50x5ph"))
              (modules '((guix build utils)))
              (snippet
               '(begin
                  ;; Remove bundled bzip2 and zlib
                  (delete-file-recursively "c++/src/util/compress/bzip2")
                  (delete-file-recursively "c++/src/util/compress/zlib")
                  (substitute* "c++/src/util/compress/Makefile.in"
                    (("bzip2 zlib api") "api"))
                  ;; Remove useless msbuild directory
                  (delete-file-recursively
                   "c++/src/build-system/project_tree_builder/msbuild")
                  #t))))
    (build-system gnu-build-system)
    (arguments
     `(;; There are three(!) tests for this massive library, and all fail with
       ;; "unparsable timing stats".
       ;; ERR [127] --   [util/regexp] test_pcre.sh     (unparsable timing stats)
       ;; ERR [127] --   [serial/datatool] datatool.sh     (unparsable timing stats)
       ;; ERR [127] --   [serial/datatool] datatool_xml.sh     (unparsable timing stats)
       #:tests? #f
       #:out-of-source? #t
       #:parallel-build? #f             ; not supported
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'enter-dir
           (lambda _
             (chdir "c++")
             #t))
         (add-after 'enter-dir 'fix-build-system
           (lambda _
             ;; Return the replacement text for the command CMD: a fixed
             ;; invocation for "date", the absolute file name found in PATH
             ;; for anything else, or #f (after printing a warning) when the
             ;; command cannot be found.
             (define (absolute-command cmd)
               (if (string=? cmd "date")
                   ;; make call to "date" deterministic
                   "date -d @0"
                   (or (which cmd)
                       (begin
                         (format (current-error-port)
                                 "WARNING: Unable to find absolute path for ~s~%"
                                 cmd)
                         #f))))

             ;; Rewrite hardcoded paths to various tools
             (substitute* (append '("src/build-system/configure.ac"
                                    "src/build-system/configure"
                                    "scripts/common/impl/if_diff.sh"
                                    "scripts/common/impl/run_with_lock.sh"
                                    "src/build-system/Makefile.configurables.real"
                                    "src/build-system/Makefile.in.top"
                                    "src/build-system/Makefile.meta.gmake=no"
                                    "src/build-system/Makefile.meta.in"
                                    "src/build-system/Makefile.meta_l"
                                    "src/build-system/Makefile.meta_p"
                                    "src/build-system/Makefile.meta_r"
                                    "src/build-system/Makefile.mk.in"
                                    "src/build-system/Makefile.requirements"
                                    "src/build-system/Makefile.rules_with_autodep.in")
                                  (find-files "scripts/common/check" "\\.sh$"))
               (("(/usr/bin/|/bin/)([a-z][-_.a-z]*)" all dir cmd)
                ;; Leave the original text alone when CMD cannot be resolved.
                (or (absolute-command cmd) all)))

             (substitute* (find-files "src/build-system" "^config.*")
               (("LN_S=/bin/\\$LN_S") (string-append "LN_S=" (which "ln")))
               (("^PATH=.*") ""))

             ;; rewrite "/var/tmp" in check script
             (substitute* "scripts/common/check/check_make_unix.sh"
               (("/var/tmp") "/tmp"))

             ;; do not reset PATH
             (substitute* (find-files "scripts/common/impl/" "\\.sh$")
               (("^ *PATH=.*") "")
               (("action=/bin/") "action=")
               (("export PATH") ":"))
             #t))
         (add-before 'configure 'set-HOME
           ;; $HOME needs to be set at some point during the configure phase
           (lambda _
             (setenv "HOME" "/tmp")
             #t))
         (replace 'configure
           (lambda* (#:key inputs outputs #:allow-other-keys)
             (let* ((prefix     (assoc-ref outputs "out"))
                    (libdir     (string-append (assoc-ref outputs "lib")
                                               "/lib"))
                    (includedir (string-append (assoc-ref outputs "include")
                                               "/include/ncbi-tools++"))
                    (flags
                     (list (string-append "--with-build-root=" (getcwd) "/build")
                           (string-append "--prefix=" prefix)
                           (string-append "--libdir=" libdir)
                           (string-append "--includedir=" includedir)
                           (string-append "--with-bz2="
                                          (assoc-ref inputs "bzip2"))
                           (string-append "--with-z="
                                          (assoc-ref inputs "zlib"))
                           ;; Each library is built twice by default, once
                           ;; with "-static" in its name, and again
                           ;; without.
                           "--without-static"
                           "--with-dll")))
               ;; The 'configure' script doesn't recognize things like
               ;; '--enable-fast-install'.
               (zero? (apply system* "./configure.orig" flags))))))))
    (outputs '("out"       ;  19 MB
               "lib"       ; 203 MB
               "include")) ;  32 MB
    (inputs
     `(("bzip2" ,bzip2)
       ("zlib" ,zlib)))
    (native-inputs
     `(("cpio" ,cpio)))
    (home-page "http://blast.ncbi.nlm.nih.gov")
    (synopsis "Basic local alignment search tool")
    (description
     "BLAST is a popular method of performing a DNA or protein sequence
similarity search, using heuristics to produce results quickly. It also
calculates an “expect value” that estimates how many matches would have
occurred at a given score by chance, which can aid a user in judging how much
confidence to have in an alignment.")
    ;; Most of the sources are in the public domain, with the following
    ;; exceptions:
    ;;   * Expat:
    ;;     * ./c++/include/util/bitset/
    ;;     * ./c++/src/html/ncbi_menu*.js
    ;;   * Boost license:
    ;;     * ./c++/include/util/impl/floating_point_comparison.hpp
    ;;   * LGPL 2+:
    ;;     * ./c++/include/dbapi/driver/odbc/unix_odbc/
    ;;   * ASL 2.0:
    ;;     * ./c++/src/corelib/teamcity_*
    (license (list license:public-domain
                   license:expat
                   license:boost1.0
                   license:lgpl2.0+
                   license:asl2.0))))
522
(define-public bowtie
  ;; Package definition for Bowtie 2.  There is no configure step; the
  ;; Makefile is edited in a source snippet, the "allall" make target is
  ;; built, and the tests are run through upstream's Perl test driver.
  (package
    (name "bowtie")
    (version "2.2.4")
    (source (origin
              (method url-fetch)
              (uri (string-append "https://github.com/BenLangmead/bowtie2/archive/v"
                                  version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "15dnbqippwvhyh9zqjhaxkabk7lm1xbh1nvar1x4b5kwm117zijn"))
              (modules '((guix build utils)))
              (snippet
               '(substitute* "Makefile"
                  ;; 'substitute*' matches against lines that still carry
                  ;; their trailing newline, and ".*$" consumes it, so the
                  ;; replacement has to put the newline back.  Without it the
                  ;; definition is fused with the following Makefile line.
                  (("^CC = .*$") "CC = gcc\n")
                  (("^CPP = .*$") "CPP = g++\n")
                  ;; replace BUILD_HOST and BUILD_TIME for deterministic build
                  (("-DBUILD_HOST=.*") "-DBUILD_HOST=\"\\\"guix\\\"\"")
                  (("-DBUILD_TIME=.*") "-DBUILD_TIME=\"\\\"0\\\"\"")))
              (patches (list (search-patch "bowtie-fix-makefile.patch")))))
    (build-system gnu-build-system)
    (inputs `(("perl" ,perl)
              ("perl-clone" ,perl-clone)
              ("perl-test-deep" ,perl-test-deep)
              ("perl-test-simple" ,perl-test-simple)
              ("python" ,python-2)))
    (arguments
     '(#:make-flags '("allall")
       #:phases
       (alist-delete
        'configure
        (alist-replace
         'install
         ;; Copy every file whose name matches "bowtie2.*" into the
         ;; output's "bin" directory.
         (lambda* (#:key outputs #:allow-other-keys)
           (let ((bin (string-append (assoc-ref outputs "out") "/bin/")))
             (mkdir-p bin)
             (for-each (lambda (file)
                         (copy-file file (string-append bin file)))
                       (find-files "." "bowtie2.*"))
             #t))
         (alist-replace
          'check
          ;; 'system*' returns the exit status as an integer, and every
          ;; integer (including a non-zero status) counts as true in Scheme.
          ;; Compare against zero so that failing tests fail the phase.
          (lambda* (#:key outputs #:allow-other-keys)
            (zero? (system* "perl"
                            "scripts/test/simple_tests.pl"
                            "--bowtie2=./bowtie2"
                            "--bowtie2-build=./bowtie2-build")))
          %standard-phases)))))
    (home-page "http://bowtie-bio.sourceforge.net/bowtie2/index.shtml")
    (synopsis "Fast and sensitive nucleotide sequence read aligner")
    (description
     "Bowtie 2 is a fast and memory-efficient tool for aligning sequencing
reads to long reference sequences. It is particularly good at aligning reads
of about 50 up to 100s or 1,000s of characters, and particularly good at
aligning to relatively long (e.g. mammalian) genomes. Bowtie 2 indexes the
genome with an FM Index to keep its memory footprint small: for the human
genome, its memory footprint is typically around 3.2 GB. Bowtie 2 supports
gapped, local, and paired-end alignment modes.")
    (supported-systems '("x86_64-linux"))
    (license license:gpl3+)))
583
(define-public bwa
  ;; Package definition for BWA.  There is no configure step and the
  ;; executable, README and man page are installed by hand.
  (package
    (name "bwa")
    (version "0.7.12")
    (source (origin
              (method url-fetch)
              (uri (string-append "mirror://sourceforge/bio-bwa/bwa-"
                                  version ".tar.bz2"))
              (sha256
               (base32
                "1330dpqncv0px3pbhjzz1gwgg39kkcv2r9qp2xs0sixf8z8wl7bh"))))
    (build-system gnu-build-system)
    (inputs `(("zlib" ,zlib)))
    (arguments
     '(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         ;; no "configure" script
         (delete 'configure)
         (replace 'install
           ;; Create the three target directories, then copy one file into
           ;; each of them.
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out    (assoc-ref outputs "out"))
                    (bindir (string-append out "/bin"))
                    (docdir (string-append out "/share/doc/bwa"))
                    (mandir (string-append out "/share/man/man1")))
               (for-each mkdir-p (list bindir docdir mandir))
               (copy-file "bwa" (string-append bindir "/bwa"))
               (copy-file "README.md" (string-append docdir "/README.md"))
               (copy-file "bwa.1" (string-append mandir "/bwa.1"))))))))
    (home-page "http://bio-bwa.sourceforge.net/")
    (synopsis "Burrows-Wheeler sequence aligner")
    (description
     "BWA is a software package for mapping low-divergent sequences against a
large reference genome, such as the human genome. It consists of three
algorithms: BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is
designed for Illumina sequence reads up to 100bp, while the rest two for
longer sequences ranged from 70bp to 1Mbp. BWA-MEM and BWA-SW share similar
features such as long-read support and split alignment, but BWA-MEM, which is
the latest, is generally recommended for high-quality queries as it is faster
and more accurate. BWA-MEM also has better performance than BWA-backtrack for
70-100bp Illumina reads.")
    (license license:gpl3+)))
630
(define-public python2-bx-python
  ;; Package definition for bx-python, built with Python 2.  A source
  ;; snippet strips the "distribute_setup" bootstrap from setup.py.
  (package
    (name "python2-bx-python")
    (version "0.7.2")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://pypi.python.org/packages/source/b/bx-python/bx-python-"
                    version ".tar.gz"))
              (sha256
               (base32
                "0ld49idhc5zjdvbhvjq1a2qmpjj7h5v58rqr25dzmfq7g34b50xh"))
              (modules '((guix build utils)))
              (snippet
               ;; remove dependency on outdated "distribute" module
               '(substitute* "setup.py"
                  (("^from distribute_setup import use_setuptools") "")
                  (("^use_setuptools\\(\\)") "")))))
    (build-system python-build-system)
    (native-inputs
     `(("python-nose" ,python2-nose)
       ("python-setuptools" ,python2-setuptools)))
    (inputs
     `(("python-numpy" ,python2-numpy)
       ("zlib" ,zlib)))
    (arguments
     `(#:python ,python-2
       #:tests? #f))        ;tests fail because test data are not included
    (home-page "http://bitbucket.org/james_taylor/bx-python/")
    (synopsis "Tools for manipulating biological data")
    (description
     "bx-python provides tools for manipulating biological data, particularly
multiple sequence alignments.")
    (license license:expat)))
665
(define-public clipper
  ;; Package definition for CLIPper, built with Python 2.  A source snippet
  ;; drops the "setup_requires" line from setup.py.
  (package
    (name "clipper")
    (version "0.3.0")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://github.com/YeoLab/clipper/archive/"
                    version ".tar.gz"))
              (sha256
               (base32
                "1q7jpimsqln7ic44i8v2rx2haj5wvik8hc1s2syd31zcn0xk1iyq"))
              (modules '((guix build utils)))
              (snippet
               ;; remove unnecessary setup dependency
               '(substitute* "setup.py"
                  (("setup_requires = .*") "")))))
    (build-system python-build-system)
    (arguments
     ;; only Python 2 is supported
     `(#:python ,python-2))
    (native-inputs
     `(("python-mock" ,python2-mock)    ; for tests
       ("python-pytz" ,python2-pytz)    ; for tests
       ("python-setuptools" ,python2-setuptools)))
    (inputs
     `(("htseq" ,htseq)
       ("python-pybedtools" ,python2-pybedtools)
       ("python-cython" ,python2-cython)
       ("python-scikit-learn" ,python2-scikit-learn)
       ("python-matplotlib" ,python2-matplotlib)
       ("python-pysam" ,python2-pysam)
       ("python-numpy" ,python2-numpy)
       ("python-scipy" ,python2-scipy)))
    (home-page "https://github.com/YeoLab/clipper")
    (synopsis "CLIP peak enrichment recognition")
    (description
     "CLIPper is a tool to define peaks in CLIP-seq datasets.")
    (license license:gpl2)))
703
(define-public couger
  ;; Package definition for COUGER.  Nothing is compiled: the "src"
  ;; directory and the "couger" script are copied into the output, and the
  ;; script is wrapped so that it runs with the build-time PYTHONPATH.
  (package
    (name "couger")
    (version "1.8.2")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "http://couger.oit.duke.edu/static/assets/COUGER"
                    version ".zip"))
              (sha256
               (base32
                "04p2b14nmhzxw5h72mpzdhalv21bx4w9b87z0wpw0xzxpysyncmq"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (delete 'build)
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((prefix (assoc-ref outputs "out"))
                    (bindir (string-append prefix "/bin")))
               (copy-recursively "src" (string-append prefix "/src"))
               (mkdir bindir)
               ;; Add "src" directory to module lookup path.
               (substitute* "couger"
                 (("from argparse")
                  (string-append "import sys\nsys.path.append(\""
                                 prefix "\")\nfrom argparse")))
               (copy-file "couger" (string-append bindir "/couger"))
               #t)))
         (add-after 'install 'wrap-program
           (lambda* (#:key inputs outputs #:allow-other-keys)
             ;; Make sure 'couger' runs with the correct PYTHONPATH.
             (let ((script     (string-append (assoc-ref outputs "out")
                                              "/bin/couger"))
                   (pythonpath (getenv "PYTHONPATH")))
               (wrap-program script
                 `("PYTHONPATH" ":" prefix (,pythonpath)))
               #t))))))
    (native-inputs
     `(("unzip" ,unzip)))
    (inputs
     `(("python" ,python-2)
       ("python2-pillow" ,python2-pillow)
       ("python2-numpy" ,python2-numpy)
       ("python2-scipy" ,python2-scipy)
       ("python2-matplotlib" ,python2-matplotlib)))
    (propagated-inputs
     `(("r" ,r)
       ("libsvm" ,libsvm)
       ("randomjungle" ,randomjungle)))
    (home-page "http://couger.oit.duke.edu")
    (synopsis "Identify co-factors in sets of genomic regions")
    (description
     "COUGER can be applied to any two sets of genomic regions bound by
paralogous TFs (e.g., regions derived from ChIP-seq experiments) to identify
putative co-factors that provide specificity to each TF. The framework
determines the genomic targets uniquely-bound by each TF, and identifies a
small set of co-factors that best explain the in vivo binding differences
between the two TFs.

COUGER uses classification algorithms (support vector machines and random
forests) with features that reflect the DNA binding specificities of putative
co-factors. The features are generated either from high-throughput TF-DNA
binding data (from protein binding microarray experiments), or from large
collections of DNA motifs.")
    (license license:gpl3+)))
773
(define-public clustal-omega
  ;; Package definition for Clustal Omega: an unmodified gnu-build-system
  ;; build whose only declared input is argtable.
  (package
    (name "clustal-omega")
    (version "1.2.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://www.clustal.org/omega/clustal-omega-"
                           version ".tar.gz"))
       (sha256
        (base32 "02ibkx0m0iwz8nscg998bh41gg251y56cgh86bvyrii5m8kjgwqf"))))
    (build-system gnu-build-system)
    (inputs
     (list (list "argtable" argtable)))
    (home-page "http://www.clustal.org/omega/")
    (synopsis "Multiple sequence aligner for protein and DNA/RNA")
    (description
     "Clustal-Omega is a general purpose multiple sequence alignment (MSA)
program for protein and DNA/RNA. It produces high quality MSAs and is capable
of handling data-sets of hundreds of thousands of sequences in reasonable
time.")
    (license license:gpl2+)))
797
(define-public crossmap
  ;; Package definition for CrossMap, built with Python 2.  The bundled
  ;; pysam is deleted from the source and the packaged pysam is used
  ;; instead.
  (package
    (name "crossmap")
    (version "0.1.6")
    (source (origin
              (method url-fetch)
              (uri (string-append "mirror://sourceforge/crossmap/CrossMap-"
                                  version ".tar.gz"))
              (sha256
               (base32
                "163hi5gjgij6cndxlvbkp5jjwr0k4wbm9im6d2210278q7k9kpnp"))
              ;; patch has been sent upstream already
              (patches (list
                        (search-patch "crossmap-allow-system-pysam.patch")))
              (modules '((guix build utils)))
              ;; remove bundled copy of pysam
              (snippet
               '(delete-file-recursively "lib/pysam"))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'set-env
           ;; Set the environment variable CROSSMAP_USE_SYSTEM_PYSAM for
           ;; the phases that follow.
           (lambda _
             (setenv "CROSSMAP_USE_SYSTEM_PYSAM" "1"))))))
    (native-inputs
     `(("python-cython" ,python2-cython)
       ("python-nose" ,python2-nose)
       ("python-setuptools" ,python2-setuptools)))
    (inputs
     `(("python-numpy" ,python2-numpy)
       ("python-pysam" ,python2-pysam)
       ("zlib" ,zlib)))
    (home-page "http://crossmap.sourceforge.net/")
    (synopsis "Convert genome coordinates between assemblies")
    (description
     "CrossMap is a program for conversion of genome coordinates or annotation
files between different genome assemblies. It supports most commonly used
file formats including SAM/BAM, Wiggle/BigWig, BED, GFF/GTF, VCF.")
    (license license:gpl2+)))
839
(define-public cutadapt
  ;; Package definition for cutadapt.  The standard 'check' phase is
  ;; removed and a replacement is run after 'install', with the freshly
  ;; installed site-packages directory appended to PYTHONPATH.
  (package
    (name "cutadapt")
    (version "1.8")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://github.com/marcelm/cutadapt/archive/v"
                    version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "161bp87y6gd6r5bmvjpn2b1k942i3fizfpa139f0jn6jv1wcp5h5"))))
    (build-system python-build-system)
    (arguments
     ;; tests must be run after install
     `(#:phases (alist-cons-after
                 'install 'check
                 (lambda* (#:key inputs outputs #:allow-other-keys)
                   ;; Derive "MAJOR.MINOR" from the Python input's file name,
                   ;; which ends in "-MAJOR.MINOR[.PATCH]".  This used to
                   ;; take the last five characters of the file name, which
                   ;; only works for versions spelled with exactly five
                   ;; characters (e.g. "3.4.3" but not "2.7.10").
                   (let* ((python  (assoc-ref inputs "python"))
                          (version (substring python
                                              (+ 1 (string-rindex python #\-))))
                          (major+minor
                           (string-join
                            (list-head (string-split version #\.) 2)
                            ".")))
                     (setenv "PYTHONPATH"
                             (string-append
                              (getenv "PYTHONPATH")
                              ":" (assoc-ref outputs "out")
                              "/lib/python"
                              major+minor
                              "/site-packages"))
                     ;; The phase succeeds iff nosetests exits with 0.
                     (zero? (system* "nosetests" "-P" "tests"))))
                 (alist-delete 'check %standard-phases))))
    (native-inputs
     `(("python-cython" ,python-cython)
       ("python-nose" ,python-nose)
       ("python-setuptools" ,python-setuptools)))
    (home-page "https://code.google.com/p/cutadapt/")
    (synopsis "Remove adapter sequences from nucleotide sequencing reads")
    (description
     "Cutadapt finds and removes adapter sequences, primers, poly-A tails and
other types of unwanted sequence from high-throughput sequencing reads.")
    (license license:expat)))
879
;; deepTools, built against Python 2.  The libraries it uses are propagated
;; inputs.
(define-public deeptools
  (package
    (name "deeptools")
    (version "1.5.11")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/fidelram/deepTools/archive/"
                           version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "1kaagygcbvjs9sxd9cqmskd02wcfp9imvb735r087w7hwqpvz6fs"))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2))
    (propagated-inputs
     `(("python-scipy" ,python2-scipy)
       ("python-numpy" ,python2-numpy)
       ("python-matplotlib" ,python2-matplotlib)
       ("python-bx-python" ,python2-bx-python)
       ("python-pysam" ,python2-pysam)))
    (native-inputs
     ;; python-mock and python-pytz are for tests.
     `(("python-mock" ,python2-mock)
       ("python-pytz" ,python2-pytz)
       ("python-setuptools" ,python2-setuptools)))
    (home-page "https://github.com/fidelram/deepTools")
    (synopsis "Tools for normalizing and visualizing deep-sequencing data")
    (description
     "DeepTools addresses the challenge of handling the large amounts of data
that are now routinely generated from DNA sequencing centers. To do so,
deepTools contains useful modules to process the mapped reads data to create
coverage files in standard bedGraph and bigWig file formats. By doing so,
deepTools allows the creation of normalized coverage files or the comparison
between two files (for example, treatment and control). Finally, using such
normalized and standardized files, multiple visualizations can be created to
identify enrichments with functional annotations of the genome.")
    (license license:gpl3+)))
918
;; DIAMOND: built from the "src" subdirectory with no configure step; the
;; resulting executable is copied by hand into the output's "bin" directory.
(define-public diamond
  (package
    (name "diamond")
    (version "0.7.9")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/bbuchfink/diamond/archive/v"
                           version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "0hfkcfv9f76h5brbyw9fyvmc0l9cmbsxrcdqk0fa9xv82zj47p15"))
       ;; Delete the "bin/diamond" file shipped in the tarball (presumably a
       ;; pre-built executable).
       (snippet '(begin
                   (delete-file "bin/diamond")
                   #t))))
    (build-system gnu-build-system)
    (arguments
     '(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         ;; Later phases run from within "src".
         (add-after 'unpack 'enter-source-dir
           (lambda _
             (chdir "src")
             #t))
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out    (assoc-ref outputs "out"))
                    (bindir (string-append out "/bin")))
               (mkdir-p bindir)
               ;; The executable is read from "bin", a sibling of "src".
               (copy-file "../bin/diamond"
                          (string-append bindir "/diamond"))
               #t))))))
    (native-inputs
     `(("bc" ,bc)))
    (inputs
     `(("boost" ,boost)
       ("zlib" ,zlib)))
    (home-page "https://github.com/bbuchfink/diamond")
    (synopsis "Accelerated BLAST compatible local sequence aligner")
    (description
     "DIAMOND is a BLAST-compatible local aligner for mapping protein and
translated DNA query sequences against a protein reference database (BLASTP
and BLASTX alignment mode). The speedup over BLAST is up to 20,000 on short
reads at a typical sensitivity of 90-99% relative to BLAST depending on the
data and settings.")
    (license (license:non-copyleft "file://src/COPYING"
                                   "See src/COPYING in the distribution."))))
968
;; Entrez Direct: nothing is configured or built; "edirect.pl" is copied into
;; the output and wrapped so that PERL5LIB is set when it runs.
(define-public edirect
  (package
    (name "edirect")
    (version "2.50")
    (source
     (origin
       (method url-fetch)
       ;; Note: older versions are not retained.
       (uri "ftp://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/edirect.zip")
       (sha256
        (base32 "08afhz2ph66h8h381hl1mqyxkdi5nbvzsyj9gfw3jfbdijnpi4qj"))))
    (build-system perl-build-system)
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (delete 'build)
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out    (assoc-ref outputs "out"))
                    (bindir (string-append out "/bin")))
               (mkdir-p bindir)
               (copy-file "edirect.pl"
                          (string-append bindir "/edirect.pl"))
               #t)))
         (add-after 'install 'wrap-program
           (lambda* (#:key inputs outputs #:allow-other-keys)
             ;; Make sure 'edirect.pl' finds all perl inputs at runtime: the
             ;; PERL5LIB value of the build environment is prefixed to
             ;; PERL5LIB by the wrapper.
             (let* ((script   (string-append (assoc-ref outputs "out")
                                             "/bin/edirect.pl"))
                    (perl5lib (getenv "PERL5LIB")))
               (wrap-program script
                 `("PERL5LIB" ":" prefix (,perl5lib)))))))))
    (inputs
     `(("perl-html-parser" ,perl-html-parser)
       ("perl-encode-locale" ,perl-encode-locale)
       ("perl-file-listing" ,perl-file-listing)
       ("perl-html-tagset" ,perl-html-tagset)
       ("perl-html-tree" ,perl-html-tree)
       ("perl-http-cookies" ,perl-http-cookies)
       ("perl-http-date" ,perl-http-date)
       ("perl-http-message" ,perl-http-message)
       ("perl-http-negotiate" ,perl-http-negotiate)
       ("perl-lwp-mediatypes" ,perl-lwp-mediatypes)
       ("perl-lwp-protocol-https" ,perl-lwp-protocol-https)
       ("perl-net-http" ,perl-net-http)
       ("perl-uri" ,perl-uri)
       ("perl-www-robotrules" ,perl-www-robotrules)
       ("perl" ,perl)))
    (native-inputs
     ;; The source is a zip archive.
     `(("unzip" ,unzip)))
    (home-page "http://www.ncbi.nlm.nih.gov/books/NBK179288")
    (synopsis "Tools for accessing the NCBI's set of databases")
    (description
     "Entrez Direct (EDirect) is a method for accessing the National Center
for Biotechnology Information's (NCBI) set of interconnected
databases (publication, sequence, structure, gene, variation, expression,
etc.) from a terminal. Functions take search terms from command-line
arguments. Individual operations are combined to build multi-step queries.
Record retrieval and formatting normally complete the process.

EDirect also provides an argument-driven function that simplifies the
extraction of data from document summaries or other results that are returned
in structured XML format. This can eliminate the need for writing custom
software to answer ad hoc questions.")
    (license license:public-domain)))
1036
;; eXpress: the CMake files are patched after unpacking to turn off static
;; Boost linking and to point at the "bamtools" input instead of the
;; "bamtools" directory inside the source tree.
(define-public express
  (package
    (name "express")
    (version "1.5.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "http://bio.math.berkeley.edu/eXpress/downloads/express-"
             version "/express-" version "-src.tgz"))
       (sha256
        (base32 "03rczxd0gjp2l1jxcmjfmf5j94j77zqyxa6x063zsc585nj40n0c"))))
    (build-system cmake-build-system)
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'use-shared-boost-libs-and-set-bamtools-paths
           (lambda* (#:key inputs #:allow-other-keys)
             (let ((bamtools (assoc-ref inputs "bamtools")))
               (substitute* "CMakeLists.txt"
                 (("set\\(Boost_USE_STATIC_LIBS ON\\)")
                  "set(Boost_USE_STATIC_LIBS OFF)")
                 ;; Header directory of the bamtools input.
                 (("\\$\\{CMAKE_CURRENT_SOURCE_DIR\\}/bamtools/include")
                  (string-append bamtools "/include/bamtools")))
               (substitute* "src/CMakeLists.txt"
                 ;; Library directory of the bamtools input.
                 (("\\$\\{CMAKE_CURRENT_SOURCE_DIR\\}/\\.\\./bamtools/lib")
                  (string-append bamtools "/lib/bamtools")))
               #t))))))
    (inputs
     `(("boost" ,boost)
       ("bamtools" ,bamtools)
       ("protobuf" ,protobuf)
       ("zlib" ,zlib)))
    (home-page "http://bio.math.berkeley.edu/eXpress")
    (synopsis "Streaming quantification for high-throughput genomic sequencing")
    (description
     "eXpress is a streaming tool for quantifying the abundances of a set of
target sequences from sampled subsequences. Example applications include
transcript-level RNA-Seq quantification, allele-specific/haplotype expression
analysis (from RNA-Seq), transcription factor binding quantification in
ChIP-Seq, and analysis of metagenomic data.")
    (license license:artistic2.0)))
1081
;; FastTree: the source is a single C file, so there is nothing to unpack or
;; configure.  It is compiled twice with gcc, once plain ("FastTree") and
;; once with OpenMP enabled ("FastTreeMP"), and both programs are installed.
(define-public fasttree
  (package
    (name "fasttree")
    (version "2.1.8")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "http://www.microbesonline.org/fasttree/FastTree-"
             version ".c"))
       (sha256
        (base32 "0dzqc9vr9iiiw21y159xfjl2z90vw0y7r4x6456pcaxiy5hd2wmi"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ; no "check" target
       #:phases
       (modify-phases %standard-phases
         (delete 'unpack)
         (delete 'configure)
         (replace 'build
           (lambda* (#:key source #:allow-other-keys)
             ;; BUILD-WITH compiles SOURCE into PROGRAM; EXTRA-FLAGS are
             ;; placed before the flags common to both builds.
             (let ((build-with
                    (lambda (program . extra-flags)
                      (zero? (apply system* "gcc"
                                    (append extra-flags
                                            (list "-O3"
                                                  "-finline-functions"
                                                  "-funroll-loops"
                                                  "-Wall"
                                                  "-o" program
                                                  source
                                                  "-lm")))))))
               ;; The OpenMP variant is only built if the plain one succeeds.
               (and (build-with "FastTree")
                    (build-with "FastTreeMP" "-DOPENMP" "-fopenmp")))))
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((bindir (string-append (assoc-ref outputs "out")
                                          "/bin")))
               (mkdir-p bindir)
               (for-each (lambda (program)
                           (copy-file program
                                      (string-append bindir "/" program)))
                         '("FastTree" "FastTreeMP"))
               #t))))))
    (home-page "http://www.microbesonline.org/fasttree")
    (synopsis "Infers approximately-maximum-likelihood phylogenetic trees")
    (description
     "FastTree can handle alignments with up to a million of sequences in a
reasonable amount of time and memory. For large alignments, FastTree is
100-1,000 times faster than PhyML 3.0 or RAxML 7.")
    (license license:gpl2+)))
1140
;; FASTX-Toolkit: built with the GNU build system and no custom arguments;
;; libgtextutils is an input and pkg-config a native input.
(define-public fastx-toolkit
  (package
    (name "fastx-toolkit")
    (version "0.0.14")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/agordon/fastx_toolkit/releases/download/"
             version "/fastx_toolkit-" version ".tar.bz2"))
       (sha256
        (base32 "01jqzw386873sr0pjp1wr4rn8fsga2vxs1qfmicvx1pjr72007wy"))))
    (build-system gnu-build-system)
    (inputs
     `(("libgtextutils" ,libgtextutils)))
    (native-inputs
     `(("pkg-config" ,pkg-config)))
    (home-page "http://hannonlab.cshl.edu/fastx_toolkit/")
    (synopsis "Tools for FASTA/FASTQ file preprocessing")
    (description
     "The FASTX-Toolkit is a collection of command line tools for Short-Reads
FASTA/FASTQ files preprocessing.

Next-Generation sequencing machines usually produce FASTA or FASTQ files,
containing multiple short-reads sequences. The main processing of such
FASTA/FASTQ files is mapping the sequences to reference genomes. However, it
is sometimes more productive to preprocess the files before mapping the
sequences to the genome---manipulating the sequences to produce better mapping
results. The FASTX-Toolkit tools perform some of these preprocessing tasks.")
    (license license:agpl3+)))
1172
;; Flexbar: CMake is told to place the executable directly in the output's
;; "bin" directory, so the 'install phase is deleted.  The 'check phase runs
;; the validation script from the source tree's "test" directory with that
;; "bin" directory prepended to PATH.
(define-public flexbar
  (package
    (name "flexbar")
    (version "2.5")
    (source (origin
              (method url-fetch)
              (uri
               (string-append "mirror://sourceforge/flexbar/"
                              version "/flexbar_v" version "_src.tgz"))
              (sha256
               (base32
                "13jaykc3y1x8y5nn9j8ljnb79s5y51kyxz46hdmvvjj6qhyympmf"))))
    (build-system cmake-build-system)
    (arguments
     `(#:configure-flags (list
                          (string-append "-DFLEXBAR_BINARY_DIR="
                                         (assoc-ref %outputs "out")
                                         "/bin/"))
       #:phases
       (alist-replace
        'check
        (lambda* (#:key outputs #:allow-other-keys)
          (setenv "PATH" (string-append
                          (assoc-ref outputs "out") "/bin:"
                          (getenv "PATH")))
          ;; The source directory name embeds the version; derive it from
          ;; the package version instead of hard-coding it, so that a
          ;; version bump does not leave a stale path behind.
          (chdir (string-append "../flexbar_v" ,version "_src/test"))
          (zero? (system* "bash" "flexbar_validate.sh")))
        (alist-delete 'install %standard-phases))))
    (inputs
     `(("tbb" ,tbb)
       ("zlib" ,zlib)))
    (native-inputs
     `(("pkg-config" ,pkg-config)
       ("seqan" ,seqan)))
    (home-page "http://flexbar.sourceforge.net")
    (synopsis "Barcode and adapter removal tool for sequencing platforms")
    (description
     "Flexbar preprocesses high-throughput nucleotide sequencing data
efficiently. It demultiplexes barcoded runs and removes adapter sequences.
Moreover, trimming and filtering features are provided. Flexbar increases
read mapping rates and improves genome and transcriptome assemblies. It
supports next-generation sequencing data in fasta/q and csfasta/q format from
Illumina, Roche 454, and the SOLiD platform.")
    (license license:gpl3)))
1217
;; GRIT, built against Python 2.  After unpacking, two generated C files are
;; deleted and "setup.py" is patched.
(define-public grit
  (package
    (name "grit")
    (version "2.0.2")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/nboley/grit/archive/"
                           version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "157in84dj70wimbind3x7sy1whs3h57qfgcnj2s6lrd38fbrb7mj"))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'generate-from-cython-sources
           (lambda* (#:key inputs #:allow-other-keys)
             (let ((numpy-include-dir
                    (string-append
                     (assoc-ref inputs "python-numpy")
                     "/lib/python2.7/site-packages/numpy/core/include/")))
               ;; Delete these C files to force fresh generation from pyx
               ;; sources.
               (for-each delete-file
                         '("grit/sparsify_support_fns.c"
                           "grit/call_peaks_support_fns.c"))
               (substitute* "setup.py"
                 (("Cython.Setup") "Cython.Build")
                 ;; Add numpy include path to fix compilation
                 (("pyx\", \\]")
                  (string-append "pyx\", ], include_dirs = ['"
                                 numpy-include-dir
                                 "']")))
               #t))))))
    (inputs
     `(("python-scipy" ,python2-scipy)
       ("python-numpy" ,python2-numpy)
       ("python-pysam" ,python2-pysam)
       ("python-networkx" ,python2-networkx)))
    (native-inputs
     `(("python-cython" ,python2-cython)
       ("python-setuptools" ,python2-setuptools)))
    (home-page "http://grit-bio.org")
    (synopsis "Tool for integrative analysis of RNA-seq type assays")
    (description
     "GRIT is designed to use RNA-seq, TES, and TSS data to build and quantify
full length transcript models. When none of these data sources are available,
GRIT can be run by providing a candidate set of TES or TSS sites. In
addition, GRIT can merge in reference junctions and gene boundaries. GRIT can
also be run in quantification mode, where it uses a provided GTF file and just
estimates transcript expression.")
    (license license:gpl3+)))
1268
;; HISAT: no configure step.  The Makefile and two scripts are patched after
;; unpacking, and the install phase copies every file whose name matches the
;; hisat executable pattern into the output's "bin" directory.
(define-public hisat
  (package
    (name "hisat")
    (version "0.1.4")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "http://ccb.jhu.edu/software/hisat/downloads/hisat-"
             version "-beta-source.zip"))
       (sha256
        (base32 "1k381ydranqxp09yf2y7w1d0chz5d59vb6jchi89hbb0prq19lk5"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ;no check target
       #:make-flags '("allall"
                      ;; Disable unsupported `popcnt' instructions on
                      ;; architectures other than x86_64
                      ,@(if (string-prefix? "x86_64"
                                            (or (%current-target-system)
                                                (%current-system)))
                            '()
                            '("POPCNT_CAPABILITY=0")))
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((bindir (string-append (assoc-ref outputs "out")
                                          "/bin/")))
               (mkdir-p bindir)
               (for-each
                (lambda (executable)
                  (copy-file executable
                             (string-append bindir executable)))
                (find-files
                 "."
                 "hisat(-(build|align|inspect)(-(s|l)(-debug)*)*)*$")))))
         (add-after 'unpack 'patch-sources
           (lambda _
             ;; XXX Cannot use snippet because zip files are not supported
             (substitute* "Makefile"
               (("^CC = .*$") "CC = gcc")
               (("^CPP = .*$") "CPP = g++")
               ;; replace BUILD_HOST and BUILD_TIME for deterministic build
               (("-DBUILD_HOST=.*") "-DBUILD_HOST=\"\\\"guix\\\"\"")
               (("-DBUILD_TIME=.*") "-DBUILD_TIME=\"\\\"0\\\"\""))
             ;; Use the "env" found on PATH in place of /usr/bin/env.
             (substitute* '("hisat-build" "hisat-inspect")
               (("/usr/bin/env") (which "env"))))))))
    (native-inputs
     `(("unzip" ,unzip)))
    (inputs
     `(("perl" ,perl)
       ("python" ,python)
       ("zlib" ,zlib)))
    (home-page "http://ccb.jhu.edu/software/hisat/index.shtml")
    (synopsis "Hierarchical indexing for spliced alignment of transcripts")
    (description
     "HISAT is a fast and sensitive spliced alignment program for mapping
RNA-seq reads. In addition to one global FM index that represents a whole
genome, HISAT uses a large set of small FM indexes that collectively cover the
whole genome. These small indexes (called local indexes) combined with
several alignment strategies enable effective alignment of RNA-seq reads, in
particular, reads spanning multiple exons.")
    (license license:gpl3+)))
1333
;; HMMER: built with the GNU build system and no custom arguments.  The
;; download URL contains both the one-component version prefix and the full
;; version.
(define-public hmmer
  (package
    (name "hmmer")
    (version "3.1b2")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://selab.janelia.org/software/hmmer"
                           (version-prefix version 1) "/"
                           version "/hmmer-" version ".tar.gz"))
       (sha256
        (base32 "0djmgc0pfli0jilfx8hql1axhwhqxqb8rxg2r5rg07aw73sfs5nx"))))
    (build-system gnu-build-system)
    (native-inputs
     `(("perl" ,perl)))
    (home-page "http://hmmer.janelia.org")
    (synopsis "Biosequence analysis using profile hidden Markov models")
    (description
     "HMMER is used for searching sequence databases for homologs of protein
sequences, and for making protein sequence alignments. It implements methods
using probabilistic models called profile hidden Markov models (profile
HMMs).")
    (license (list license:gpl3+
                   ;; The bundled library 'easel' is distributed
                   ;; under The Janelia Farm Software License.
                   (license:non-copyleft
                    "file://easel/LICENSE"
                    "See easel/LICENSE in the distribution.")))))
1362
;; HTSeq, fetched from PyPI and built against Python 2.
(define-public htseq
  (package
    (name "htseq")
    (version "0.6.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://pypi.python.org/packages/source/H/HTSeq/HTSeq-"
             version ".tar.gz"))
       (sha256
        (base32 "1i85ppf2j2lj12m0x690qq5nn17xxk23pbbx2c83r8ayb5wngzwv"))))
    (build-system python-build-system)
    (arguments
     ;; only Python 2 is supported
     `(#:python ,python-2))
    (inputs
     `(("python-numpy" ,python2-numpy)
       ("python-setuptools" ,python2-setuptools)))
    (home-page "http://www-huber.embl.de/users/anders/HTSeq/")
    (synopsis "Analysing high-throughput sequencing data with Python")
    (description
     "HTSeq is a Python package that provides infrastructure to process data
from high-throughput sequencing assays.")
    (license license:gpl3+)))
1386
;; HTSJDK: built with ant rather than make.  The 'configure, 'install and
;; 'check phases are removed and the 'build phase runs "ant all" with the
;; "dist" property pointing into the output, so the build step also places
;; the results there.
(define-public htsjdk
  (package
    (name "htsjdk")
    (version "1.129")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://github.com/samtools/htsjdk/archive/"
                    version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "0asdk9b8jx2ij7yd6apg9qx03li8q7z3ml0qy2r2qczkra79y6fw"))
              (modules '((guix build utils)))
              ;; remove build dependency on git
              (snippet '(substitute* "build.xml"
                          (("failifexecutionfails=\"true\"")
                           "failifexecutionfails=\"false\"")))))
    (build-system gnu-build-system)
    (arguments
     `(#:modules ((srfi srfi-1)
                  (guix build gnu-build-system)
                  (guix build utils))
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (delete 'install)
         (delete 'check)
         (replace 'build
           (lambda _
             ;; ant locates the JDK through JAVA_HOME.
             (setenv "JAVA_HOME" (assoc-ref %build-inputs "jdk"))
             (zero? (system* "ant" "all"
                             (string-append "-Ddist="
                                            (assoc-ref %outputs "out")
                                            "/share/java/htsjdk/"))))))))
    (native-inputs
     `(("ant" ,ant)
       ("jdk" ,icedtea6 "jdk")))
    (home-page "http://samtools.github.io/htsjdk/")
    (synopsis "Java API for high-throughput sequencing data (HTS) formats")
    (description
     "HTSJDK is an implementation of a unified Java library for accessing
common file formats, such as SAM and VCF, used for high-throughput
sequencing (HTS) data. There are also a number of useful utilities for
manipulating HTS data.")
    (license license:expat)))
1431
;; HTSlib: built with the GNU build system.  The only customisation is a
;; phase that patches the bash path used by the test script.
(define-public htslib
  (package
    (name "htslib")
    (version "1.2.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/samtools/htslib/releases/download/"
             version "/htslib-" version ".tar.bz2"))
       (sha256
        (base32 "1c32ssscbnjwfw3dra140fq7riarp2x990qxybh34nr1p5r17nxx"))))
    (build-system gnu-build-system)
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'patch-tests
           (lambda _
             ;; Replace "/bin/bash" with the bash found on PATH.
             (substitute* "test/test.pl"
               (("/bin/bash") (which "bash")))
             #t)))))
    (inputs
     `(("zlib" ,zlib)))
    (native-inputs
     `(("perl" ,perl)))
    (home-page "http://www.htslib.org")
    (synopsis "C library for reading/writing high-throughput sequencing data")
    (description
     "HTSlib is a C library for reading/writing high-throughput sequencing
data. It also provides the bgzip, htsfile, and tabix utilities.")
    ;; Files under cram/ are released under the modified BSD license;
    ;; the rest is released under the Expat license
    (license (list license:expat license:bsd-3))))
1466
;; IDR: after installation the "idr" executable is wrapped so that
;; PYTHONPATH is prefixed with the site-packages directories of scipy,
;; numpy and matplotlib.
(define-public idr
  (package
    (name "idr")
    (version "2.0.0")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/nboley/idr/archive/"
                           version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "1k3x44biak00aiv3hpm1yd6nn4hhp7n0qnbs3zh2q9sw7qr1qj5r"))))
    (build-system python-build-system)
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         (add-after 'install 'wrap-program
           (lambda* (#:key inputs outputs #:allow-other-keys)
             (let* (;; First three of the last five characters of the
                    ;; "python" input path; presumably MAJOR.MINOR of a
                    ;; path ending in "X.Y.Z" -- TODO confirm.
                    (python-version
                     (string-take
                      (string-take-right (assoc-ref inputs "python") 5)
                      3))
                    ;; Site-packages directory of the input called LABEL.
                    (site-packages
                     (lambda (label)
                       (string-append (assoc-ref inputs label)
                                      "/lib/python" python-version
                                      "/site-packages")))
                    (pythonpath
                     (string-join (map site-packages
                                       '("python-scipy"
                                         "python-numpy"
                                         "python-matplotlib"))
                                  ":"))
                    (executable
                     (string-append (assoc-ref outputs "out")
                                    "/bin/idr")))
               (wrap-program executable
                 `("PYTHONPATH" ":" prefix (,pythonpath))))
             #t)))))
    (inputs
     `(("python-scipy" ,python-scipy)
       ("python-numpy" ,python-numpy)
       ("python-matplotlib" ,python-matplotlib)))
    (native-inputs
     `(("python-cython" ,python-cython)
       ("python-setuptools" ,python-setuptools)))
    (home-page "https://github.com/nboley/idr")
    (synopsis "Tool to measure the irreproducible discovery rate (IDR)")
    (description
     "The IDR (Irreproducible Discovery Rate) framework is a unified approach
to measure the reproducibility of findings identified from replicate
experiments and provide highly stable thresholds based on reproducibility.")
    (license license:gpl3+)))
1516
;; MACS (MACS2 on PyPI), built against Python 2 with tests disabled.
(define-public macs
  (package
    (name "macs")
    (version "2.1.0.20140616")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://pypi.python.org/packages/source/M/MACS2/MACS2-"
                    version ".tar.gz"))
              (sha256
               (base32
                "11lmiw6avqhwn75sn59g4lfkrr2kk20r3rgfbx9xfqb8rg9mi2n6"))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2 ; only compatible with Python 2.7
       #:tests? #f))      ; no test target
    (inputs
     `(("python-numpy" ,python2-numpy)))
    (native-inputs
     `(("python-setuptools" ,python2-setuptools)))
    (home-page "http://github.com/taoliu/MACS/")
    (synopsis "Model based analysis for ChIP-Seq data")
    ;; The description reads "transcription factor" (not "transcript
    ;; factor"): the binding sites in question are those of transcription
    ;; factors.
    (description
     "MACS is an implementation of a ChIP-Seq analysis algorithm for
identifying transcription factor binding sites named Model-based Analysis of
ChIP-Seq (MACS). MACS captures the influence of genome complexity to evaluate
the significance of enriched ChIP regions and it improves the spatial
resolution of binding sites through combining the information of both
sequencing tag position and orientation.")
    (license license:bsd-3)))
1547
;; MAFFT: built from the "core" subdirectory with no configure step.  The
;; Makefile is patched so that mafft-profile, mafft-distance and
;; mafft-homologs.rb are not built or installed, and so that man pages are
;; not installed into the libexec directory.
(define-public mafft
  (package
    (name "mafft")
    (version "7.221")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "http://mafft.cbrc.jp/alignment/software/mafft-" version
                    "-without-extensions-src.tgz"))
              (file-name (string-append name "-" version ".tgz"))
              (sha256
               (base32
                "0xi7klbsgi049vsrk6jiwh9wfj3b770gz3c8c7zwij448v0dr73d"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f ; no automated tests, though there are tests in the read me
       #:make-flags (let ((out (assoc-ref %outputs "out")))
                      (list (string-append "PREFIX=" out)
                            (string-append "BINDIR="
                                           (string-append out "/bin"))))
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'enter-dir
           (lambda _ (chdir "core") #t))
         (add-after 'enter-dir 'patch-makefile
           (lambda _
             ;; on advice from the MAFFT authors, there is no need to
             ;; distribute mafft-profile, mafft-distance, or
             ;; mafft-homologs.rb as they are too "specialised".
             (substitute* "Makefile"
               ;; remove mafft-homologs.rb from SCRIPTS
               (("^SCRIPTS = mafft mafft-homologs.rb")
                "SCRIPTS = mafft")
               ;; remove mafft-distance from PROGS
               (("^PROGS = dvtditr dndfast7 dndblast sextet5 mafft-distance")
                "PROGS = dvtditr dndfast7 dndblast sextet5")
               ;; remove mafft-profile from PROGS.  The pattern must spell
               ;; the following program as "f2cl", exactly as the
               ;; replacement does; with "2cl" it cannot match a line that
               ;; lists "f2cl" and mafft-profile stays in PROGS.
               (("splittbfast disttbfast tbfast mafft-profile f2cl mccaskillwrap")
                "splittbfast disttbfast tbfast f2cl mccaskillwrap")
               (("^rm -f mafft-profile mafft-profile.exe") "#")
               (("^rm -f mafft-distance mafft-distance.exe") ")#")
               ;; do not install MAN pages in libexec folder
               (("^\t\\$\\(INSTALL\\) -m 644 \\$\\(MANPAGES\\) \
\\$\\(DESTDIR\\)\\$\\(LIBDIR\\)") "#"))
             #t))
         (delete 'configure))))
    (inputs
     `(("perl" ,perl)))
    (home-page "http://mafft.cbrc.jp/alignment/software/")
    (synopsis "Multiple sequence alignment program")
    (description
     "MAFFT offers a range of multiple alignment methods for nucleotide and
protein sequences. For instance, it offers L-INS-i (accurate; for alignment
of <~200 sequences) and FFT-NS-2 (fast; for alignment of <~30,000
sequences).")
    (license (license:non-copyleft
              "http://mafft.cbrc.jp/alignment/software/license.txt"
              "BSD-3 with different formatting"))))
1606
;; MetaBAT: driven by scons instead of configure/make.  Include paths are
;; rewritten from "bam/" to "samtools/", the README install rule is removed,
;; and a single "scons ... install" invocation in the 'build phase stands in
;; for the deleted 'check and 'install phases.
(define-public metabat
  (package
    (name "metabat")
    (version "0.26.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://bitbucket.org/berkeleylab/metabat/get/"
                           version ".tar.bz2"))
       (file-name (string-append name "-" version ".tar.bz2"))
       (sha256
        (base32 "0vgrhbaxg4dkxyax2kbigak7w0arhqvw0szwp6gd9wmyilc44kfa"))))
    (build-system gnu-build-system)
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'fix-includes
           (lambda _
             ;; Point header references at "samtools/" instead of "bam/".
             (substitute* "SConstruct"
               (("/include/bam/bam.h")
                "/include/samtools/bam.h"))
             (substitute* "src/BamUtils.h"
               (("^#include \"bam/bam\\.h\"")
                "#include \"samtools/bam.h\"")
               (("^#include \"bam/sam\\.h\"")
                "#include \"samtools/sam.h\""))
             (substitute* "src/KseqReader.h"
               (("^#include \"bam/kseq\\.h\"")
                "#include \"samtools/kseq.h\""))
             #t))
         (add-after 'unpack 'fix-scons
           (lambda _
             ;; Do not distribute README
             (substitute* "SConstruct"
               (("^env\\.Install\\(idir_prefix, 'README\\.md'\\)")
                ""))
             #t))
         (delete 'configure)
         (replace 'build
           (lambda* (#:key inputs outputs #:allow-other-keys)
             (let ((out (assoc-ref outputs "out")))
               ;; The output directory is created before scons runs.
               (mkdir out)
               (zero? (system* "scons"
                               (string-append "PREFIX=" out)
                               (string-append "HTSLIB_DIR="
                                              (assoc-ref inputs "htslib"))
                               (string-append "SAMTOOLS_DIR="
                                              (assoc-ref inputs "samtools"))
                               (string-append "BOOST_ROOT="
                                              (assoc-ref inputs "boost"))
                               "install")))))
         ;; check and install carried out during build phase
         (delete 'check)
         (delete 'install))))
    (inputs
     `(("zlib" ,zlib)
       ("perl" ,perl)
       ("samtools" ,samtools)
       ("htslib" ,htslib)
       ("boost" ,boost)))
    (native-inputs
     `(("scons" ,scons)))
    (home-page "https://bitbucket.org/berkeleylab/metabat")
    (synopsis
     "Reconstruction of single genomes from complex microbial communities")
    (description
     "Grouping large genomic fragments assembled from shotgun metagenomic
sequences to deconvolute complex microbial communities, or metagenome binning,
enables the study of individual organisms and their interactions. MetaBAT is
an automated metagenome binning software, which integrates empirical
probabilistic distances of genome abundance and tetranucleotide frequency.")
    (license (license:non-copyleft "file://license.txt"
                                   "See license.txt in the distribution."))))
1684
;; MISO (misopy on PyPI), built against Python 2 with tests disabled.  A
;; source snippet patches "setup.py" before the build.
(define-public miso
  (package
    (name "miso")
    (version "0.5.3")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://pypi.python.org/packages/source/m/misopy/misopy-"
             version ".tar.gz"))
       (sha256
        (base32 "0x446867az8ir0z8c1vjqffkp0ma37wm4sylixnkhgawllzx8v5w"))
       (modules '((guix build utils)))
       (snippet
        '(substitute* "setup.py"
           ;; Use setuptools, or else the executables are not
           ;; installed.
           (("distutils.core") "setuptools")
           ;; use "gcc" instead of "cc" for compilation: a
           ;; set_executables call is inserted in front of each line that
           ;; starts with "defines".
           (("^defines")
            "cc.set_executables(
compiler='gcc',
compiler_so='gcc',
linker_exe='gcc',
linker_so='gcc -shared'); defines")))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2               ; only Python 2 is supported
       #:tests? #f))                    ; no "test" target
    (inputs
     `(("samtools" ,samtools)
       ("python-numpy" ,python2-numpy)
       ("python-pysam" ,python2-pysam)
       ("python-scipy" ,python2-scipy)
       ("python-matplotlib" ,python2-matplotlib)))
    (native-inputs
     ;; python-mock and python-pytz are for tests.
     `(("python-mock" ,python2-mock)
       ("python-pytz" ,python2-pytz)
       ("python-setuptools" ,python2-setuptools)))
    (home-page "http://genes.mit.edu/burgelab/miso/index.html")
    (synopsis "Mixture of Isoforms model for RNA-Seq isoform quantitation")
    (description
     "MISO (Mixture-of-Isoforms) is a probabilistic framework that quantitates
the expression level of alternatively spliced genes from RNA-Seq data, and
identifies differentially regulated isoforms or exons across samples. By
modeling the generative process by which reads are produced from isoforms in
RNA-Seq, the MISO model uses Bayesian inference to compute the probability
that a read originated from a particular isoform.")
    (license license:gpl2)))
1734
;; OrfM: built with the GNU build system and no custom arguments; zlib is
;; its only declared input.
(define-public orfm
  (package
    (name "orfm")
    (version "0.4.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/wwood/OrfM/releases/download/v"
             version "/orfm-" version ".tar.gz"))
       (sha256
        (base32 "05fmw145snk646ly076zby0fjav0k7ysbclck5d4s9pmgcfpijc2"))))
    (build-system gnu-build-system)
    (inputs
     `(("zlib" ,zlib)))
    (home-page "https://github.com/wwood/OrfM")
    (synopsis "Simple and not slow open reading frame (ORF) caller")
    (description
     "An ORF caller finds stretches of DNA that when translated are not
interrupted by stop codons. OrfM finds and prints these ORFs.")
    (license license:lgpl3+)))
1755
;; pbcore, built against Python 2.
(define-public python2-pbcore
  (package
    (name "python2-pbcore")
    (version "0.9.3")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/PacificBiosciences/pbcore/archive/"
             version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "1z46rwjac93jm87cbj2zgjg6qvsgs65140wkbbxsvxps7ai4pm09"))))
    (build-system python-build-system)
    (arguments
     ;; pbcore requires Python 2.7
     `(#:python ,python-2))
    (inputs
     `(("python-cython" ,python2-cython)
       ("python-numpy" ,python2-numpy)
       ("python-pysam" ,python2-pysam)
       ("python-h5py" ,python2-h5py)))
    (native-inputs
     `(("python-setuptools" ,python2-setuptools)))
    (home-page "http://pacificbiosciences.github.io/pbcore/")
    (synopsis "Library for reading and writing PacBio data files")
    (description
     "The pbcore package provides Python APIs for interacting with PacBio data
files and writing bioinformatics applications.")
    (license license:bsd-3)))
1784
;; WarpedLMM is distributed on PyPI as a zip archive, hence the "unzip"
;; native input.  It is built with Python 2.
(define-public python2-warpedlmm
  (package
    (name "python2-warpedlmm")
    (version "0.21")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://pypi.python.org/packages/source/W/WarpedLMM/WarpedLMM-"
             version ".zip"))
       (sha256
        (base32
         "1agfz6zqa8nc6cw47yh0s3y14gkpa9wqazwcj7mwwj3ffnw39p3j"))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2               ; requires Python 2.7
       #:phases
       (modify-phases %standard-phases
         ;; The "bin" directory only holds wrappers for running the module
         ;; tests.  Nothing needs them once the "check" phase is over, so
         ;; they are removed from the output.
         (add-after 'install 'remove-bin-directory
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((bin (string-append (assoc-ref outputs "out") "/bin")))
               (delete-file-recursively bin)
               #t))))))
    (native-inputs
     `(("python-setuptools" ,python2-setuptools)
       ("python-mock" ,python2-mock)
       ("python-nose" ,python2-nose)
       ("unzip" ,unzip)))
    (propagated-inputs
     `(("python-scipy" ,python2-scipy)
       ("python-numpy" ,python2-numpy)
       ("python-matplotlib" ,python2-matplotlib)
       ("python-fastlmm" ,python2-fastlmm)
       ("python-pandas" ,python2-pandas)
       ("python-pysnptools" ,python2-pysnptools)))
    (home-page "https://github.com/PMBio/warpedLMM")
    (synopsis "Implementation of warped linear mixed models")
    (description
     "WarpedLMM is a Python implementation of the warped linear mixed model,
which automatically learns an optimal warping function (or transformation) for
the phenotype as it models the data.")
    (license license:asl2.0)))
1831
;; pbtranscript-tofu is built from a pinned commit of the cDNA_primer
;; repository.  Only the "pbtranscript-tofu/pbtranscript/" sub-directory of
;; the checkout is built, after pre-built artifacts shipped in it have been
;; removed.
(define-public pbtranscript-tofu
  (let ((commit "c7bbd5472"))
    (package
      (name "pbtranscript-tofu")
      (version (string-append "0.4.1." commit))
      (source (origin
                (method git-fetch)
                (uri (git-reference
                      (url "https://github.com/PacificBiosciences/cDNA_primer.git")
                      (commit commit)))
                (file-name (string-append name "-" version ".tar.gz"))
                (sha256
                 (base32
                  "148xkzi689c49g6fdhckp6mnmj2qhjdf1j4wifm6ja7ij95d7fxx"))))
      (build-system python-build-system)
      (arguments
       `(#:python ,python-2
         ;; With standard flags, the install phase attempts to create a zip'd
         ;; egg file, and fails with an error: 'ZIP does not support timestamps
         ;; before 1980'
         #:configure-flags '("--single-version-externally-managed"
                             "--record=pbtranscript-tofu.txt")
         #:phases
         (modify-phases %standard-phases
           (add-after 'unpack 'enter-directory-and-clean-up
             (lambda _
               (chdir "pbtranscript-tofu/pbtranscript/")
               ;; Delete clutter: stale build output and bundled copies of
               ;; Cython and setuptools_cython.
               (for-each delete-file-recursively
                         '("dist/"
                           "build/"
                           "setuptools_cython-0.2.1-py2.6.egg/"
                           "pbtools.pbtranscript.egg-info"))
               (for-each delete-file
                         '("Cython-0.20.1.tar.gz"
                           "setuptools_cython-0.2.1-py2.7.egg"
                           "setuptools_cython-0.2.1.tar.gz"
                           "setup.cfg"))
               ;; Remove shared objects shipped in the checkout.
               (for-each delete-file
                         (find-files "." "\\.so$"))
               ;; files should be writable for install phase
               (for-each (lambda (f) (chmod f #o755))
                         (find-files "." "\\.py$"))
               ;; Report success explicitly, like the other phases in this
               ;; file, instead of relying on the value of 'for-each'.
               #t)))))
      (inputs
       `(("python-cython" ,python2-cython)
         ("python-numpy" ,python2-numpy)
         ("python-bx-python" ,python2-bx-python)
         ("python-networkx" ,python2-networkx)
         ("python-scipy" ,python2-scipy)
         ("python-pbcore" ,python2-pbcore)))
      (native-inputs
       `(("python-nose" ,python2-nose)
         ("python-setuptools" ,python2-setuptools)))
      (home-page "https://github.com/PacificBiosciences/cDNA_primer")
      (synopsis "Analyze transcriptome data generated with the Iso-Seq protocol")
      (description
       "pbtranscript-tofu contains scripts to analyze transcriptome data
generated using the PacBio Iso-Seq protocol.")
      (license license:bsd-3))))
1890
;; Prodigal ships a plain Makefile: there is nothing to configure and no
;; test target, and the install location is passed through INSTALLDIR.
(define-public prodigal
  (package
    (name "prodigal")
    (version "2.6.2")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/hyattpd/Prodigal/archive/v"
             version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32
         "0m8sb0fg6lmxrlpzna0am6svbnlmd3dckrhgzxxgb3gxr5fyj284"))))
    (build-system gnu-build-system)
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         (delete 'configure))
       ;; Executables go to the "bin" directory of the output.
       #:make-flags
       (list (string-append "INSTALLDIR="
                            (assoc-ref %outputs "out")
                            "/bin"))
       #:tests? #f))                    ;no check target
    (home-page "http://prodigal.ornl.gov")
    (synopsis "Protein-coding gene prediction for Archaea and Bacteria")
    (description
     "Prodigal runs smoothly on finished genomes, draft genomes, and
metagenomes, providing gene predictions in GFF3, Genbank, or Sequin table
format. It runs quickly, in an unsupervised fashion, handles gaps, handles
partial genes, and identifies translation initiation sites.")
    (license license:gpl3+)))
1921
;; RSEM is built without its bundled boost and samtools copies.  Its
;; Makefile has no install target, so the install phase copies the
;; programs and the Perl helper module by hand, and the Perl entry points
;; are wrapped so that they can find that module.
(define-public rsem
  (package
    (name "rsem")
    (version "1.2.20")
    (source
     (origin
       (method url-fetch)
       (uri
        (string-append "http://deweylab.biostat.wisc.edu/rsem/src/rsem-"
                       version ".tar.gz"))
       (sha256
        (base32 "0nzdc0j0hjllhsd5f2xli95dafm3nawskigs140xzvjk67xh0r9q"))
       (patches (list (search-patch "rsem-makefile.patch")))
       (modules '((guix build utils)))
       (snippet
        '(begin
           ;; remove bundled copy of boost
           (delete-file-recursively "boost")
           #t))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         ;; No "configure" script.
         ;; Do not build bundled samtools library.
         (replace 'configure
           (lambda _
             (substitute* "Makefile"
               (("^all : sam/libbam.a") "all : "))
             #t))
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out      (assoc-ref outputs "out"))
                    (bin      (string-append out "/bin/"))
                    (perl-dir (string-append out "/lib/perl5/site_perl")))
               (mkdir-p bin)
               (mkdir-p perl-dir)
               ;; Every file whose name matches "rsem-.*" is copied to "bin".
               (for-each (lambda (program)
                           (copy-file program
                                      (string-append bin (basename program))))
                         (find-files "." "rsem-.*"))
               (copy-file "rsem_perl_utils.pm"
                          (string-append perl-dir "/rsem_perl_utils.pm")))
             #t))
         (add-after 'install 'wrap-program
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out      (assoc-ref outputs "out"))
                    (perl-dir (string-append out "/lib/perl5/site_perl")))
               ;; Prefix PERL5LIB with the directory that holds
               ;; "rsem_perl_utils.pm" for each of these programs.
               (for-each (lambda (prog)
                           (wrap-program (string-append out "/bin/" prog)
                             (list "PERL5LIB" ":" 'prefix (list perl-dir))))
                         '("rsem-plot-transcript-wiggles"
                           "rsem-calculate-expression"
                           "rsem-generate-ngvector"
                           "rsem-run-ebseq"
                           "rsem-prepare-reference")))
             #t)))))
    (inputs
     `(("boost" ,boost)
       ("ncurses" ,ncurses)
       ("r" ,r)
       ("perl" ,perl)
       ("samtools" ,samtools-0.1)
       ("zlib" ,zlib)))
    (home-page "http://deweylab.biostat.wisc.edu/rsem/")
    (synopsis "Estimate gene expression levels from RNA-Seq data")
    (description
     "RSEM is a software package for estimating gene and isoform expression
levels from RNA-Seq data. The RSEM package provides a user-friendly
interface, supports threads for parallel computation of the EM algorithm,
single-end and paired-end read data, quality scores, variable-length reads and
RSPD estimation. In addition, it provides posterior mean and 95% credibility
interval estimates for expression levels. For visualization, it can generate
BAM and Wiggle files in both transcript-coordinate and genomic-coordinate.")
    (license license:gpl3+)))
1999
;; RSeQC is built with Python 2 against the packaged pysam; the copy of
;; pysam bundled in the tarball is removed by the source snippet.
(define-public rseqc
  (package
    (name "rseqc")
    (version "2.6.1")
    (source
     (origin
       (method url-fetch)
       (uri
        (string-append "mirror://sourceforge/rseqc/"
                       version "/RSeQC-" version ".tar.gz"))
       (sha256
        (base32 "15ly0254yi032qzkdplg00q144qfdsd986gh62829rl5bkxhj330"))
       (modules '((guix build utils)))
       (snippet
        '(begin
           ;; remove bundled copy of pysam
           (delete-file-recursively "lib/pysam")
           (substitute* "setup.py"
             ;; remove dependency on outdated "distribute" module
             (("^from distribute_setup import use_setuptools") "")
             (("^use_setuptools\\(\\)") "")
             ;; do not use bundled copy of pysam
             (("^have_pysam = False") "have_pysam = True"))
           ;; End with an explicit success value, as the other multi-step
           ;; snippets in this file do, rather than with the value of
           ;; 'substitute*'.
           #t))))
    (build-system python-build-system)
    (arguments `(#:python ,python-2))
    (inputs
     `(("python-cython" ,python2-cython)
       ("python-pysam" ,python2-pysam)
       ("python-numpy" ,python2-numpy)
       ("python-setuptools" ,python2-setuptools)
       ("zlib" ,zlib)))
    (native-inputs
     `(("python-nose" ,python2-nose)))
    (home-page "http://rseqc.sourceforge.net/")
    (synopsis "RNA-seq quality control package")
    (description
     "RSeQC provides a number of modules that can comprehensively evaluate
high throughput sequence data, especially RNA-seq data. Some basic modules
inspect sequence quality, nucleotide composition bias, PCR bias and GC bias,
while RNA-seq specific modules evaluate sequencing saturation, mapped reads
distribution, coverage uniformity, strand specificity, etc.")
    (license license:gpl3+)))
2042
;; Samtools 1.x.  Besides the programs installed by "make install", this
;; package also installs the static "libbam.a" library and the top-level
;; header files, which other packages in this file build against.
(define-public samtools
  (package
    (name "samtools")
    (version "1.2")
    (source
     (origin
       (method url-fetch)
       (uri
        (string-append "mirror://sourceforge/samtools/"
                       version "/samtools-" version ".tar.bz2"))
       (sha256
        (base32
         "1akdqb685pk9xk1nb6sa9aq8xssjjhvvc06kp4cpdqvz2157l3j2"))))
    (build-system gnu-build-system)
    (arguments
     `(;; There are 87 test failures when building on non-64-bit architectures
       ;; due to invalid test data.  This has since been fixed upstream (see
       ;; <https://github.com/samtools/samtools/pull/307>), but as there has
       ;; not been a new release we only run the tests on x86_64-linux.
       ;;
       ;; The target system must be consulted first: '%current-system' is
       ;; always set, so testing it first would keep the tests enabled when
       ;; cross-compiling from x86_64-linux.
       #:tests? ,(string=? (or (%current-target-system) (%current-system))
                           "x86_64-linux")
       #:modules ((ice-9 ftw)
                  (ice-9 regex)
                  (guix build gnu-build-system)
                  (guix build utils))
       #:make-flags (list "LIBCURSES=-lncurses"
                          (string-append "prefix=" (assoc-ref %outputs "out")))
       #:phases
       (alist-cons-after
        'unpack
        'patch-tests
        (lambda* (#:key inputs #:allow-other-keys)
          (let ((bash (assoc-ref inputs "bash")))
            (substitute* "test/test.pl"
              ;; The test script calls out to /bin/bash
              (("/bin/bash")
               (string-append bash "/bin/bash"))
              ;; There are two failing tests upstream relating to the "stats"
              ;; subcommand in test_usage_subcommand ("did not have Usage"
              ;; and "usage did not mention samtools stats"), so we disable
              ;; them.
              (("(test_usage_subcommand\\(.*\\);)" cmd)
               (string-append "unless ($subcommand eq 'stats') {" cmd "};")))
            #t))
        (alist-cons-after
         'install 'install-library
         (lambda* (#:key outputs #:allow-other-keys)
           ;; Install the static library next to the programs' prefix.
           (let ((lib (string-append (assoc-ref outputs "out") "/lib")))
             (mkdir-p lib)
             (copy-file "libbam.a" (string-append lib "/libbam.a"))
             #t))
         (alist-cons-after
          'install 'install-headers
          (lambda* (#:key outputs #:allow-other-keys)
            ;; Copy every "*.h" file of the top-level source directory to
            ;; "include/samtools/".
            (let ((include (string-append (assoc-ref outputs "out")
                                          "/include/samtools/")))
              (mkdir-p include)
              (for-each (lambda (file)
                          (copy-file file (string-append include
                                                         (basename file))))
                        (scandir "." (lambda (name) (string-match "\\.h$" name))))
              #t))
          ;; There is no "configure" phase to run.
          (alist-delete 'configure %standard-phases))))))
    (native-inputs `(("pkg-config" ,pkg-config)))
    (inputs `(("ncurses" ,ncurses)
              ("perl" ,perl)
              ("python" ,python)
              ("zlib" ,zlib)))
    (home-page "http://samtools.sourceforge.net")
    (synopsis "Utilities to efficiently manipulate nucleotide sequence alignments")
    (description
     "Samtools implements various utilities for post-processing nucleotide
sequence alignments in the SAM, BAM, and CRAM formats, including indexing,
variant calling (in conjunction with bcftools), and a simple alignment
viewer.")
    (license license:expat)))
2118
(define-public samtools-0.1
  ;; Most recent release of the 0.1 series of samtools.  Its input and
  ;; output formats differ greatly from those used and produced by samtools
  ;; 1.x, and it is still used in many bioinformatics pipelines.
  (package (inherit samtools)
    (version "0.1.19")
    (source
     (origin
       (method url-fetch)
       (uri
        (string-append "mirror://sourceforge/samtools/"
                       version "/samtools-" version ".tar.bz2"))
       (sha256
        (base32 "1m33xsfwz0s8qi45lylagfllqg7fphf4dr0780rsvw75av9wk06h"))))
    (arguments
     ;; Start from the arguments of samtools 1.x and adjust the parts that
     ;; differ for this version.
     (substitute-keyword-arguments (package-arguments samtools)
       ((#:tests? tests) #f)            ;no "check" target
       ((#:phases phases)
        `(modify-phases ,phases
           ;; Tests are disabled, so the test script is not patched.
           (delete 'patch-tests)
           ;; Install by copying the "samtools" executable to "bin".
           (replace 'install
             (lambda* (#:key outputs #:allow-other-keys)
               (let* ((out (assoc-ref outputs "out"))
                      (bin (string-append out "/bin")))
                 (mkdir-p bin)
                 (copy-file "samtools"
                            (string-append bin "/samtools")))))))))))
2146
;; The NGS SDK lives in the "ngs-sdk" sub-directory of the source archive
;; and comes with its own, non-Autoconf 'configure' script.
(define-public ngs-sdk
  (package
    (name "ngs-sdk")
    (version "1.1.1")
    (source
     (origin
       (method url-fetch)
       (uri
        (string-append "https://github.com/ncbi/ngs/archive/"
                       version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32
         "1x58gpm574n0xmk2a98gmikbgycq78ia0bvnb42k5ck34fmd5v8y"))))
    (build-system gnu-build-system)
    (arguments
     `(#:parallel-build? #f             ; not supported
       #:tests? #f                      ; no "check" target
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'enter-dir
           (lambda _ (chdir "ngs-sdk") #t))
         (replace 'configure
           (lambda* (#:key outputs #:allow-other-keys)
             ;; The 'configure' script doesn't recognize things like
             ;; '--enable-fast-install', so it is invoked by hand with only
             ;; the build and installation prefixes.
             (let ((build-prefix (string-append "--build-prefix="
                                                (getcwd) "/build"))
                   (prefix       (string-append "--prefix="
                                                (assoc-ref outputs "out"))))
               (zero? (system* "./configure" build-prefix prefix))))))))
    (native-inputs `(("perl" ,perl)))
    (home-page "https://github.com/ncbi/ngs")
    (synopsis "API for accessing Next Generation Sequencing data")
    (description
     "NGS is a domain-specific API for accessing reads, alignments and pileups
produced from Next Generation Sequencing. The API itself is independent from
any particular back-end implementation, and supports use of multiple back-ends
simultaneously.")
    (license license:public-domain)))
2188
;; Java bindings built from the "ngs-java" sub-directory of the same source
;; as ngs-sdk, reusing that package's build arguments.
(define-public ngs-java
  (package (inherit ngs-sdk)
    (name "ngs-java")
    (synopsis "Java bindings for NGS SDK")
    (inputs
     `(("jdk" ,icedtea6 "jdk")
       ("ngs-sdk" ,ngs-sdk)))
    (arguments
     ;; Prepend a #:modules setting to the arguments of ngs-sdk and rewrite
     ;; its #:phases value.
     (substitute-keyword-arguments
         `(#:modules ((guix build gnu-build-system)
                      (guix build utils)
                      (srfi srfi-1)
                      (srfi srfi-26))
           ,@(package-arguments ngs-sdk))
       ((#:phases phases)
        `(modify-phases ,phases
           ;; Enter "ngs-java" rather than "ngs-sdk".
           (replace 'enter-dir
             (lambda _ (chdir "ngs-java") #t))
           (add-after 'enter-dir 'fix-java-symlink-installation
             (lambda _
               ;; Only replace the version suffix, not the version number in
               ;; the directory name.  Reported here:
               ;; https://github.com/ncbi/ngs/pull/4
               ;;
               ;; Each "$(subst VERSION..., TO, TEXT)" call is turned into
               ;; "$(patsubst %VERSION..., %TO, TEXT)".
               (substitute* "Makefile.java"
                 (((string-append "\\$\\(subst "
                                  "(\\$\\(VERSION[^\\)]*\\)),"
                                  "(\\$\\([^\\)]+\\)),"
                                  "(\\$\\([^\\)]+\\)|\\$\\@)"
                                  "\\)")
                   _ from to text)
                  (string-append "$(patsubst "
                                 "%" from ","
                                 "%" to ","
                                 text ")")))))))))))
2224
;; NCBI-VDB uses its own Perl-based 'configure' script.  Several of its
;; build files are patched before it runs, and the interface libraries and
;; headers, which "make install" leaves out, are installed by an extra
;; phase.
(define-public ncbi-vdb
  (package
    (name "ncbi-vdb")
    (version "2.4.5-5")
    (source
     (origin
       (method url-fetch)
       (uri
        (string-append "https://github.com/ncbi/ncbi-vdb/archive/"
                       version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32
         "1cj8nk6if8sqagv20vx36v566fdvhcaadf0x1ycnbgql6chbs6vy"))))
    (build-system gnu-build-system)
    (arguments
     `(#:parallel-build? #f             ; not supported
       #:tests? #f                      ; no "check" target
       #:phases
       (alist-replace
        'configure
        (lambda* (#:key inputs outputs #:allow-other-keys)
          (let ((out (assoc-ref outputs "out")))
            ;; Only replace the version suffix, not the version number in the
            ;; directory name; fixed in commit 4dbba5c6a809 (no release yet).
            (substitute* "setup/konfigure.perl"
              (((string-append "\\$\\(subst "
                               "(\\$\\(VERSION[^\\)]*\\)),"
                               "(\\$\\([^\\)]+\\)),"
                               "(\\$\\([^\\)]+\\)|\\$\\@)"
                               "\\)")
                _ pattern replacement target)
               (string-append "$(patsubst "
                              "%" pattern ","
                              "%" replacement ","
                              target ")")))

            ;; Override include path for libmagic
            (substitute* "setup/package.prl"
              (("name => 'magic', Include => '/usr/include'")
               (string-append "name=> 'magic', Include => '"
                              (assoc-ref inputs "libmagic")
                              "/include" "'")))

            ;; Install kdf5 library (needed by sra-tools)
            (substitute* "build/Makefile.install"
              (("LIBRARIES_TO_INSTALL =")
               "LIBRARIES_TO_INSTALL = kdf5.$(VERSION_LIBX) kdf5.$(VERSION_SHLX)"))

            ;; The 'configure' script doesn't recognize things like
            ;; '--enable-fast-install'.  The phase succeeds only if the
            ;; script exits with status zero.
            (zero? (system*
                    "./configure"
                    (string-append "--build-prefix=" (getcwd) "/build")
                    (string-append "--prefix=" out)
                    "--debug"
                    (string-append "--with-xml2-prefix="
                                   (assoc-ref inputs "libxml2"))
                    (string-append "--with-ngs-sdk-prefix="
                                   (assoc-ref inputs "ngs-sdk"))
                    (string-append "--with-ngs-java-prefix="
                                   (assoc-ref inputs "ngs-java"))
                    (string-append "--with-hdf5-prefix="
                                   (assoc-ref inputs "hdf5"))))))
        (alist-cons-after
         'install 'install-interfaces
         (lambda* (#:key outputs #:allow-other-keys)
           (let* ((out  (assoc-ref outputs "out"))
                  (ilib (string-append out "/ilib")))
             ;; Install interface libraries.  On i686 the interface libraries
             ;; are installed to "linux/gcc/i386", so we need to use the Linux
             ;; architecture name ("i386") instead of the target system prefix
             ;; ("i686").
             (mkdir ilib)
             (copy-recursively (string-append "build/ncbi-vdb/linux/gcc/"
                                              ,(system->linux-architecture
                                                (or (%current-target-system)
                                                    (%current-system)))
                                              "/rel/ilib")
                               ilib)
             ;; Install interface headers
             (copy-recursively "interfaces"
                               (string-append out "/include"))
             #t))
         %standard-phases))))
    (inputs
     `(("libxml2" ,libxml2)
       ("ngs-sdk" ,ngs-sdk)
       ("ngs-java" ,ngs-java)
       ("libmagic" ,file)
       ("hdf5" ,hdf5)))
    (native-inputs `(("perl" ,perl)))
    (home-page "https://github.com/ncbi/ncbi-vdb")
    (synopsis "Database engine for genetic information")
    (description
     "The NCBI-VDB library implements a highly compressed columnar data
warehousing engine that is most often used to store genetic information.
Databases are stored in a portable image within the file system, and can be
accessed/downloaded on demand across HTTP.")
    (license license:public-domain)))
2324
;; PLINK is built straight from its Makefile, linked dynamically against
;; LAPACK, and installed by copying the single "plink" executable.
(define-public plink
  (package
    (name "plink")
    (version "1.07")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "http://pngu.mgh.harvard.edu/~purcell/plink/dist/plink-"
             version "-src.zip"))
       (sha256
        (base32 "0as8gxm4pjyc8dxmm1sl873rrd7wn5qs0l29nqfnl31x8i467xaa"))
       (patches (list (search-patch "plink-1.07-unclobber-i.patch")))))
    (build-system gnu-build-system)
    (arguments
     '(#:tests? #f                      ;no "check" target
       #:make-flags
       (let ((lapack (assoc-ref %build-inputs "lapack")))
         (list (string-append "LIB_LAPACK=" lapack "/lib/liblapack.so")
               "WITH_LAPACK=1"
               "FORCE_DYNAMIC=1"
               ;; disable phoning home
               "WITH_WEBCHECK="))
       #:phases
       (modify-phases %standard-phases
         ;; no "configure" script
         (delete 'configure)
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out (assoc-ref outputs "out"))
                    (bin (string-append out "/bin/")))
               (mkdir-p bin)
               (copy-file "plink" (string-append bin "plink"))
               #t))))))
    (native-inputs
     `(("unzip" ,unzip)))               ;the source is a zip archive
    (inputs
     `(("zlib" ,zlib)
       ("lapack" ,lapack)))
    (home-page "http://pngu.mgh.harvard.edu/~purcell/plink/")
    (synopsis "Whole genome association analysis toolset")
    (description
     "PLINK is a whole genome association analysis toolset, designed to
perform a range of basic, large-scale analyses in a computationally efficient
manner. The focus of PLINK is purely on analysis of genotype/phenotype data,
so there is no support for steps prior to this (e.g. study design and
planning, generating genotype or CNV calls from raw data). Through
integration with gPLINK and Haploview, there is some support for the
subsequent visualization, annotation and storage of results.")
    ;; Code is released under GPLv2, except for fisher.h, which is under
    ;; LGPLv2.1+
    (license (list license:gpl2 license:lgpl2.1+))))
2377
;; preseq is built against the packaged samtools 0.1 library and headers;
;; the copy of samtools bundled in the tarball is removed by the source
;; snippet.
(define-public preseq
  (package
    (name "preseq")
    (version "1.0.2")
    (source (origin
              (method url-fetch)
              (uri
               (string-append "http://smithlabresearch.org/downloads/preseq-"
                              version ".tar.bz2"))
              (sha256
               (base32 "0r7sw07p6nv8ygvc17gd78lisbw5336v3vhs86b5wv8mw3pwqksc"))
              (patches (list (search-patch "preseq-1.0.2-install-to-PREFIX.patch")
                             (search-patch "preseq-1.0.2-link-with-libbam.patch")))
              (modules '((guix build utils)))
              (snippet
               '(begin
                  ;; Remove bundled samtools.
                  (delete-file-recursively "preseq-master/samtools")
                  #t))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         (add-after
          'unpack 'enter-dir
          (lambda _
            (chdir "preseq-master")
            #t))
         (add-after
          'enter-dir 'use-samtools-headers
          (lambda _
            ;; Refer to "sam.h" under its "samtools/" prefix.  The dot is
            ;; escaped so that the pattern matches the literal file name
            ;; only, not "sam" followed by an arbitrary character and "h".
            (substitute* '("smithlab_cpp/SAM.cpp"
                           "smithlab_cpp/SAM.hpp")
              (("sam\\.h") "samtools/sam.h"))
            #t))
         ;; There is no "configure" phase to run.
         (delete 'configure))
       #:make-flags (list (string-append "PREFIX="
                                         (assoc-ref %outputs "out"))
                          (string-append "LIBBAM="
                                         (assoc-ref %build-inputs "samtools")
                                         "/lib/libbam.a"))))
    (inputs
     `(("gsl" ,gsl)
       ("samtools" ,samtools-0.1)
       ("zlib" ,zlib)))
    (home-page "http://smithlabresearch.org/software/preseq/")
    (synopsis "Program for analyzing library complexity")
    (description
     "The preseq package is aimed at predicting and estimating the complexity
of a genomic sequencing library, equivalent to predicting and estimating the
number of redundant reads from a given sequencing depth and how many will be
expected from additional sequencing using an initial sequencing experiment.
The estimates can then be used to examine the utility of further sequencing,
optimize the sequencing depth, or to screen multiple libraries to avoid low
complexity samples.")
    (license license:gpl3+)))
2433
;; The SRA Toolkit is configured with its own Perl-based 'configure'
;; script, which is patched so that it accepts an installed ncbi-vdb.
(define-public sra-tools
  (package
    (name "sra-tools")
    (version "2.4.5-5")
    (source
     (origin
       (method url-fetch)
       (uri
        (string-append "https://github.com/ncbi/sra-tools/archive/"
                       version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32
         "11nrnvz7a012f4iryf0wiwrid0h111grsfxbxa9j51h3f2xbvgns"))))
    (build-system gnu-build-system)
    (arguments
     `(#:parallel-build? #f             ; not supported
       #:tests? #f                      ; no "check" target
       #:phases
       (modify-phases %standard-phases
         (replace 'configure
           (lambda* (#:key inputs outputs #:allow-other-keys)
             (let ((out      (assoc-ref outputs "out"))
                   (ncbi-vdb (assoc-ref inputs "ncbi-vdb")))
               ;; The build system expects a directory containing the sources
               ;; and raw build output of ncbi-vdb, including files that are
               ;; not installed.  Since we are building against an installed
               ;; version of ncbi-vdb, the following modifications are
               ;; needed.
               (substitute* "setup/konfigure.perl"
                 ;; Make the configure script look for the "ilib" directory of
                 ;; "ncbi-vdb" without first checking for the existence of a
                 ;; matching library in its "lib" directory.
                 (("^ my \\$f = File::Spec->catdir\\(\\$libdir, \\$lib\\);")
                  "my $f = File::Spec->catdir($ilibdir, $ilib);")
                 ;; Look for interface libraries in ncbi-vdb's "ilib"
                 ;; directory.
                 (("my \\$ilibdir = File::Spec->catdir\\(\\$builddir, 'ilib'\\);")
                  "my $ilibdir = File::Spec->catdir($dir, 'ilib');"))

               ;; The 'configure' script doesn't recognize things like
               ;; '--enable-fast-install'.  The installed ncbi-vdb serves as
               ;; both its "sources" and its "build" directory.
               (zero? (system*
                       "./configure"
                       (string-append "--build-prefix=" (getcwd) "/build")
                       (string-append "--prefix=" out)
                       "--debug"
                       (string-append "--with-fuse-prefix="
                                      (assoc-ref inputs "fuse"))
                       (string-append "--with-magic-prefix="
                                      (assoc-ref inputs "libmagic"))
                       ;; TODO: building with libxml2 fails with linker errors
                       ;; (string-append "--with-xml2-prefix="
                       ;;                (assoc-ref inputs "libxml2"))
                       (string-append "--with-ncbi-vdb-sources=" ncbi-vdb)
                       (string-append "--with-ncbi-vdb-build=" ncbi-vdb)
                       (string-append "--with-ngs-sdk-prefix="
                                      (assoc-ref inputs "ngs-sdk"))
                       (string-append "--with-hdf5-prefix="
                                      (assoc-ref inputs "hdf5"))))))))))
    (native-inputs `(("perl" ,perl)))
    (inputs
     `(("ngs-sdk" ,ngs-sdk)
       ("ncbi-vdb" ,ncbi-vdb)
       ("libmagic" ,file)
       ("fuse" ,fuse)
       ("hdf5" ,hdf5)
       ("zlib" ,zlib)))
    (home-page "http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software")
    (synopsis "Tools and libraries for reading and writing sequencing data")
    (description
     "The SRA Toolkit from NCBI is a collection of tools and libraries for
reading of sequencing files from the Sequence Read Archive (SRA) database and
writing files into the .sra format.")
    (license license:public-domain)))
2508
;; SeqAn is installed with the trivial build system: the builder unpacks
;; the library tarball and copies its "include" and "share" directories to
;; the "out" and "doc" outputs respectively.
(define-public seqan
  (package
    (name "seqan")
    (version "1.4.2")
    (source (origin
              (method url-fetch)
              (uri (string-append "http://packages.seqan.de/seqan-library/"
                                  "seqan-library-" version ".tar.bz2"))
              (sha256
               (base32
                "05s3wrrwn50f81aklfm65i4a749zag1vr8z03k21xm0pdxy47yvp"))))
    ;; The documentation is 7.8MB and the includes are 3.6MB heavy, so it
    ;; makes sense to split the outputs.
    (outputs '("out" "doc"))
    (build-system trivial-build-system)
    (arguments
     `(#:modules ((guix build utils))
       #:builder
       (begin
         (use-modules (guix build utils))
         (let ((tar  (assoc-ref %build-inputs "tar"))
               (bzip (assoc-ref %build-inputs "bzip2"))
               (out  (assoc-ref %outputs "out"))
               (doc  (assoc-ref %outputs "doc")))
           ;; "tar" and "bzip2" are the only programs made available.
           (setenv "PATH" (string-append tar "/bin:" bzip "/bin"))
           ;; Stop right away when extraction fails, so that a partially
           ;; unpacked tree is never copied to the outputs.
           (unless (zero? (system* "tar" "xvf"
                                   (assoc-ref %build-inputs "source")))
             (error "failed to unpack the seqan-library source tarball"))
           (chdir (string-append "seqan-library-" ,version))
           (copy-recursively "include" (string-append out "/include"))
           (copy-recursively "share" (string-append doc "/share"))))))
    (native-inputs
     `(("source" ,source)
       ("tar" ,tar)
       ("bzip2" ,bzip2)))
    (home-page "http://www.seqan.de")
    (synopsis "Library for nucleotide sequence analysis")
    (description
     "SeqAn is a C++ library of efficient algorithms and data structures for
the analysis of sequences with the focus on biological data. It contains
algorithms and data structures for string representation and their
manipulation, online and indexed string search, efficient I/O of
bioinformatics file formats, sequence alignment, and more.")
    (license license:bsd-3)))
2551
;; STAR is built from the "source" sub-directory of the archive with the
;; "STAR" make target, and installed by copying the resulting executable.
(define-public star
  (package
    (name "star")
    (version "2.4.2a")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://github.com/alexdobin/STAR/archive/STAR_"
                    version ".tar.gz"))
              (sha256
               (base32
                "1c3rnm7r5l0kl3d04gl1g7938xqf1c2l0mla87rlplqg1hcns5mc"))
              (modules '((guix build utils)))
              (snippet
               ;; Call "rm" by name rather than by its absolute file name.
               '(substitute* "source/Makefile"
                  (("/bin/rm") "rm")))))
    (build-system gnu-build-system)
    (arguments
     '(#:tests? #f                      ;no check target
       #:make-flags '("STAR")
       #:phases
       (modify-phases %standard-phases
         ;; There is no "configure" phase to run.
         (delete 'configure)
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((bin (string-append (assoc-ref outputs "out") "/bin/")))
               (mkdir-p bin)
               (copy-file "STAR" (string-append bin "STAR"))
               #t)))
         ;; Return #t explicitly, as the equivalent "enter-dir" phases of
         ;; other packages in this file do.
         (add-after 'unpack 'enter-source-dir
           (lambda _ (chdir "source") #t)))))
    (native-inputs
     `(("vim" ,vim)))                   ; for xxd
    (inputs
     `(("zlib" ,zlib)))
    (home-page "https://github.com/alexdobin/STAR")
    (synopsis "Universal RNA-seq aligner")
    (description
     "The Spliced Transcripts Alignment to a Reference (STAR) software is
based on a previously undescribed RNA-seq alignment algorithm that uses
sequential maximum mappable seed search in uncompressed suffix arrays followed
by seed clustering and stitching procedure. In addition to unbiased de novo
detection of canonical junctions, STAR can discover non-canonical splices and
chimeric (fusion) transcripts, and is also capable of mapping full-length RNA
sequences.")
    ;; STAR is licensed under GPLv3 or later; htslib is MIT-licensed.
    (license license:gpl3+)))
2599
;; Subread is built in the "src" sub-directory using "Makefile.Linux"; the
;; contents of the sibling "bin" directory are what gets installed.
(define-public subread
  (package
    (name "subread")
    (version "1.4.6-p2")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "mirror://sourceforge/subread/subread-"
             version "-source.tar.gz"))
       (sha256
        (base32
         "06sv9mpcsdj6p68y15d6gi70lca3lxmzk0dn61hg0kfsa7rxmsr3"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:make-flags '("-f" "Makefile.Linux")
       #:phases
       (modify-phases %standard-phases
         ;; no "configure" script
         (delete 'configure)
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out (assoc-ref outputs "out"))
                    (bin (string-append out "/bin/")))
               (mkdir-p bin)
               ;; Relative to "src", which the "enter-dir" phase entered.
               (copy-recursively "../bin" bin))))
         (add-after 'unpack 'enter-dir
           (lambda _ (chdir "src") #t)))))
    (inputs `(("zlib" ,zlib)))
    (home-page "http://bioinf.wehi.edu.au/subread-package/")
    (synopsis "Tool kit for processing next-gen sequencing data")
    (description
     "The subread package contains the following tools: subread aligner, a
general-purpose read aligner; subjunc aligner: detecting exon-exon junctions
and mapping RNA-seq reads; featureCounts: counting mapped reads for genomic
features; exactSNP: a SNP caller that discovers SNPs by testing signals
against local background noises.")
    (license license:gpl3+)))
2638
;; Package definition for VCFtools.  There is no "configure" script, so
;; the installation prefix and man page directory are passed to make
;; directly through the make flags.
(define-public vcftools
  (package
    (name "vcftools")
    (version "0.1.12b")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "mirror://sourceforge/vcftools/vcftools_"
                    version ".tar.gz"))
              (sha256
               (base32
                "148al9h7f8g8my2qdnpax51kdd2yjrivlx6frvakf4lz5r8j88wx"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ; no "check" target
       #:make-flags (list
                     "CFLAGS=-O2"       ; override "-m64" flag
                     (string-append "PREFIX=" (assoc-ref %outputs "out"))
                     (string-append "MANDIR=" (assoc-ref %outputs "out")
                                    "/share/man/man1"))
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         ;; The Makefile copies the man page from "${PREFIX}/cpp"; rewrite
         ;; that command to copy it from "./cpp" instead.
         (add-after 'unpack 'patch-manpage-install
           (lambda _
             (substitute* "Makefile"
               (("cp \\$\\{PREFIX\\}/cpp/vcftools.1") "cp ./cpp/vcftools.1"))
             ;; Signal success explicitly instead of relying on the
             ;; return value of 'substitute*'.
             #t)))))
    (inputs
     `(("perl" ,perl)
       ("zlib" ,zlib)))
    (home-page "http://vcftools.sourceforge.net/")
    (synopsis "Tools for working with VCF files")
    (description
     "VCFtools is a program package designed for working with VCF files, such
as those generated by the 1000 Genomes Project. The aim of VCFtools is to
provide easily accessible methods for working with complex genetic variation
data in the form of VCF files.")
    ;; The license is declared as LGPLv3 in the README and
    ;; at http://vcftools.sourceforge.net/license.html
    (license license:lgpl3)))
2679
;; Package definition for the "bio-locus" Ruby gem, fetched from
;; rubygems and built with the Ruby build system.
(define-public bio-locus
  (package
    (name "bio-locus")
    (version "0.0.7")
    (source (origin
              (method url-fetch)
              (uri (rubygems-uri "bio-locus" version))
              (sha256
               (base32
                "02vmrxyimkj9sahsp4zhfhnmbvz6dbbqz1y01vglf8cbwvkajfl0"))))
    (build-system ruby-build-system)
    ;; Only needed at build time.
    (native-inputs `(("ruby-rspec" ,ruby-rspec)))
    (home-page "https://github.com/pjotrp/bio-locus")
    (synopsis "Tool for fast querying of genome locations")
    (description
     "Bio-locus is a tabix-like tool for fast querying of genome
locations. Many file formats in bioinformatics contain records that
start with a chromosome name and a position for a SNP, or a start-end
position for indels. Bio-locus allows users to store this chr+pos or
chr+pos+alt information in a database.")
    (license license:expat)))
2703
;; Package definition for BioRuby (the "bio" gem on rubygems).  Before the
;; build, the hard-coded command names used by the command tests are
;; replaced with the locations returned by 'which'.
(define-public bioruby
  (package
    (name "bioruby")
    (version "1.5.0")
    (source
     (origin
       (method url-fetch)
       (uri (rubygems-uri "bio" version))
       (sha256
        (base32
         "01k2fyjl5fpx4zn8g6gqiqvsg2j1fgixrs9p03vzxckynxdq3wmc"))))
    (build-system ruby-build-system)
    (propagated-inputs
     `(("ruby-libxml" ,ruby-libxml)))
    (native-inputs
     `(("which" ,which)))               ; required for test phase
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         (add-before 'build 'patch-test-command
           (lambda _
             ;; The clauses are applied to each line in the order given,
             ;; so this single pass is equivalent to substituting "sh",
             ;; "ls" and "which" in three separate passes.
             (substitute* "test/functional/bio/test_command.rb"
               (("/bin/sh") (which "sh"))
               (("/bin/ls") (which "ls"))
               (("which") (which "which")))
             ;; "/bin/echo" appears in the test itself and in its helper
             ;; script.  Note: no comma between the file names; inside
             ;; the enclosing quasiquote a comma would be an unquote.
             (substitute* '("test/functional/bio/test_command.rb"
                            "test/data/command/echoarg2.sh")
               (("/bin/echo") (which "echo")))
             #t)))))
    (synopsis "Ruby library, shell and utilities for bioinformatics")
    (description "BioRuby comes with a comprehensive set of Ruby development
tools and libraries for bioinformatics and molecular biology. BioRuby has
components for sequence analysis, pathway analysis, protein modelling and
phylogenetic analysis; it supports many widely used data formats and provides
easy access to databases, external programs and public web services, including
BLAST, KEGG, GenBank, MEDLINE and GO.")
    (home-page "http://bioruby.org/")
    ;; Code is released under Ruby license, except for setup
    ;; (LGPLv2.1+) and scripts in samples (which have GPL2 and GPL2+)
    (license (list license:ruby license:lgpl2.1+ license:gpl2+))))