gnu: Add BioPerl.
[jackhill/guix/guix.git] / gnu / packages / bioinformatics.scm
1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2014, 2015 Ricardo Wurmus <rekado@elephly.net>
3 ;;; Copyright © 2015 Ben Woodcroft <donttrustben@gmail.com>
4 ;;;
5 ;;; This file is part of GNU Guix.
6 ;;;
7 ;;; GNU Guix is free software; you can redistribute it and/or modify it
8 ;;; under the terms of the GNU General Public License as published by
9 ;;; the Free Software Foundation; either version 3 of the License, or (at
10 ;;; your option) any later version.
11 ;;;
12 ;;; GNU Guix is distributed in the hope that it will be useful, but
13 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;;; GNU General Public License for more details.
16 ;;;
17 ;;; You should have received a copy of the GNU General Public License
18 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19
20 (define-module (gnu packages bioinformatics)
21 #:use-module ((guix licenses) #:prefix license:)
22 #:use-module (guix packages)
23 #:use-module (guix utils)
24 #:use-module (guix download)
25 #:use-module (guix git-download)
26 #:use-module (guix build-system gnu)
27 #:use-module (guix build-system cmake)
28 #:use-module (guix build-system perl)
29 #:use-module (guix build-system python)
30 #:use-module (guix build-system trivial)
31 #:use-module (gnu packages)
32 #:use-module (gnu packages algebra)
33 #:use-module (gnu packages base)
34 #:use-module (gnu packages boost)
35 #:use-module (gnu packages compression)
36 #:use-module (gnu packages cpio)
37 #:use-module (gnu packages file)
38 #:use-module (gnu packages java)
39 #:use-module (gnu packages linux)
40 #:use-module (gnu packages machine-learning)
41 #:use-module (gnu packages maths)
42 #:use-module (gnu packages ncurses)
43 #:use-module (gnu packages perl)
44 #:use-module (gnu packages pkg-config)
45 #:use-module (gnu packages popt)
46 #:use-module (gnu packages protobuf)
47 #:use-module (gnu packages python)
48 #:use-module (gnu packages statistics)
49 #:use-module (gnu packages tbb)
50 #:use-module (gnu packages textutils)
51 #:use-module (gnu packages vim)
52 #:use-module (gnu packages web)
53 #:use-module (gnu packages xml)
54 #:use-module (gnu packages zip)
55 #:use-module (srfi srfi-1))
56
;; ARAGORN.  The configure phase is dropped; the program is compiled with a
;; single gcc invocation and installed by hand together with its man page.
(define-public aragorn
  (package
    (name "aragorn")
    (version "1.2.36")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "http://mbio-serv2.mbioekol.lu.se/ARAGORN/Downloads/aragorn"
             version ".tgz"))
       (sha256
        (base32
         "1dg7jlz1qpqy88igjxd6ncs11ccsirb36qv1z01a0np4i4jh61mb"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ; there are no tests
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (replace 'build
           (lambda _
             ;; The source file name embeds the package version.
             (let ((source-file (string-append "aragorn" ,version ".c")))
               (zero? (system* "gcc" "-O3" "-ffast-math"
                               "-finline-functions"
                               "-o" "aragorn"
                               source-file)))))
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((prefix  (assoc-ref outputs "out"))
                    (bindir  (string-append prefix "/bin"))
                    (man1dir (string-append prefix "/share/man/man1")))
               (mkdir-p bindir)
               (mkdir-p man1dir)
               (copy-file "aragorn" (string-append bindir "/aragorn"))
               (copy-file "aragorn.1"
                          (string-append man1dir "/aragorn.1")))
             #t)))))
    (home-page "http://mbio-serv2.mbioekol.lu.se/ARAGORN")
    (synopsis "Detect tRNA, mtRNA and tmRNA genes in nucleotide sequences")
    (description
     "Aragorn identifies transfer RNA, mitochondrial RNA and
transfer-messenger RNA from nucleotide sequences, based on homology to known
tRNA consensus sequences and RNA structure. It also outputs the secondary
structure of the predicted RNA.")
    (license license:gpl2)))
104
;; BamTools.  LDFLAGS is set before configuring so that an rpath entry for
;; the package's own "lib/bamtools" directory is passed to the linker.
(define-public bamtools
  (package
    (name "bamtools")
    (version "2.3.0")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/pezmaster31/bamtools/archive/v"
             version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32
         "1brry29bw2xr2l9pqn240rkqwayg85b8qq78zk2zs6nlspk4d018"))))
    (build-system cmake-build-system)
    (inputs `(("zlib" ,zlib)))
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         (add-before 'configure 'set-ldflags
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((libdir (string-append (assoc-ref outputs "out")
                                          "/lib/bamtools")))
               (setenv "LDFLAGS"
                       (string-append "-Wl,-rpath=" libdir))))))))
    (home-page "https://github.com/pezmaster31/bamtools")
    (synopsis "C++ API and command-line toolkit for working with BAM data")
    (description
     "BamTools provides both a C++ API and a command-line toolkit for handling
BAM files.")
    (license license:expat)))
137
;; BEDOPS.  There is no configure script; the bundled third-party tarballs
;; are unpacked by a dedicated phase right after 'unpack'.
(define-public bedops
  (package
    (name "bedops")
    (version "2.4.14")
    (source (origin
              (method url-fetch)
              (uri (string-append "https://github.com/bedops/bedops/archive/v"
                                  version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "1kqbac547wyqma81cyky9n7mkgikjpsfd3nnmcm6hpqwanqgh10v"))))
    (build-system gnu-build-system)
    (arguments
     '(#:tests? #f
       #:make-flags (list (string-append "BINDIR=" %output "/bin"))
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (add-after 'unpack 'unpack-tarballs
           (lambda _
             ;; FIXME: Bedops includes tarballs of minimally patched upstream
             ;; libraries jansson, zlib, and bzip2.  We cannot just use stock
             ;; libraries because at least one of the libraries (zlib) is
             ;; patched to add a C++ function definition (deflateInit2cpp).
             ;; Until the Bedops developers offer a way to link against system
             ;; libraries we have to build the in-tree copies of these three
             ;; libraries.

             ;; See upstream discussion:
             ;; https://github.com/bedops/bedops/issues/124

             ;; The phase result is the conjunction of the three tar exit
             ;; statuses, so a failed extraction fails the build here
             ;; instead of being silently ignored.
             (and
              ;; Unpack the tarballs to benefit from shebang patching.
              (with-directory-excursion "third-party"
                (and (zero? (system* "tar" "xvf" "jansson-2.6.tar.bz2"))
                     (zero? (system* "tar" "xvf" "zlib-1.2.7.tar.bz2"))
                     (zero? (system* "tar" "xvf" "bzip2-1.0.6.tar.bz2"))))
              (begin
                ;; Disable unpacking of tarballs in Makefile.
                (substitute* "system.mk/Makefile.linux"
                  (("^\tbzcat .*") "\t@echo \"not unpacking\"\n")
                  (("\\./configure") "CONFIG_SHELL=bash ./configure"))
                (substitute* "third-party/zlib-1.2.7/Makefile.in"
                  (("^SHELL=.*$") "SHELL=bash\n"))
                #t)))))))
    (home-page "https://github.com/bedops/bedops")
    (synopsis "Tools for high-performance genomic feature operations")
    (description
     "BEDOPS is a suite of tools to address common questions raised in genomic
studies---mostly with regard to overlap and proximity relationships between
data sets. It aims to be scalable and flexible, facilitating the efficient
and accurate analysis and management of large-scale genomic data.

BEDOPS provides tools that perform highly efficient and scalable Boolean and
other set operations, statistical calculations, archiving, conversion and
other management of genomic data of arbitrary scale. Tasks can be easily
split by chromosome for distributing whole-genome analyses across a
computational cluster.")
    (license license:gpl2+)))
195
;; bedtools.  No configure phase; the Makefile's SHELL definition is patched
;; by hand and everything found under "bin" is copied into the output.
(define-public bedtools
  (package
    (name "bedtools")
    (version "2.24.0")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/arq5x/bedtools2/archive/v"
                           version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32
         "0lnxrjvs3nnmb4bmskag1wg3h2hd80przz5q3xd0bvs7vyxrvpbl"))
       (patches (list (search-patch "bedtools-32bit-compilation.patch")))))
    (build-system gnu-build-system)
    (native-inputs `(("python" ,python-2)))
    (inputs `(("samtools" ,samtools)
              ("zlib" ,zlib)))
    (arguments
     '(#:test-target "test"
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (add-after 'unpack 'patch-makefile-SHELL-definition
           (lambda _
             ;; patch-makefile-SHELL cannot be used here as it does not
             ;; yet patch definitions with `:='.  Since changes to
             ;; patch-makefile-SHELL result in a full rebuild, features
             ;; of patch-makefile-SHELL are reimplemented here.
             (let ((shell-line (string-append "SHELL := " (which "bash")
                                              " -e \n")))
               (substitute* "Makefile"
                 (("^SHELL := .*$") shell-line)))))
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out (assoc-ref outputs "out"))
                    (bin (string-append out "/bin/")))
               (define (install-program file)
                 (copy-file file (string-append bin (basename file))))
               (mkdir-p bin)
               (for-each install-program (find-files "bin" ".*"))))))))
    (home-page "https://github.com/arq5x/bedtools2")
    (synopsis "Tools for genome analysis and arithmetic")
    (description
     "Collectively, the bedtools utilities are a swiss-army knife of tools for
a wide-range of genomics analysis tasks. The most widely-used tools enable
genome arithmetic: that is, set theory on the genome. For example, bedtools
allows one to intersect, merge, count, complement, and shuffle genomic
intervals from multiple files in widely-used genomic file formats such as BAM,
BED, GFF/GTF, VCF.")
    (license license:gpl2)))
246
;; pybedtools, built with Python 2 only.  bedtools and samtools are
;; propagated inputs.
(define-public python2-pybedtools
  (package
    (name "python2-pybedtools")
    (version "0.6.9")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://pypi.python.org/packages/source/p/pybedtools/pybedtools-"
                    version ".tar.gz"))
              (sha256
               (base32
                "1ldzdxw1p4y3g2ignmggsdypvqkcwqwzhdha4rbgpih048z5p4an"))))
    (build-system python-build-system)
    (arguments `(#:python ,python-2))   ; no Python 3 support
    (inputs
     `(("python-cython" ,python2-cython)
       ("python-matplotlib" ,python2-matplotlib)))
    (propagated-inputs
     `(("bedtools" ,bedtools)
       ("samtools" ,samtools)))
    (native-inputs
     `(("python-pyyaml" ,python2-pyyaml)
       ("python-nose" ,python2-nose)
       ("python-setuptools" ,python2-setuptools)))
    (home-page "https://pythonhosted.org/pybedtools/")
    (synopsis "Python wrapper for BEDtools programs")
    ;; The last sentence used to read "from with Python"; the missing "in"
    ;; is restored.
    (description
     "pybedtools is a Python wrapper for Aaron Quinlan's BEDtools programs,
which are widely used for genomic interval manipulation or \"genome algebra\".
pybedtools extends BEDTools by offering feature-level manipulations from within
Python.")
    (license license:gpl2+)))
279
;; BioPerl.  The direct inputs are bound outside the package record so that
;; the names of all their transitive target inputs can be computed on the
;; host side and spliced into the build-side 'wrap-programs' phase.
(define-public bioperl-minimal
  (let* ((direct-inputs
          `(("perl-module-build" ,perl-module-build)
            ("perl-data-stag" ,perl-data-stag)
            ("perl-libwww" ,perl-libwww)
            ("perl-uri" ,perl-uri)))
         ;; Package names of everything the direct inputs pull in.
         (transitive-input-names
          (map (compose package-name cadr)
               (delete-duplicates
                (append-map (compose package-transitive-target-inputs cadr)
                            direct-inputs)))))
    (package
      (name "bioperl-minimal")
      (version "1.6.924")
      (source
       (origin
         (method url-fetch)
         (uri (string-append "mirror://cpan/authors/id/C/CJ/CJFIELDS/BioPerl-"
                             version ".tar.gz"))
         (sha256
          (base32
           "1l3npcvvvwjlhkna9dndpfv1hklhrgva013kw96m0n1wpd37ask1"))))
      (build-system perl-build-system)
      (arguments
       `(#:phases
         (modify-phases %standard-phases
           (add-after 'install 'wrap-programs
             (lambda* (#:key outputs #:allow-other-keys)
               ;; Make sure all executables in "bin" find the required Perl
               ;; modules at runtime.  As the PERL5LIB variable contains also
               ;; the paths of native inputs, we pick the transitive target
               ;; inputs from %build-inputs.
               ;;
               ;; NOTE(review): the entries are looked up by package name
               ;; and are the inputs' top-level directories -- confirm that
               ;; the labels match and that this is the intended PERL5LIB.
               (let* ((out         (assoc-ref outputs "out"))
                      (bin         (string-append out "/bin/"))
                      (own-modules (string-append out "/lib/perl5/site_perl"))
                      (input-dirs  (map (lambda (name)
                                          (assoc-ref %build-inputs name))
                                        ',transitive-input-names))
                      (path        (string-join (cons own-modules input-dirs)
                                                ":"))
                      (scripts     (find-files bin "\\.pl$")))
                 (for-each (lambda (file)
                             (wrap-program file
                               `("PERL5LIB" ":" prefix (,path))))
                           scripts)
                 #t))))))
      (inputs direct-inputs)
      (native-inputs
       `(("perl-test-most" ,perl-test-most)))
      (home-page "http://search.cpan.org/dist/BioPerl")
      (synopsis "Bioinformatics toolkit")
      (description
       "BioPerl is the product of a community effort to produce Perl code which
is useful in biology. Examples include Sequence objects, Alignment objects
and database searching objects. These objects not only do what they are
advertised to do in the documentation, but they also interact - Alignment
objects are made from the Sequence objects, Sequence objects have access to
Annotation and SeqFeature objects and databases, Blast objects can be
converted to Alignment objects, and so on. This means that the objects
provide a coordinated and extensible framework to do computational biology.")
      (license (package-license perl)))))
340
;; Biopython, built with the build system's default Python.
(define-public python-biopython
  (package
    (name "python-biopython")
    (version "1.65")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "http://biopython.org/DIST/biopython-"
                    version ".tar.gz"))
              (sha256
               (base32
                "13m8s9jkrw40zvdp1rl709n6lmgdh4f52aann7gzr6sfp0fwhg26"))))
    (build-system python-build-system)
    (inputs
     `(("python-numpy" ,python-numpy)))
    ;; This used to list python2-setuptools, which does not match the Python
    ;; variant this package is built with; the setuptools variant now agrees
    ;; with python-numpy above.
    (native-inputs
     `(("python-setuptools" ,python-setuptools)))
    (home-page "http://biopython.org/")
    (synopsis "Tools for biological computation in Python")
    (description
     "Biopython is a set of tools for biological computation including parsers
for bioinformatics files into Python data structures; interfaces to common
bioinformatics programs; a standard sequence class and tools for performing
common operations on them; code to perform data classification; code for
dealing with alignments; code making it easy to split up parallelizable tasks
into separate processes; and more.")
    (license (license:non-copyleft "http://www.biopython.org/DIST/LICENSE"))))
368
;; Python 2 variant of python-biopython; only the 'inputs' field is
;; overridden.
(define-public python2-biopython
  (let ((base (package-with-python2 python-biopython)))
    (package
      (inherit base)
      (inputs
       `(("python2-numpy" ,python2-numpy))))))
373
;; NCBI BLAST+.  The bundled bzip2 and zlib copies are removed from the
;; source, and the result is split into "out", "lib" and "include" outputs.
(define-public blast+
  (package
    (name "blast+")
    (version "2.2.31")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/"
             version "/ncbi-blast-" version "+-src.tar.gz"))
       (sha256
        (base32
         "19gq6as4k1jrgsd26158ads6h7v4jca3h4r5dzg1y0m6ya50x5ph"))
       (modules '((guix build utils)))
       (snippet
        '(begin
           ;; Remove bundled bzip2 and zlib
           (for-each delete-file-recursively
                     '("c++/src/util/compress/bzip2"
                       "c++/src/util/compress/zlib"))
           (substitute* "c++/src/util/compress/Makefile.in"
             (("bzip2 zlib api") "api"))
           ;; Remove useless msbuild directory
           (delete-file-recursively
            "c++/src/build-system/project_tree_builder/msbuild")
           #t))))
    (build-system gnu-build-system)
    (arguments
     `(;; There are three(!) tests for this massive library, and all fail with
       ;; "unparsable timing stats".
       ;; ERR [127] --  [util/regexp] test_pcre.sh     (unparsable timing stats)
       ;; ERR [127] --  [serial/datatool] datatool.sh     (unparsable timing stats)
       ;; ERR [127] --  [serial/datatool] datatool_xml.sh     (unparsable timing stats)
       #:tests? #f
       #:out-of-source? #t
       #:parallel-build? #f             ; not supported
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'enter-dir
           (lambda _
             (chdir "c++")
             #t))
         (add-after 'enter-dir 'fix-build-system
           (lambda _
             ;; Replacement text for a hardcoded command name, or #f (after
             ;; printing a warning) when the command cannot be found.
             (define (which* cmd)
               (if (string=? cmd "date")
                   ;; make call to "date" deterministic
                   "date -d @0"
                   (or (which cmd)
                       (begin
                         (format (current-error-port)
                                 "WARNING: Unable to find absolute path for ~s~%"
                                 cmd)
                         #f))))

             (define files-with-hardcoded-paths
               (append '("src/build-system/configure.ac"
                         "src/build-system/configure"
                         "scripts/common/impl/if_diff.sh"
                         "scripts/common/impl/run_with_lock.sh"
                         "src/build-system/Makefile.configurables.real"
                         "src/build-system/Makefile.in.top"
                         "src/build-system/Makefile.meta.gmake=no"
                         "src/build-system/Makefile.meta.in"
                         "src/build-system/Makefile.meta_l"
                         "src/build-system/Makefile.meta_p"
                         "src/build-system/Makefile.meta_r"
                         "src/build-system/Makefile.mk.in"
                         "src/build-system/Makefile.requirements"
                         "src/build-system/Makefile.rules_with_autodep.in")
                       (find-files "scripts/common/check" "\\.sh$")))

             ;; Rewrite hardcoded paths to various tools
             (substitute* files-with-hardcoded-paths
               (("(/usr/bin/|/bin/)([a-z][-_.a-z]*)" all dir cmd)
                (or (which* cmd) all)))

             (substitute* (find-files "src/build-system" "^config.*")
               (("LN_S=/bin/\\$LN_S") (string-append "LN_S=" (which "ln")))
               (("^PATH=.*") ""))

             ;; rewrite "/var/tmp" in check script
             (substitute* "scripts/common/check/check_make_unix.sh"
               (("/var/tmp") "/tmp"))

             ;; do not reset PATH
             (substitute* (find-files "scripts/common/impl/" "\\.sh$")
               (("^ *PATH=.*") "")
               (("action=/bin/") "action=")
               (("export PATH") ":"))
             #t))
         (add-before 'configure 'set-HOME
           ;; $HOME needs to be set at some point during the configure phase
           (lambda _
             (setenv "HOME" "/tmp")
             #t))
         (replace 'configure
           (lambda* (#:key inputs outputs #:allow-other-keys)
             (let* ((out     (assoc-ref outputs "out"))
                    (lib     (string-append (assoc-ref outputs "lib")
                                            "/lib"))
                    (include (string-append (assoc-ref outputs "include")
                                            "/include/ncbi-tools++"))
                    (flags
                     (list
                      (string-append "--with-build-root=" (getcwd) "/build")
                      (string-append "--prefix=" out)
                      (string-append "--libdir=" lib)
                      (string-append "--includedir=" include)
                      (string-append "--with-bz2="
                                     (assoc-ref inputs "bzip2"))
                      (string-append "--with-z="
                                     (assoc-ref inputs "zlib"))
                      ;; Each library is built twice by default, once
                      ;; with "-static" in its name, and again
                      ;; without.
                      "--without-static"
                      "--with-dll")))
               ;; The 'configure' script doesn't recognize things like
               ;; '--enable-fast-install'.
               (zero? (apply system* "./configure.orig" flags))))))))
    (outputs '("out"                    ;  19 MB
               "lib"                    ; 203 MB
               "include"))              ;  32 MB
    (inputs
     `(("bzip2" ,bzip2)
       ("zlib" ,zlib)))
    (native-inputs
     `(("cpio" ,cpio)))
    (home-page "http://blast.ncbi.nlm.nih.gov")
    (synopsis "Basic local alignment search tool")
    (description
     "BLAST is a popular method of performing a DNA or protein sequence
similarity search, using heuristics to produce results quickly. It also
calculates an “expect value” that estimates how many matches would have
occurred at a given score by chance, which can aid a user in judging how much
confidence to have in an alignment.")
    ;; Most of the sources are in the public domain, with the following
    ;; exceptions:
    ;;   * Expat:
    ;;     * ./c++/include/util/bitset/
    ;;     * ./c++/src/html/ncbi_menu*.js
    ;;   * Boost license:
    ;;     * ./c++/include/util/impl/floating_point_comparison.hpp
    ;;   * LGPL 2+:
    ;;     * ./c++/include/dbapi/driver/odbc/unix_odbc/
    ;;   * ASL 2.0:
    ;;     * ./c++/src/corelib/teamcity_*
    (license (list license:public-domain
                   license:expat
                   license:boost1.0
                   license:lgpl2.0+
                   license:asl2.0))))
520
;; Bowtie 2.  No configure phase; the upstream test script is run by a
;; custom 'check' phase and the built "bowtie2*" files are installed by hand.
(define-public bowtie
  (package
    (name "bowtie")
    (version "2.2.4")
    (source (origin
              (method url-fetch)
              (uri (string-append "https://github.com/BenLangmead/bowtie2/archive/v"
                                  version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "15dnbqippwvhyh9zqjhaxkabk7lm1xbh1nvar1x4b5kwm117zijn"))
              (modules '((guix build utils)))
              (snippet
               '(substitute* "Makefile"
                  (("^CC = .*$") "CC = gcc")
                  (("^CPP = .*$") "CPP = g++")
                  ;; replace BUILD_HOST and BUILD_TIME for deterministic build
                  (("-DBUILD_HOST=.*") "-DBUILD_HOST=\"\\\"guix\\\"\"")
                  (("-DBUILD_TIME=.*") "-DBUILD_TIME=\"\\\"0\\\"\"")))
              (patches (list (search-patch "bowtie-fix-makefile.patch")))))
    (build-system gnu-build-system)
    (inputs `(("perl" ,perl)
              ("perl-clone" ,perl-clone)
              ("perl-test-deep" ,perl-test-deep)
              ("perl-test-simple" ,perl-test-simple)
              ("python" ,python-2)))
    (arguments
     '(#:make-flags '("allall")
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (replace 'check
           (lambda* (#:key outputs #:allow-other-keys)
             ;; 'system*' returns an exit status, and every integer is true
             ;; in Scheme; without 'zero?' a failing test run would never
             ;; fail the build.
             (zero? (system* "perl"
                             "scripts/test/simple_tests.pl"
                             "--bowtie2=./bowtie2"
                             "--bowtie2-build=./bowtie2-build"))))
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((bin (string-append (assoc-ref outputs "out") "/bin/")))
               (mkdir-p bin)
               (for-each (lambda (file)
                           (copy-file file (string-append bin file)))
                         (find-files "." "bowtie2.*"))))))))
    (home-page "http://bowtie-bio.sourceforge.net/bowtie2/index.shtml")
    (synopsis "Fast and sensitive nucleotide sequence read aligner")
    (description
     "Bowtie 2 is a fast and memory-efficient tool for aligning sequencing
reads to long reference sequences. It is particularly good at aligning reads
of about 50 up to 100s or 1,000s of characters, and particularly good at
aligning to relatively long (e.g. mammalian) genomes. Bowtie 2 indexes the
genome with an FM Index to keep its memory footprint small: for the human
genome, its memory footprint is typically around 3.2 GB. Bowtie 2 supports
gapped, local, and paired-end alignment modes.")
    (supported-systems '("x86_64-linux"))
    (license license:gpl3+)))
581
;; BWA.  No configure phase; the binary, README and man page are copied
;; into the output by a hand-written install phase.
(define-public bwa
  (package
    (name "bwa")
    (version "0.7.12")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "mirror://sourceforge/bio-bwa/bwa-"
                           version ".tar.bz2"))
       (sha256
        (base32
         "1330dpqncv0px3pbhjzz1gwgg39kkcv2r9qp2xs0sixf8z8wl7bh"))))
    (build-system gnu-build-system)
    (inputs `(("zlib" ,zlib)))
    (arguments
     '(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         ;; no "configure" script
         (delete 'configure)
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out (assoc-ref outputs "out"))
                    (bin (string-append out "/bin"))
                    (doc (string-append out "/share/doc/bwa"))
                    (man (string-append out "/share/man/man1")))
               (for-each mkdir-p (list bin doc man))
               (copy-file "bwa" (string-append bin "/bwa"))
               (copy-file "README.md" (string-append doc "/README.md"))
               (copy-file "bwa.1" (string-append man "/bwa.1"))))))))
    (home-page "http://bio-bwa.sourceforge.net/")
    (synopsis "Burrows-Wheeler sequence aligner")
    (description
     "BWA is a software package for mapping low-divergent sequences against a
large reference genome, such as the human genome. It consists of three
algorithms: BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is
designed for Illumina sequence reads up to 100bp, while the rest two for
longer sequences ranged from 70bp to 1Mbp. BWA-MEM and BWA-SW share similar
features such as long-read support and split alignment, but BWA-MEM, which is
the latest, is generally recommended for high-quality queries as it is faster
and more accurate. BWA-MEM also has better performance than BWA-backtrack for
70-100bp Illumina reads.")
    (license license:gpl3+)))
628
;; bx-python, built with Python 2.  A source snippet strips the
;; "distribute_setup" bootstrap lines from setup.py.
(define-public python2-bx-python
  (package
    (name "python2-bx-python")
    (version "0.7.2")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://pypi.python.org/packages/source/b/bx-python/bx-python-"
             version ".tar.gz"))
       (sha256
        (base32
         "0ld49idhc5zjdvbhvjq1a2qmpjj7h5v58rqr25dzmfq7g34b50xh"))
       (modules '((guix build utils)))
       (snippet
        '(substitute* "setup.py"
           ;; remove dependency on outdated "distribute" module
           (("^from distribute_setup import use_setuptools") "")
           (("^use_setuptools\\(\\)") "")))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2
       #:tests? #f))            ;tests fail because test data are not included
    (native-inputs
     `(("python-nose" ,python2-nose)
       ("python-setuptools" ,python2-setuptools)))
    (inputs
     `(("python-numpy" ,python2-numpy)
       ("zlib" ,zlib)))
    (home-page "http://bitbucket.org/james_taylor/bx-python/")
    (synopsis "Tools for manipulating biological data")
    (description
     "bx-python provides tools for manipulating biological data, particularly
multiple sequence alignments.")
    (license license:expat)))
663
;; CLIPper, built with Python 2.  A source snippet drops the
;; "setup_requires" line from setup.py.
(define-public clipper
  (package
    (name "clipper")
    (version "0.3.0")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/YeoLab/clipper/archive/"
             version ".tar.gz"))
       (sha256
        (base32
         "1q7jpimsqln7ic44i8v2rx2haj5wvik8hc1s2syd31zcn0xk1iyq"))
       (modules '((guix build utils)))
       (snippet
        ;; remove unnecessary setup dependency
        '(substitute* "setup.py"
           (("setup_requires = .*") "")))))
    (build-system python-build-system)
    (arguments `(#:python ,python-2))   ; only Python 2 is supported
    (native-inputs
     `(("python-mock" ,python2-mock)    ; for tests
       ("python-pytz" ,python2-pytz)    ; for tests
       ("python-setuptools" ,python2-setuptools)))
    (inputs
     `(("htseq" ,htseq)
       ("python-pybedtools" ,python2-pybedtools)
       ("python-cython" ,python2-cython)
       ("python-scikit-learn" ,python2-scikit-learn)
       ("python-matplotlib" ,python2-matplotlib)
       ("python-pysam" ,python2-pysam)
       ("python-numpy" ,python2-numpy)
       ("python-scipy" ,python2-scipy)))
    (home-page "https://github.com/YeoLab/clipper")
    (synopsis "CLIP peak enrichment recognition")
    (description
     "CLIPper is a tool to define peaks in CLIP-seq datasets.")
    (license license:gpl2)))
701
;; COUGER.  Nothing is configured or built: the "src" tree and the "couger"
;; script are copied into the output, and the script is wrapped so that it
;; runs with the build-time PYTHONPATH.
(define-public couger
  (package
    (name "couger")
    (version "1.8.2")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "http://couger.oit.duke.edu/static/assets/COUGER"
             version ".zip"))
       (sha256
        (base32
         "04p2b14nmhzxw5h72mpzdhalv21bx4w9b87z0wpw0xzxpysyncmq"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (delete 'build)
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out    (assoc-ref outputs "out"))
                    (bindir (string-append out "/bin")))
               ;; Add "src" directory to module lookup path.
               (substitute* "couger"
                 (("from argparse")
                  (string-append "import sys\nsys.path.append(\""
                                 out "\")\nfrom argparse")))
               (copy-recursively "src" (string-append out "/src"))
               (mkdir bindir)
               (copy-file "couger" (string-append bindir "/couger")))
             #t))
         (add-after 'install 'wrap-program
           (lambda* (#:key outputs #:allow-other-keys)
             ;; Make sure 'couger' runs with the correct PYTHONPATH.
             (let ((program (string-append (assoc-ref outputs "out")
                                           "/bin/couger"))
                   (path    (getenv "PYTHONPATH")))
               (wrap-program program
                 `("PYTHONPATH" ":" prefix (,path))))
             #t)))))
    (native-inputs
     `(("unzip" ,unzip)))
    (inputs
     `(("python" ,python-2)
       ("python2-pillow" ,python2-pillow)
       ("python2-numpy" ,python2-numpy)
       ("python2-scipy" ,python2-scipy)
       ("python2-matplotlib" ,python2-matplotlib)))
    (propagated-inputs
     `(("r" ,r)
       ("libsvm" ,libsvm)
       ("randomjungle" ,randomjungle)))
    (home-page "http://couger.oit.duke.edu")
    (synopsis "Identify co-factors in sets of genomic regions")
    (description
     "COUGER can be applied to any two sets of genomic regions bound by
paralogous TFs (e.g., regions derived from ChIP-seq experiments) to identify
putative co-factors that provide specificity to each TF. The framework
determines the genomic targets uniquely-bound by each TF, and identifies a
small set of co-factors that best explain the in vivo binding differences
between the two TFs.

COUGER uses classification algorithms (support vector machines and random
forests) with features that reflect the DNA binding specificities of putative
co-factors. The features are generated either from high-throughput TF-DNA
binding data (from protein binding microarray experiments), or from large
collections of DNA motifs.")
    (license license:gpl3+)))
771
;; Clustal Omega.  Uses the GNU build system without any custom arguments;
;; argtable is the only declared input.
(define-public clustal-omega
  (package
    (name "clustal-omega")
    (version "1.2.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "http://www.clustal.org/omega/clustal-omega-"
             version ".tar.gz"))
       (sha256
        (base32
         "02ibkx0m0iwz8nscg998bh41gg251y56cgh86bvyrii5m8kjgwqf"))))
    (build-system gnu-build-system)
    (inputs
     `(("argtable" ,argtable)))
    (synopsis "Multiple sequence aligner for protein and DNA/RNA")
    (description
     "Clustal-Omega is a general purpose multiple sequence alignment (MSA)
program for protein and DNA/RNA. It produces high quality MSAs and is capable
of handling data-sets of hundreds of thousands of sequences in reasonable
time.")
    (home-page "http://www.clustal.org/omega/")
    (license license:gpl2+)))
795
;; CrossMap, built with Python 2.  The bundled pysam copy is deleted from
;; the source, and CROSSMAP_USE_SYSTEM_PYSAM is set right after unpacking.
(define-public crossmap
  (package
    (name "crossmap")
    (version "0.1.6")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "mirror://sourceforge/crossmap/CrossMap-"
                           version ".tar.gz"))
       (sha256
        (base32
         "163hi5gjgij6cndxlvbkp5jjwr0k4wbm9im6d2210278q7k9kpnp"))
       ;; patch has been sent upstream already
       (patches (list
                 (search-patch "crossmap-allow-system-pysam.patch")))
       (modules '((guix build utils)))
       ;; remove bundled copy of pysam
       (snippet
        '(delete-file-recursively "lib/pysam"))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'set-env
           (lambda _
             (setenv "CROSSMAP_USE_SYSTEM_PYSAM" "1"))))))
    (native-inputs
     `(("python-cython" ,python2-cython)
       ("python-nose" ,python2-nose)
       ("python-setuptools" ,python2-setuptools)))
    (inputs
     `(("python-numpy" ,python2-numpy)
       ("python-pysam" ,python2-pysam)
       ("zlib" ,zlib)))
    (home-page "http://crossmap.sourceforge.net/")
    (synopsis "Convert genome coordinates between assemblies")
    (description
     "CrossMap is a program for conversion of genome coordinates or annotation
files between different genome assemblies. It supports most commonly used
file formats including SAM/BAM, Wiggle/BigWig, BED, GFF/GTF, VCF.")
    (license license:gpl2+)))
837
;; cutadapt.  The stock 'check' phase is removed and a replacement is run
;; after 'install', with the installed site-packages directory appended to
;; PYTHONPATH.
(define-public cutadapt
  (package
    (name "cutadapt")
    (version "1.8")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://github.com/marcelm/cutadapt/archive/v"
                    version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "161bp87y6gd6r5bmvjpn2b1k942i3fizfpa139f0jn6jv1wcp5h5"))))
    (build-system python-build-system)
    (arguments
     ;; tests must be run after install
     `(#:phases
       (modify-phases %standard-phases
         (delete 'check)
         (add-after 'install 'check
           (lambda* (#:key inputs outputs #:allow-other-keys)
             ;; Derive "MAJOR.MINOR" from the version suffix of the
             ;; "python" input's file name (the text after its last dash)
             ;; instead of slicing a fixed number of trailing characters,
             ;; which only works for versions that are exactly five
             ;; characters long.
             (let* ((python  (assoc-ref inputs "python"))
                    (version (string-drop
                              python
                              (1+ (string-rindex python #\-))))
                    (major+minor
                     (string-join
                      (list-head (string-split version #\.) 2)
                      ".")))
               (setenv "PYTHONPATH"
                       (string-append
                        (getenv "PYTHONPATH")
                        ":" (assoc-ref outputs "out")
                        "/lib/python" major+minor
                        "/site-packages"))
               (zero? (system* "nosetests" "-P" "tests"))))))))
    (native-inputs
     `(("python-cython" ,python-cython)
       ("python-nose" ,python-nose)
       ("python-setuptools" ,python-setuptools)))
    (home-page "https://code.google.com/p/cutadapt/")
    (synopsis "Remove adapter sequences from nucleotide sequencing reads")
    (description
     "Cutadapt finds and removes adapter sequences, primers, poly-A tails and
other types of unwanted sequence from high-throughput sequencing reads.")
    (license license:expat)))
877
;; DIAMOND: BLAST-compatible local sequence aligner.
(define-public diamond
  (package
    (name "diamond")
    (version "0.7.9")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://github.com/bbuchfink/diamond/archive/v"
                    version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "0hfkcfv9f76h5brbyw9fyvmc0l9cmbsxrcdqk0fa9xv82zj47p15"))
              ;; Drop the "bin/diamond" file shipped in the archive
              ;; (presumably a pre-built binary); the 'install phase below
              ;; copies the file produced by the build from that location.
              (snippet '(begin
                          (delete-file "bin/diamond")
                          #t))))
    (build-system gnu-build-system)
    (arguments
     '(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         ;; The build is run from within the "src" sub-directory.
         (add-after 'unpack 'enter-source-dir
           (lambda _
             (chdir "src")
             #t))
         (delete 'configure)
         ;; Copy the single executable into the output by hand.
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out    (assoc-ref outputs "out"))
                    (bindir (string-append out "/bin")))
               (mkdir-p bindir)
               (copy-file "../bin/diamond"
                          (string-append bindir "/diamond"))
               #t))))))
    (native-inputs
     `(("bc" ,bc)))
    (inputs
     `(("boost" ,boost)
       ("zlib" ,zlib)))
    (home-page "https://github.com/bbuchfink/diamond")
    (synopsis "Accelerated BLAST compatible local sequence aligner")
    (description
     "DIAMOND is a BLAST-compatible local aligner for mapping protein and
translated DNA query sequences against a protein reference database (BLASTP
and BLASTX alignment mode). The speedup over BLAST is up to 20,000 on short
reads at a typical sensitivity of 90-99% relative to BLAST depending on the
data and settings.")
    (license (license:non-copyleft "file://src/COPYING"
                                   "See src/COPYING in the distribution."))))
927
;; Entrez Direct: command-line access to the NCBI databases.  Only the
;; "edirect.pl" script is installed; it is wrapped so that it finds its Perl
;; dependencies at run time.
(define-public edirect
  (package
    (name "edirect")
    (version "2.50")
    (source (origin
              (method url-fetch)
              ;; Note: older versions are not retained.
              (uri "ftp://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/edirect.zip")
              (sha256
               (base32
                "08afhz2ph66h8h381hl1mqyxkdi5nbvzsyj9gfw3jfbdijnpi4qj"))))
    (build-system perl-build-system)
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (delete 'build)
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((target (string-append (assoc-ref outputs "out")
                                          "/bin")))
               (mkdir-p target)
               (copy-file "edirect.pl"
                          (string-append target "/edirect.pl"))
               #t)))
         (add-after 'install 'wrap-program
           (lambda* (#:key outputs #:allow-other-keys)
             ;; Make sure 'edirect.pl' finds all perl inputs at runtime.
             (let* ((out (assoc-ref outputs "out"))
                    (path (getenv "PERL5LIB")))
               (wrap-program (string-append out "/bin/edirect.pl")
                 `("PERL5LIB" ":" prefix (,path)))
               ;; Return an explicit true value, like the other phases in
               ;; this file, instead of relying on the result of
               ;; 'wrap-program'.
               #t))))))
    (inputs
     `(("perl-html-parser" ,perl-html-parser)
       ("perl-encode-locale" ,perl-encode-locale)
       ("perl-file-listing" ,perl-file-listing)
       ("perl-html-tagset" ,perl-html-tagset)
       ("perl-html-tree" ,perl-html-tree)
       ("perl-http-cookies" ,perl-http-cookies)
       ("perl-http-date" ,perl-http-date)
       ("perl-http-message" ,perl-http-message)
       ("perl-http-negotiate" ,perl-http-negotiate)
       ("perl-lwp-mediatypes" ,perl-lwp-mediatypes)
       ("perl-lwp-protocol-https" ,perl-lwp-protocol-https)
       ("perl-net-http" ,perl-net-http)
       ("perl-uri" ,perl-uri)
       ("perl-www-robotrules" ,perl-www-robotrules)
       ("perl" ,perl)))
    (native-inputs
     `(("unzip" ,unzip)))
    (home-page "http://www.ncbi.nlm.nih.gov/books/NBK179288")
    (synopsis "Tools for accessing the NCBI's set of databases")
    (description
     "Entrez Direct (EDirect) is a method for accessing the National Center
for Biotechnology Information's (NCBI) set of interconnected
databases (publication, sequence, structure, gene, variation, expression,
etc.) from a terminal. Functions take search terms from command-line
arguments. Individual operations are combined to build multi-step queries.
Record retrieval and formatting normally complete the process.

EDirect also provides an argument-driven function that simplifies the
extraction of data from document summaries or other results that are returned
in structured XML format. This can eliminate the need for writing custom
software to answer ad hoc questions.")
    (license license:public-domain)))
995
;; eXpress: streaming quantification of target sequence abundances.
(define-public express
  (package
    (name "express")
    (version "1.5.1")
    (source (origin
              (method url-fetch)
              (uri
               (string-append
                "http://bio.math.berkeley.edu/eXpress/downloads/express-"
                version "/express-" version "-src.tgz"))
              (sha256
               (base32
                "03rczxd0gjp2l1jxcmjfmf5j94j77zqyxa6x063zsc585nj40n0c"))))
    (build-system cmake-build-system)
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'use-shared-boost-libs-and-set-bamtools-paths
           (lambda* (#:key inputs #:allow-other-keys)
             (let ((bamtools (assoc-ref inputs "bamtools")))
               ;; Turn off static Boost linking and point the bamtools
               ;; header directory at the "bamtools" input.
               (substitute* "CMakeLists.txt"
                 (("set\\(Boost_USE_STATIC_LIBS ON\\)")
                  "set(Boost_USE_STATIC_LIBS OFF)")
                 (("\\$\\{CMAKE_CURRENT_SOURCE_DIR\\}/bamtools/include")
                  (string-append bamtools "/include/bamtools")))
               ;; Likewise for the bamtools library directory.
               (substitute* "src/CMakeLists.txt"
                 (("\\$\\{CMAKE_CURRENT_SOURCE_DIR\\}/\\.\\./bamtools/lib")
                  (string-append bamtools "/lib/bamtools")))
               #t))))))
    (inputs
     `(("boost" ,boost)
       ("bamtools" ,bamtools)
       ("protobuf" ,protobuf)
       ("zlib" ,zlib)))
    (home-page "http://bio.math.berkeley.edu/eXpress")
    (synopsis "Streaming quantification for high-throughput genomic sequencing")
    (description
     "eXpress is a streaming tool for quantifying the abundances of a set of
target sequences from sampled subsequences. Example applications include
transcript-level RNA-Seq quantification, allele-specific/haplotype expression
analysis (from RNA-Seq), transcription factor binding quantification in
ChIP-Seq, and analysis of metagenomic data.")
    (license license:artistic2.0)))
1040
;; FastTree: approximately-maximum-likelihood phylogenetic trees.  The
;; source is a single C file, compiled twice: once as "FastTree" and once
;; with OpenMP flags as "FastTreeMP".
(define-public fasttree
  (package
    (name "fasttree")
    (version "2.1.8")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "http://www.microbesonline.org/fasttree/FastTree-"
                    version ".c"))
              (sha256
               (base32
                "0dzqc9vr9iiiw21y159xfjl2z90vw0y7r4x6456pcaxiy5hd2wmi"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ; no "check" target
       #:phases
       (modify-phases %standard-phases
         ;; The source is a plain ".c" file, so there is nothing to unpack
         ;; or configure.
         (delete 'unpack)
         (delete 'configure)
         (replace 'build
           (lambda* (#:key source #:allow-other-keys)
             ;; COMPILE runs gcc on SOURCE to produce OUTPUT.  EXTRA-FLAGS
             ;; are placed before the optimisation flags shared by both
             ;; variants.
             (let ((compile
                    (lambda (output . extra-flags)
                      (zero? (apply system* "gcc"
                                    (append extra-flags
                                            (list "-O3"
                                                  "-finline-functions"
                                                  "-funroll-loops"
                                                  "-Wall"
                                                  "-o" output
                                                  source
                                                  "-lm")))))))
               (and (compile "FastTree")
                    (compile "FastTreeMP" "-DOPENMP" "-fopenmp")))))
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((bin (string-append (assoc-ref outputs "out")
                                       "/bin")))
               (mkdir-p bin)
               (copy-file "FastTree"
                          (string-append bin "/FastTree"))
               (copy-file "FastTreeMP"
                          (string-append bin "/FastTreeMP"))
               #t))))))
    (home-page "http://www.microbesonline.org/fasttree")
    (synopsis "Infers approximately-maximum-likelihood phylogenetic trees")
    (description
     "FastTree can handle alignments with up to a million sequences in a
reasonable amount of time and memory. For large alignments, FastTree is
100-1,000 times faster than PhyML 3.0 or RAxML 7.")
    (license license:gpl2+)))
1099
;; FASTX-Toolkit: command line tools for preprocessing FASTA/FASTQ files.
;; Built with the default phases of the GNU build system.
(define-public fastx-toolkit
  (package
    (name "fastx-toolkit")
    (version "0.0.14")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/agordon/fastx_toolkit/releases/download/"
             version "/fastx_toolkit-" version ".tar.bz2"))
       (sha256
        (base32
         "01jqzw386873sr0pjp1wr4rn8fsga2vxs1qfmicvx1pjr72007wy"))))
    (build-system gnu-build-system)
    (native-inputs
     `(("pkg-config" ,pkg-config)))
    (inputs
     `(("libgtextutils" ,libgtextutils)))
    (home-page "http://hannonlab.cshl.edu/fastx_toolkit/")
    (synopsis "Tools for FASTA/FASTQ file preprocessing")
    (description
     "The FASTX-Toolkit is a collection of command line tools for Short-Reads
FASTA/FASTQ files preprocessing.

Next-Generation sequencing machines usually produce FASTA or FASTQ files,
containing multiple short-reads sequences. The main processing of such
FASTA/FASTQ files is mapping the sequences to reference genomes. However, it
is sometimes more productive to preprocess the files before mapping the
sequences to the genome---manipulating the sequences to produce better mapping
results. The FASTX-Toolkit tools perform some of these preprocessing tasks.")
    (license license:agpl3+)))
1131
;; Flexbar: barcode and adapter removal for sequencing reads.
(define-public flexbar
  (package
    (name "flexbar")
    (version "2.5")
    (source (origin
              (method url-fetch)
              (uri
               (string-append "mirror://sourceforge/flexbar/"
                              version "/flexbar_v" version "_src.tgz"))
              (sha256
               (base32
                "13jaykc3y1x8y5nn9j8ljnb79s5y51kyxz46hdmvvjj6qhyympmf"))))
    (build-system cmake-build-system)
    (arguments
     ;; FLEXBAR_BINARY_DIR is set to the output's "bin" directory and the
     ;; 'install phase is removed (presumably the build itself places the
     ;; executable there -- the 'check phase below looks for it in that
     ;; directory).
     `(#:configure-flags (list
                          (string-append "-DFLEXBAR_BINARY_DIR="
                                         (assoc-ref %outputs "out")
                                         "/bin/"))
       #:phases
       (modify-phases %standard-phases
         (delete 'install)
         (replace 'check
           (lambda* (#:key outputs #:allow-other-keys)
             ;; The validation script is run with the output's "bin"
             ;; directory at the front of PATH.
             (setenv "PATH" (string-append
                             (assoc-ref outputs "out") "/bin:"
                             (getenv "PATH")))
             ;; Derive the source directory name from the package version
             ;; rather than hard-coding it, so it follows version updates.
             (chdir ,(string-append "../flexbar_v" version "_src/test"))
             (zero? (system* "bash" "flexbar_validate.sh")))))))
    (inputs
     `(("tbb" ,tbb)
       ("zlib" ,zlib)))
    (native-inputs
     `(("pkg-config" ,pkg-config)
       ("seqan" ,seqan)))
    (home-page "http://flexbar.sourceforge.net")
    (synopsis "Barcode and adapter removal tool for sequencing platforms")
    (description
     "Flexbar preprocesses high-throughput nucleotide sequencing data
efficiently. It demultiplexes barcoded runs and removes adapter sequences.
Moreover, trimming and filtering features are provided. Flexbar increases
read mapping rates and improves genome and transcriptome assemblies. It
supports next-generation sequencing data in fasta/q and csfasta/q format from
Illumina, Roche 454, and the SOLiD platform.")
    (license license:gpl3)))
1176
;; GRIT: integrative analysis of RNA-seq type assays (Python 2 package).
(define-public grit
  (package
    (name "grit")
    (version "2.0.2")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://github.com/nboley/grit/archive/"
                    version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "157in84dj70wimbind3x7sy1whs3h57qfgcnj2s6lrd38fbrb7mj"))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'generate-from-cython-sources
           (lambda* (#:key inputs #:allow-other-keys)
             (let ((numpy-include
                    (string-append
                     (assoc-ref inputs "python-numpy")
                     "/lib/python2.7/site-packages/numpy/core/include/")))
               ;; Delete these C files to force fresh generation from pyx
               ;; sources.
               (delete-file "grit/sparsify_support_fns.c")
               (delete-file "grit/call_peaks_support_fns.c")
               (substitute* "setup.py"
                 (("Cython.Setup") "Cython.Build")
                 ;; Add numpy include path to fix compilation
                 (("pyx\", \\]")
                  (string-append "pyx\", ], include_dirs = ['"
                                 numpy-include
                                 "']")))
               #t))))))
    (inputs
     `(("python-scipy" ,python2-scipy)
       ("python-numpy" ,python2-numpy)
       ("python-pysam" ,python2-pysam)
       ("python-networkx" ,python2-networkx)))
    (native-inputs
     `(("python-cython" ,python2-cython)
       ("python-setuptools" ,python2-setuptools)))
    (home-page "http://grit-bio.org")
    (synopsis "Tool for integrative analysis of RNA-seq type assays")
    (description
     "GRIT is designed to use RNA-seq, TES, and TSS data to build and quantify
full length transcript models. When none of these data sources are available,
GRIT can be run by providing a candidate set of TES or TSS sites. In
addition, GRIT can merge in reference junctions and gene boundaries. GRIT can
also be run in quantification mode, where it uses a provided GTF file and just
estimates transcript expression.")
    (license license:gpl3+)))
1227
;; HISAT: spliced aligner for RNA-seq reads.  There is no configure script;
;; the Makefile is patched and the executables are copied by hand.
(define-public hisat
  (package
    (name "hisat")
    (version "0.1.4")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "http://ccb.jhu.edu/software/hisat/downloads/hisat-"
                    version "-beta-source.zip"))
              (sha256
               (base32
                "1k381ydranqxp09yf2y7w1d0chz5d59vb6jchi89hbb0prq19lk5"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ;no check target
       #:make-flags '("allall"
                      ;; Disable unsupported `popcnt' instructions on
                      ;; architectures other than x86_64
                      ,@(if (string-prefix? "x86_64"
                                            (or (%current-target-system)
                                                (%current-system)))
                            '()
                            '("POPCNT_CAPABILITY=0")))
       #:phases
       (modify-phases %standard-phases
         (delete 'configure)
         (add-after 'unpack 'patch-sources
           (lambda _
             ;; XXX Cannot use snippet because zip files are not supported
             (substitute* "Makefile"
               (("^CC = .*$") "CC = gcc")
               (("^CPP = .*$") "CPP = g++")
               ;; replace BUILD_HOST and BUILD_TIME for deterministic build
               (("-DBUILD_HOST=.*") "-DBUILD_HOST=\"\\\"guix\\\"\"")
               (("-DBUILD_TIME=.*") "-DBUILD_TIME=\"\\\"0\\\"\""))
             ;; Replace "/usr/bin/env" in the scripts with the "env" found
             ;; on PATH.
             (substitute* '("hisat-build" "hisat-inspect")
               (("/usr/bin/env") (which "env")))
             ;; Return an explicit true value, like the other phases in
             ;; this file, instead of relying on the result of
             ;; 'substitute*'.
             #t))
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((bin (string-append (assoc-ref outputs "out") "/bin/")))
               (mkdir-p bin)
               ;; Copy every file whose name matches the hisat executable
               ;; pattern into the output's "bin" directory.
               (for-each
                (lambda (file)
                  (copy-file file (string-append bin file)))
                (find-files
                 "."
                 "hisat(-(build|align|inspect)(-(s|l)(-debug)*)*)*$"))
               ;; 'for-each' has no useful result; report success
               ;; explicitly.
               #t))))))
    (native-inputs
     `(("unzip" ,unzip)))
    (inputs
     `(("perl" ,perl)
       ("python" ,python)
       ("zlib" ,zlib)))
    (home-page "http://ccb.jhu.edu/software/hisat/index.shtml")
    (synopsis "Hierarchical indexing for spliced alignment of transcripts")
    (description
     "HISAT is a fast and sensitive spliced alignment program for mapping
RNA-seq reads. In addition to one global FM index that represents a whole
genome, HISAT uses a large set of small FM indexes that collectively cover the
whole genome. These small indexes (called local indexes) combined with
several alignment strategies enable effective alignment of RNA-seq reads, in
particular, reads spanning multiple exons.")
    (license license:gpl3+)))
1292
;; HMMER: biosequence analysis using profile hidden Markov models.
(define-public hmmer
  (package
    (name "hmmer")
    (version "3.1b2")
    (source (origin
              (method url-fetch)
              ;; The download directory embeds a prefix of the version,
              ;; computed with 'version-prefix'.
              (uri (string-append
                    "http://selab.janelia.org/software/hmmer"
                    (version-prefix version 1) "/"
                    version "/hmmer-" version ".tar.gz"))
              (sha256
               (base32
                "0djmgc0pfli0jilfx8hql1axhwhqxqb8rxg2r5rg07aw73sfs5nx"))))
    (build-system gnu-build-system)
    ;; The unquote belongs to the package variable, not to the label.
    (native-inputs `(("perl" ,perl)))
    (home-page "http://hmmer.janelia.org")
    (synopsis "Biosequence analysis using profile hidden Markov models")
    (description
     "HMMER is used for searching sequence databases for homologs of protein
sequences, and for making protein sequence alignments. It implements methods
using probabilistic models called profile hidden Markov models (profile
HMMs).")
    (license (list license:gpl3+
                   ;; The bundled library 'easel' is distributed
                   ;; under The Janelia Farm Software License.
                   (license:non-copyleft
                    "file://easel/LICENSE"
                    "See easel/LICENSE in the distribution.")))))
1321
;; HTSeq: Python infrastructure for high-throughput sequencing data.
(define-public htseq
  (package
    (name "htseq")
    (version "0.6.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://pypi.python.org/packages/source/H/HTSeq/HTSeq-"
             version ".tar.gz"))
       (sha256
        (base32
         "1i85ppf2j2lj12m0x690qq5nn17xxk23pbbx2c83r8ayb5wngzwv"))))
    (build-system python-build-system)
    ;; only Python 2 is supported
    (arguments
     `(#:python ,python-2))
    ;; NOTE(review): setuptools is listed as a regular input here, whereas
    ;; most Python packages in this file declare it as a native input --
    ;; confirm whether that is intentional.
    (inputs
     `(("python-numpy" ,python2-numpy)
       ("python-setuptools" ,python2-setuptools)))
    (home-page "http://www-huber.embl.de/users/anders/HTSeq/")
    (synopsis "Analysing high-throughput sequencing data with Python")
    (description
     "HTSeq is a Python package that provides infrastructure to process data
from high-throughput sequencing assays.")
    (license license:gpl3+)))
1345
;; HTSJDK: Java library for high-throughput sequencing file formats.  The
;; build is driven by Ant; the only phase doing real work is 'build, which
;; writes straight into the output directory.
(define-public htsjdk
  (package
    (name "htsjdk")
    (version "1.129")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://github.com/samtools/htsjdk/archive/"
                    version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "0asdk9b8jx2ij7yd6apg9qx03li8q7z3ml0qy2r2qczkra79y6fw"))
              (modules '((guix build utils)))
              ;; remove build dependency on git
              (snippet '(substitute* "build.xml"
                          (("failifexecutionfails=\"true\"")
                           "failifexecutionfails=\"false\"")))))
    (build-system gnu-build-system)
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         ;; Only the Ant invocation below is needed.
         (delete 'configure)
         (delete 'check)
         (delete 'install)
         (replace 'build
           (lambda _
             (setenv "JAVA_HOME" (assoc-ref %build-inputs "jdk"))
             ;; The "dist" property is pointed at the output directory.
             (zero? (system* "ant" "all"
                             (string-append "-Ddist="
                                            (assoc-ref %outputs "out")
                                            "/share/java/htsjdk/"))))))))
    (native-inputs
     `(("ant" ,ant)
       ("jdk" ,icedtea6 "jdk")))
    (home-page "http://samtools.github.io/htsjdk/")
    (synopsis "Java API for high-throughput sequencing data (HTS) formats")
    (description
     "HTSJDK is an implementation of a unified Java library for accessing
common file formats, such as SAM and VCF, used for high-throughput
sequencing (HTS) data. There are also a number of useful utilities for
manipulating HTS data.")
    (license license:expat)))
1390
;; HTSlib: C library for high-throughput sequencing data.
(define-public htslib
  (package
    (name "htslib")
    (version "1.2.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/samtools/htslib/releases/download/"
             version "/htslib-" version ".tar.bz2"))
       (sha256
        (base32
         "1c32ssscbnjwfw3dra140fq7riarp2x990qxybh34nr1p5r17nxx"))))
    (build-system gnu-build-system)
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         ;; Replace "/bin/bash" in the test driver with the "bash" found
         ;; on PATH.
         (add-after 'unpack 'patch-tests
           (lambda _
             (substitute* "test/test.pl"
               (("/bin/bash") (which "bash")))
             #t)))))
    (native-inputs
     `(("perl" ,perl)))
    (inputs
     `(("zlib" ,zlib)))
    (home-page "http://www.htslib.org")
    (synopsis "C library for reading/writing high-throughput sequencing data")
    (description
     "HTSlib is a C library for reading/writing high-throughput sequencing
data. It also provides the bgzip, htsfile, and tabix utilities.")
    ;; Files under cram/ are released under the modified BSD license;
    ;; the rest is released under the Expat license
    (license (list license:expat license:bsd-3))))
1425
;; IDR: irreproducible discovery rate measurement.  The installed "idr"
;; executable is wrapped so that PYTHONPATH includes its Python inputs.
(define-public idr
  (package
    (name "idr")
    (version "2.0.0")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://github.com/nboley/idr/archive/"
                    version ".tar.gz"))
              (file-name (string-append name "-" version ".tar.gz"))
              (sha256
               (base32
                "1k3x44biak00aiv3hpm1yd6nn4hhp7n0qnbs3zh2q9sw7qr1qj5r"))))
    (build-system python-build-system)
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         (add-after 'install 'wrap-program
           (lambda* (#:key inputs outputs #:allow-other-keys)
             (let* ((python (assoc-ref inputs "python"))
                    ;; Last five characters of the "python" input path,
                    ;; truncated to the first three.
                    (py-version (string-take
                                 (string-take-right python 5) 3))
                    ;; Site-packages directory of the input labelled LABEL.
                    (site-packages
                     (lambda (label)
                       (string-append (assoc-ref inputs label)
                                      "/lib/python" py-version
                                      "/site-packages")))
                    (search-path
                     (string-join (map site-packages
                                       '("python-scipy"
                                         "python-numpy"
                                         "python-matplotlib"))
                                  ":"))
                    (program (string-append (assoc-ref outputs "out")
                                            "/bin/idr")))
               (wrap-program program
                 `("PYTHONPATH" ":" prefix (,search-path))))
             #t)))))
    (inputs
     `(("python-scipy" ,python-scipy)
       ("python-numpy" ,python-numpy)
       ("python-matplotlib" ,python-matplotlib)))
    (native-inputs
     `(("python-cython" ,python-cython)
       ("python-setuptools" ,python-setuptools)))
    (home-page "https://github.com/nboley/idr")
    (synopsis "Tool to measure the irreproducible discovery rate (IDR)")
    (description
     "The IDR (Irreproducible Discovery Rate) framework is a unified approach
to measure the reproducibility of findings identified from replicate
experiments and provide highly stable thresholds based on reproducibility.")
    (license license:gpl3+)))
1475
;; MACS: model-based analysis of ChIP-Seq data (Python 2 package).
(define-public macs
  (package
    (name "macs")
    (version "2.1.0.20140616")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://pypi.python.org/packages/source/M/MACS2/MACS2-"
                    version ".tar.gz"))
              (sha256
               (base32
                "11lmiw6avqhwn75sn59g4lfkrr2kk20r3rgfbx9xfqb8rg9mi2n6"))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2               ; only compatible with Python 2.7
       #:tests? #f))                    ; no test target
    (inputs
     `(("python-numpy" ,python2-numpy)))
    (native-inputs
     `(("python-setuptools" ,python2-setuptools)))
    (home-page "http://github.com/taoliu/MACS/")
    (synopsis "Model based analysis for ChIP-Seq data")
    ;; The binding sites in question are those of transcription factors;
    ;; the wording below uses the established term.
    (description
     "MACS is an implementation of a ChIP-Seq analysis algorithm for
identifying transcription factor binding sites named Model-based Analysis of
ChIP-Seq (MACS). MACS captures the influence of genome complexity to evaluate
the significance of enriched ChIP regions and it improves the spatial
resolution of binding sites through combining the information of both
sequencing tag position and orientation.")
    (license license:bsd-3)))
1506
1507
;; MetaBAT: metagenome binning.  The project is built with SCons; a single
;; "scons ... install" invocation is run in the 'build phase.
(define-public metabat
  (package
    (name "metabat")
    (version "0.26.1")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://bitbucket.org/berkeleylab/metabat/get/"
                    version ".tar.bz2"))
              (file-name (string-append name "-" version ".tar.bz2"))
              (sha256
               (base32
                "0vgrhbaxg4dkxyax2kbigak7w0arhqvw0szwp6gd9wmyilc44kfa"))))
    (build-system gnu-build-system)
    (arguments
     `(#:phases
       (modify-phases %standard-phases
         ;; Rewrite references to headers under "bam/" so that they name
         ;; "samtools/" instead.
         (add-after 'unpack 'fix-includes
           (lambda _
             (substitute* "SConstruct"
               (("/include/bam/bam.h")
                "/include/samtools/bam.h"))
             (substitute* "src/BamUtils.h"
               (("^#include \"bam/bam\\.h\"")
                "#include \"samtools/bam.h\"")
               (("^#include \"bam/sam\\.h\"")
                "#include \"samtools/sam.h\""))
             (substitute* "src/KseqReader.h"
               (("^#include \"bam/kseq\\.h\"")
                "#include \"samtools/kseq.h\""))
             #t))
         (add-after 'unpack 'fix-scons
           (lambda _
             (substitute* "SConstruct"  ; Do not distribute README
               (("^env\\.Install\\(idir_prefix, 'README\\.md'\\)")
                ""))
             #t))
         (delete 'configure)
         (replace 'build
           (lambda* (#:key inputs outputs #:allow-other-keys)
             (let ((out (assoc-ref outputs "out"))
                   ;; "VARIABLE=<directory of the input labelled LABEL>"
                   (scons-variable
                    (lambda (variable label)
                      (string-append variable "="
                                     (assoc-ref inputs label)))))
               (mkdir out)
               (zero? (system* "scons"
                               (string-append "PREFIX=" out)
                               (scons-variable "HTSLIB_DIR" "htslib")
                               (scons-variable "SAMTOOLS_DIR" "samtools")
                               (scons-variable "BOOST_ROOT" "boost")
                               "install")))))
         ;; check and install carried out during build phase
         (delete 'check)
         (delete 'install))))
    (inputs
     `(("zlib" ,zlib)
       ("perl" ,perl)
       ("samtools" ,samtools)
       ("htslib" ,htslib)
       ("boost" ,boost)))
    (native-inputs
     `(("scons" ,scons)))
    (home-page "https://bitbucket.org/berkeleylab/metabat")
    (synopsis
     "Reconstruction of single genomes from complex microbial communities")
    (description
     "Grouping large genomic fragments assembled from shotgun metagenomic
sequences to deconvolute complex microbial communities, or metagenome binning,
enables the study of individual organisms and their interactions. MetaBAT is
an automated metagenome binning software, which integrates empirical
probabilistic distances of genome abundance and tetranucleotide frequency.")
    (license (license:non-copyleft "file://license.txt"
                                   "See license.txt in the distribution."))))
1585
;; MISO: Mixture-of-Isoforms model for RNA-Seq (Python 2 package).  The
;; source snippet edits "setup.py" before the build.
(define-public miso
  (package
    (name "miso")
    (version "0.5.3")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://pypi.python.org/packages/source/m/misopy/misopy-"
             version ".tar.gz"))
       (sha256
        (base32
         "0x446867az8ir0z8c1vjqffkp0ma37wm4sylixnkhgawllzx8v5w"))
       (modules '((guix build utils)))
       (snippet
        '(substitute* "setup.py"
           ;; Use setuptools, or else the executables are not
           ;; installed.
           (("distutils.core") "setuptools")
           ;; use "gcc" instead of "cc" for compilation
           (("^defines")
            "cc.set_executables(
compiler='gcc',
compiler_so='gcc',
linker_exe='gcc',
linker_so='gcc -shared'); defines")))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2               ; only Python 2 is supported
       #:tests? #f))                    ; no "test" target
    (inputs
     `(("samtools" ,samtools)
       ("python-numpy" ,python2-numpy)
       ("python-pysam" ,python2-pysam)
       ("python-scipy" ,python2-scipy)
       ("python-matplotlib" ,python2-matplotlib)))
    (native-inputs
     `(("python-mock" ,python2-mock)    ;for tests
       ("python-pytz" ,python2-pytz)    ;for tests
       ("python-setuptools" ,python2-setuptools)))
    (home-page "http://genes.mit.edu/burgelab/miso/index.html")
    (synopsis "Mixture of Isoforms model for RNA-Seq isoform quantitation")
    (description
     "MISO (Mixture-of-Isoforms) is a probabilistic framework that quantitates
the expression level of alternatively spliced genes from RNA-Seq data, and
identifies differentially regulated isoforms or exons across samples. By
modeling the generative process by which reads are produced from isoforms in
RNA-Seq, the MISO model uses Bayesian inference to compute the probability
that a read originated from a particular isoform.")
    (license license:gpl2)))
1635
;; OrfM: open reading frame (ORF) caller.  Built with the default phases of
;; the GNU build system.
(define-public orfm
  (package
    (name "orfm")
    (version "0.4.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/wwood/OrfM/releases/download/v"
             version "/orfm-" version ".tar.gz"))
       (sha256
        (base32
         "05fmw145snk646ly076zby0fjav0k7ysbclck5d4s9pmgcfpijc2"))))
    (build-system gnu-build-system)
    (inputs
     `(("zlib" ,zlib)))
    (home-page "https://github.com/wwood/OrfM")
    (synopsis "Simple and not slow open reading frame (ORF) caller")
    (description
     "An ORF caller finds stretches of DNA that when translated are not
interrupted by stop codons. OrfM finds and prints these ORFs.")
    (license license:lgpl3+)))
1656
;; pbcore: Python APIs for PacBio data files (Python 2 package).
(define-public python2-pbcore
  (package
    (name "python2-pbcore")
    (version "0.9.3")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://github.com/PacificBiosciences/pbcore/archive/"
             version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32
         "1z46rwjac93jm87cbj2zgjg6qvsgs65140wkbbxsvxps7ai4pm09"))))
    (build-system python-build-system)
    ;; pbcore requires Python 2.7
    (arguments
     `(#:python ,python-2))
    (native-inputs
     `(("python-setuptools" ,python2-setuptools)))
    (inputs
     `(("python-cython" ,python2-cython)
       ("python-numpy" ,python2-numpy)
       ("python-pysam" ,python2-pysam)
       ("python-h5py" ,python2-h5py)))
    (home-page "http://pacificbiosciences.github.io/pbcore/")
    (synopsis "Library for reading and writing PacBio data files")
    (description
     "The pbcore package provides Python APIs for interacting with PacBio data
files and writing bioinformatics applications.")
    (license license:bsd-3)))
1685
;; WarpedLMM: warped linear mixed models (Python 2 package).
(define-public python2-warpedlmm
  (package
    (name "python2-warpedlmm")
    (version "0.21")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "https://pypi.python.org/packages/source/W/WarpedLMM/WarpedLMM-"
             version ".zip"))
       (sha256
        (base32
         "1agfz6zqa8nc6cw47yh0s3y14gkpa9wqazwcj7mwwj3ffnw39p3j"))))
    (build-system python-build-system)
    (arguments
     `(#:python ,python-2               ; requires Python 2.7
       #:phases
       (modify-phases %standard-phases
         (add-after 'install 'remove-bin-directory
           (lambda* (#:key outputs #:allow-other-keys)
             ;; The "bin" directory only contains wrappers for running
             ;; the module tests. They are not needed after the
             ;; "check" phase.
             (let* ((out (assoc-ref outputs "out"))
                    (bin (string-append out "/bin")))
               (delete-file-recursively bin)
               #t))))))
    (propagated-inputs
     `(("python-scipy" ,python2-scipy)
       ("python-numpy" ,python2-numpy)
       ("python-matplotlib" ,python2-matplotlib)
       ("python-fastlmm" ,python2-fastlmm)
       ("python-pandas" ,python2-pandas)
       ("python-pysnptools" ,python2-pysnptools)))
    (native-inputs
     `(("python-setuptools" ,python2-setuptools)
       ("python-mock" ,python2-mock)
       ("python-nose" ,python2-nose)
       ("unzip" ,unzip)))
    (home-page "https://github.com/PMBio/warpedLMM")
    (synopsis "Implementation of warped linear mixed models")
    (description
     "WarpedLMM is a Python implementation of the warped linear mixed model,
which automatically learns an optimal warping function (or transformation) for
the phenotype as it models the data.")
    (license license:asl2.0)))
1732
;; pbtranscript-tofu: scripts for PacBio Iso-Seq transcriptome data.  The
;; package is built from a git commit of the cDNA_primer repository; the
;; commit identifier is appended to the version string.
(define-public pbtranscript-tofu
  (let ((commit "c7bbd5472"))
    (package
      (name "pbtranscript-tofu")
      (version (string-append "0.4.1." commit))
      (source (origin
                (method git-fetch)
                (uri (git-reference
                      (url "https://github.com/PacificBiosciences/cDNA_primer.git")
                      (commit commit)))
                ;; 'git-fetch' produces a checkout directory, not an
                ;; archive, so do not give it a ".tar.gz" name.
                (file-name (string-append name "-" version "-checkout"))
                (sha256
                 (base32
                  "148xkzi689c49g6fdhckp6mnmj2qhjdf1j4wifm6ja7ij95d7fxx"))))
      (build-system python-build-system)
      (arguments
       `(#:python ,python-2
         ;; With standard flags, the install phase attempts to create a zip'd
         ;; egg file, and fails with an error: 'ZIP does not support timestamps
         ;; before 1980'
         #:configure-flags '("--single-version-externally-managed"
                             "--record=pbtranscript-tofu.txt")
         #:phases
         (modify-phases %standard-phases
           (add-after 'unpack 'enter-directory-and-clean-up
             (lambda _
               (chdir "pbtranscript-tofu/pbtranscript/")
               ;; Delete clutter
               (delete-file-recursively "dist/")
               (delete-file-recursively "build/")
               (delete-file-recursively "setuptools_cython-0.2.1-py2.6.egg/")
               (delete-file-recursively "pbtools.pbtranscript.egg-info")
               (delete-file "Cython-0.20.1.tar.gz")
               (delete-file "setuptools_cython-0.2.1-py2.7.egg")
               (delete-file "setuptools_cython-0.2.1.tar.gz")
               (delete-file "setup.cfg")
               (for-each delete-file
                         (find-files "." "\\.so$"))
               ;; files should be writable for install phase
               (for-each (lambda (f) (chmod f #o755))
                         (find-files "." "\\.py$"))
               ;; 'for-each' has no useful result; report success
               ;; explicitly.
               #t)))))
      (inputs
       `(("python-cython" ,python2-cython)
         ("python-numpy" ,python2-numpy)
         ("python-bx-python" ,python2-bx-python)
         ("python-networkx" ,python2-networkx)
         ("python-scipy" ,python2-scipy)
         ("python-pbcore" ,python2-pbcore)))
      (native-inputs
       `(("python-nose" ,python2-nose)
         ("python-setuptools" ,python2-setuptools)))
      (home-page "https://github.com/PacificBiosciences/cDNA_primer")
      (synopsis "Analyze transcriptome data generated with the Iso-Seq protocol")
      (description
       "pbtranscript-tofu contains scripts to analyze transcriptome data
generated using the PacBio Iso-Seq protocol.")
      (license license:bsd-3))))
1791
;; Prodigal gene predictor, built from the GitHub release archive with plain
;; "make"; the install directory is handed to make via INSTALLDIR.
(define-public prodigal
  (package
    (name "prodigal")
    (version "2.6.2")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/hyattpd/Prodigal/archive/v"
                           version ".tar.gz"))
       ;; Store the download under a "NAME-VERSION.tar.gz" file name.
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "0m8sb0fg6lmxrlpzna0am6svbnlmd3dckrhgzxxgb3gxr5fyj284"))))
    (build-system gnu-build-system)
    (arguments
     '(#:phases
       ;; The 'configure' phase is skipped entirely.
       (modify-phases %standard-phases
         (delete 'configure))
       ;; Executables are installed into the "bin" directory of the output.
       #:make-flags
       (let ((out (assoc-ref %outputs "out")))
         (list (string-append "INSTALLDIR=" out "/bin")))
       ;; no check target
       #:tests? #f))
    (home-page "http://prodigal.ornl.gov")
    (synopsis "Protein-coding gene prediction for Archaea and Bacteria")
    (description
     "Prodigal runs smoothly on finished genomes, draft genomes, and
metagenomes, providing gene predictions in GFF3, Genbank, or Sequin table
format. It runs quickly, in an unsupervised fashion, handles gaps, handles
partial genes, and identifies translation initiation sites.")
    (license license:gpl3+)))
1822
;; RSEM: patched so that the bundled boost and samtools copies are not used;
;; programs are installed by hand and selected ones are wrapped so that they
;; find the package's own Perl module.
(define-public rsem
  (package
    (name "rsem")
    (version "1.2.20")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "http://deweylab.biostat.wisc.edu/rsem/src/rsem-"
                           version ".tar.gz"))
       (sha256
        (base32 "0nzdc0j0hjllhsd5f2xli95dafm3nawskigs140xzvjk67xh0r9q"))
       (patches (list (search-patch "rsem-makefile.patch")))
       (modules '((guix build utils)))
       (snippet
        '(begin
           ;; remove bundled copy of boost
           (delete-file-recursively "boost")
           #t))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         ;; No "configure" script.
         ;; Do not build bundled samtools library.
         (replace 'configure
           (lambda _
             (substitute* "Makefile"
               (("^all : sam/libbam.a") "all : "))
             #t))
         ;; Copy every file whose name matches "rsem-.*" to "bin/" and the
         ;; Perl utility module to "lib/perl5/site_perl".
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((root     (assoc-ref outputs "out"))
                    (bindir   (string-append root "/bin/"))
                    (perl-lib (string-append root "/lib/perl5/site_perl"))
                    (install-program
                     (lambda (program)
                       (copy-file program
                                  (string-append bindir
                                                 (basename program))))))
               (mkdir-p bindir)
               (mkdir-p perl-lib)
               (for-each install-program (find-files "." "rsem-.*"))
               (copy-file "rsem_perl_utils.pm"
                          (string-append perl-lib "/rsem_perl_utils.pm")))
             #t))
         ;; Prefix PERL5LIB with the installed module directory for the
         ;; listed programs.
         (add-after 'install 'wrap-program
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((root     (assoc-ref outputs "out"))
                    (perl-lib (string-append root "/lib/perl5/site_perl")))
               (for-each (lambda (script)
                           (wrap-program (string-append root "/bin/" script)
                             (list "PERL5LIB" ":" 'prefix (list perl-lib))))
                         '("rsem-plot-transcript-wiggles"
                           "rsem-calculate-expression"
                           "rsem-generate-ngvector"
                           "rsem-run-ebseq"
                           "rsem-prepare-reference")))
             #t)))))
    (inputs
     `(("boost" ,boost)
       ("ncurses" ,ncurses)
       ("r" ,r)
       ("perl" ,perl)
       ("samtools" ,samtools-0.1)
       ("zlib" ,zlib)))
    (home-page "http://deweylab.biostat.wisc.edu/rsem/")
    (synopsis "Estimate gene expression levels from RNA-Seq data")
    (description
     "RSEM is a software package for estimating gene and isoform expression
levels from RNA-Seq data. The RSEM package provides a user-friendly
interface, supports threads for parallel computation of the EM algorithm,
single-end and paired-end read data, quality scores, variable-length reads and
RSPD estimation. In addition, it provides posterior mean and 95% credibility
interval estimates for expression levels. For visualization, it can generate
BAM and Wiggle files in both transcript-coordinate and genomic-coordinate.")
    (license license:gpl3+)))
1900
;; RSeQC, built with Python 2.  The source snippet removes the bundled pysam
;; and edits "setup.py" so that the packaged pysam input is used instead.
(define-public rseqc
  (package
    (name "rseqc")
    (version "2.6.1")
    (source
     (origin
       (method url-fetch)
       (uri
        (string-append "mirror://sourceforge/rseqc/"
                       version "/RSeQC-" version ".tar.gz"))
       (sha256
        (base32 "15ly0254yi032qzkdplg00q144qfdsd986gh62829rl5bkxhj330"))
       (modules '((guix build utils)))
       (snippet
        '(begin
           ;; remove bundled copy of pysam
           (delete-file-recursively "lib/pysam")
           (substitute* "setup.py"
             ;; remove dependency on outdated "distribute" module
             (("^from distribute_setup import use_setuptools") "")
             (("^use_setuptools\\(\\)") "")
             ;; do not use bundled copy of pysam
             (("^have_pysam = False") "have_pysam = True"))
           ;; 'substitute*' returns an unspecified value; end the snippet
           ;; with an explicit success value.
           #t))))
    (build-system python-build-system)
    (arguments `(#:python ,python-2))
    (inputs
     `(("python-cython" ,python2-cython)
       ("python-pysam" ,python2-pysam)
       ("python-numpy" ,python2-numpy)
       ("python-setuptools" ,python2-setuptools)
       ("zlib" ,zlib)))
    (native-inputs
     `(("python-nose" ,python2-nose)))
    (home-page "http://rseqc.sourceforge.net/")
    (synopsis "RNA-seq quality control package")
    (description
     "RSeQC provides a number of modules that can comprehensively evaluate
high throughput sequence data, especially RNA-seq data. Some basic modules
inspect sequence quality, nucleotide composition bias, PCR bias and GC bias,
while RNA-seq specific modules evaluate sequencing saturation, mapped reads
distribution, coverage uniformity, strand specificity, etc.")
    (license license:gpl3+)))
1943
;; Samtools 1.x.  Besides what "make install" provides, the static library
;; "libbam.a" and the top-level header files are installed as well.
(define-public samtools
  (package
    (name "samtools")
    (version "1.2")
    (source
     (origin
       (method url-fetch)
       (uri
        (string-append "mirror://sourceforge/samtools/"
                       version "/samtools-" version ".tar.bz2"))
       (sha256
        (base32
         "1akdqb685pk9xk1nb6sa9aq8xssjjhvvc06kp4cpdqvz2157l3j2"))))
    (build-system gnu-build-system)
    (arguments
     `(;; There are 87 test failures when building on non-64-bit architectures
       ;; due to invalid test data.  This has since been fixed upstream (see
       ;; <https://github.com/samtools/samtools/pull/307>), but as there has
       ;; not been a new release the tests are only run when building for
       ;; "x86_64-linux".  The target system is consulted first so that the
       ;; decision is based on the system the package is built for, not the
       ;; system doing the build.
       #:tests? ,(string=? (or (%current-target-system) (%current-system))
                           "x86_64-linux")
       #:modules ((ice-9 ftw)
                  (ice-9 regex)
                  (guix build gnu-build-system)
                  (guix build utils))
       #:make-flags (list "LIBCURSES=-lncurses"
                          (string-append "prefix=" (assoc-ref %outputs "out")))
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'patch-tests
           (lambda* (#:key inputs #:allow-other-keys)
             (let ((bash (assoc-ref inputs "bash")))
               (substitute* "test/test.pl"
                 ;; The test script calls out to /bin/bash
                 (("/bin/bash")
                  (string-append bash "/bin/bash"))
                 ;; There are two failing tests upstream relating to the "stats"
                 ;; subcommand in test_usage_subcommand ("did not have Usage"
                 ;; and "usage did not mention samtools stats"), so we disable
                 ;; them.
                 (("(test_usage_subcommand\\(.*\\);)" cmd)
                  (string-append "unless ($subcommand eq 'stats') {" cmd "};")))
               #t)))
         ;; Install the static library next to the regular installation.
         (add-after 'install 'install-library
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((lib (string-append (assoc-ref outputs "out") "/lib")))
               (mkdir-p lib)
               (copy-file "libbam.a" (string-append lib "/libbam.a"))
               #t)))
         ;; Install every "*.h" file of the top-level build directory into
         ;; "include/samtools/".
         (add-after 'install 'install-headers
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((include (string-append (assoc-ref outputs "out")
                                           "/include/samtools/")))
               (mkdir-p include)
               (for-each (lambda (file)
                           (copy-file file (string-append include
                                                          (basename file))))
                         (scandir "." (lambda (name)
                                        (string-match "\\.h$" name))))
               #t)))
         (delete 'configure))))
    (native-inputs `(("pkg-config" ,pkg-config)))
    (inputs `(("ncurses" ,ncurses)
              ("perl" ,perl)
              ("python" ,python)
              ("zlib" ,zlib)))
    (home-page "http://samtools.sourceforge.net")
    (synopsis "Utilities to efficiently manipulate nucleotide sequence alignments")
    (description
     "Samtools implements various utilities for post-processing nucleotide
sequence alignments in the SAM, BAM, and CRAM formats, including indexing,
variant calling (in conjunction with bcftools), and a simple alignment
viewer.")
    (license license:expat)))
2019
(define-public samtools-0.1
  ;; This is the most recent version of the 0.1 line of samtools.  The input
  ;; and output formats differ greatly from that used and produced by samtools
  ;; 1.x and is still used in many bioinformatics pipelines.
  ;;
  ;; Everything not overridden below is inherited from the 'samtools' package.
  (package (inherit samtools)
    (version "0.1.19")
    (source
     (origin
       (method url-fetch)
       (uri
        (string-append "mirror://sourceforge/samtools/"
                       version "/samtools-" version ".tar.bz2"))
       (sha256
        (base32 "1m33xsfwz0s8qi45lylagfllqg7fphf4dr0780rsvw75av9wk06h"))))
    (arguments
     (substitute-keyword-arguments (package-arguments samtools)
       ((#:tests? tests) #f)            ;no "check" target
       ((#:phases phases)
        `(modify-phases ,phases
           ;; Copy the "samtools" executable into "bin/" by hand.
           (replace 'install
             (lambda* (#:key outputs #:allow-other-keys)
               (let ((bin (string-append
                           (assoc-ref outputs "out") "/bin")))
                 (mkdir-p bin)
                 (copy-file "samtools"
                            (string-append bin "/samtools"))
                 ;; 'copy-file' returns an unspecified value; report
                 ;; success explicitly.
                 #t)))
           ;; Tests are disabled above, so the inherited test patching
           ;; phase is dropped.
           (delete 'patch-tests)))))))
2047
;; NGS SDK: built from the "ngs-sdk" subdirectory of the ncbi/ngs release
;; archive, using the project's own "configure" script.
(define-public ngs-sdk
  (package
    (name "ngs-sdk")
    (version "1.1.1")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/ncbi/ngs/archive/"
                           version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "1x58gpm574n0xmk2a98gmikbgycq78ia0bvnb42k5ck34fmd5v8y"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ; no "check" target
       #:parallel-build? #f             ; not supported
       #:phases
       (modify-phases %standard-phases
         ;; Work inside the "ngs-sdk" subdirectory of the unpacked source.
         (add-after 'unpack 'enter-dir
           (lambda _
             (chdir "ngs-sdk")
             #t))
         (replace 'configure
           (lambda* (#:key outputs #:allow-other-keys)
             ;; The 'configure' script doesn't recognize things like
             ;; '--enable-fast-install', so it is invoked by hand with just
             ;; a build prefix and an installation prefix.
             (let* ((build-dir (string-append (getcwd) "/build"))
                    (prefix    (assoc-ref outputs "out"))
                    (status    (system* "./configure"
                                        (string-append "--build-prefix="
                                                       build-dir)
                                        (string-append "--prefix=" prefix))))
               ;; The phase succeeds when "configure" exits with status 0.
               (zero? status)))))))
    (native-inputs `(("perl" ,perl)))
    (home-page "https://github.com/ncbi/ngs")
    (synopsis "API for accessing Next Generation Sequencing data")
    (description
     "NGS is a domain-specific API for accessing reads, alignments and pileups
produced from Next Generation Sequencing. The API itself is independent from
any particular back-end implementation, and supports use of multiple back-ends
simultaneously.")
    (license license:public-domain)))
2089
;; Java bindings for the NGS SDK.  Inherits from 'ngs-sdk', builds in the
;; "ngs-java" subdirectory instead, and patches "Makefile.java" first.
(define-public ngs-java
  (package (inherit ngs-sdk)
    (name "ngs-java")
    (arguments
     (substitute-keyword-arguments
         ;; The arguments of 'ngs-sdk', preceded by an explicit module list.
         `(#:modules ((guix build gnu-build-system)
                      (guix build utils)
                      (srfi srfi-1)
                      (srfi srfi-26))
           ,@(package-arguments ngs-sdk))
       ((#:phases phases)
        `(modify-phases ,phases
           ;; Enter "ngs-java" rather than the directory used by 'ngs-sdk'.
           (replace 'enter-dir
             (lambda _
               (chdir "ngs-java")
               #t))
           (add-after 'enter-dir 'fix-java-symlink-installation
             (lambda _
               ;; Only replace the version suffix, not the version number in
               ;; the directory name.  Reported here:
               ;; https://github.com/ncbi/ngs/pull/4
               ;;
               ;; Each "$(subst VERSION-EXPR,REPLACEMENT,TARGET)" call is
               ;; rewritten into "$(patsubst %VERSION-EXPR,%REPLACEMENT,TARGET)".
               (substitute* "Makefile.java"
                 (((string-append "\\$\\(subst "
                                  "(\\$\\(VERSION[^\\)]*\\)),"
                                  "(\\$\\([^\\)]+\\)),"
                                  "(\\$\\([^\\)]+\\)|\\$\\@)"
                                  "\\)")
                   _ pattern replacement target)
                  (string-append "$(patsubst "
                                 "%" pattern ","
                                 "%" replacement ","
                                 target ")")))))))))
    (inputs
     `(("jdk" ,icedtea6 "jdk")
       ("ngs-sdk" ,ngs-sdk)))
    (synopsis "Java bindings for NGS SDK")))
2125
;; NCBI-VDB.  The 'configure' phase patches the project's build scripts and
;; then runs its "configure" by hand; an extra phase installs the interface
;; libraries ("ilib") and the interface headers.
(define-public ncbi-vdb
  (package
    (name "ncbi-vdb")
    (version "2.4.5-5")
    (source
     (origin
       (method url-fetch)
       (uri
        (string-append "https://github.com/ncbi/ncbi-vdb/archive/"
                       version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32
         "1cj8nk6if8sqagv20vx36v566fdvhcaadf0x1ycnbgql6chbs6vy"))))
    (build-system gnu-build-system)
    (arguments
     `(#:parallel-build? #f             ; not supported
       #:tests? #f                      ; no "check" target
       #:phases
       (modify-phases %standard-phases
         (replace 'configure
           (lambda* (#:key inputs outputs #:allow-other-keys)
             (let ((out (assoc-ref outputs "out")))
               ;; Only replace the version suffix, not the version number in the
               ;; directory name; fixed in commit 4dbba5c6a809 (no release yet).
               (substitute* "setup/konfigure.perl"
                 (((string-append "\\$\\(subst "
                                  "(\\$\\(VERSION[^\\)]*\\)),"
                                  "(\\$\\([^\\)]+\\)),"
                                  "(\\$\\([^\\)]+\\)|\\$\\@)"
                                  "\\)")
                   _ pattern replacement target)
                  (string-append "$(patsubst "
                                 "%" pattern ","
                                 "%" replacement ","
                                 target ")")))

               ;; Override include path for libmagic
               (substitute* "setup/package.prl"
                 (("name => 'magic', Include => '/usr/include'")
                  (string-append "name=> 'magic', Include => '"
                                 (assoc-ref inputs "libmagic")
                                 "/include" "'")))

               ;; Install kdf5 library (needed by sra-tools)
               (substitute* "build/Makefile.install"
                 (("LIBRARIES_TO_INSTALL =")
                  "LIBRARIES_TO_INSTALL = kdf5.$(VERSION_LIBX) kdf5.$(VERSION_SHLX)"))

               ;; The 'configure' script doesn't recognize things like
               ;; '--enable-fast-install'.  The phase succeeds when the
               ;; script exits with status 0.
               (zero? (system*
                       "./configure"
                       (string-append "--build-prefix=" (getcwd) "/build")
                       (string-append "--prefix=" out)
                       "--debug"
                       (string-append "--with-xml2-prefix="
                                      (assoc-ref inputs "libxml2"))
                       (string-append "--with-ngs-sdk-prefix="
                                      (assoc-ref inputs "ngs-sdk"))
                       (string-append "--with-ngs-java-prefix="
                                      (assoc-ref inputs "ngs-java"))
                       (string-append "--with-hdf5-prefix="
                                      (assoc-ref inputs "hdf5")))))))
         (add-after 'install 'install-interfaces
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((out (assoc-ref outputs "out")))
               ;; Install interface libraries.  On i686 the interface libraries
               ;; are installed to "linux/gcc/i386", so we need to use the Linux
               ;; architecture name ("i386") instead of the target system prefix
               ;; ("i686").
               (mkdir (string-append out "/ilib"))
               (copy-recursively (string-append "build/ncbi-vdb/linux/gcc/"
                                                ,(system->linux-architecture
                                                  (or (%current-target-system)
                                                      (%current-system)))
                                                "/rel/ilib")
                                 (string-append out "/ilib"))
               ;; Install interface headers
               (copy-recursively "interfaces"
                                 (string-append out "/include"))
               ;; 'copy-recursively' returns an unspecified value; report
               ;; success explicitly.
               #t))))))
    (inputs
     `(("libxml2" ,libxml2)
       ("ngs-sdk" ,ngs-sdk)
       ("ngs-java" ,ngs-java)
       ("libmagic" ,file)
       ("hdf5" ,hdf5)))
    (native-inputs `(("perl" ,perl)))
    (home-page "https://github.com/ncbi/ncbi-vdb")
    (synopsis "Database engine for genetic information")
    (description
     "The NCBI-VDB library implements a highly compressed columnar data
warehousing engine that is most often used to store genetic information.
Databases are stored in a portable image within the file system, and can be
accessed/downloaded on demand across HTTP.")
    (license license:public-domain)))
2225
;; PLINK 1.07, built from the "-src.zip" archive with plain "make" against
;; the packaged LAPACK; the resulting "plink" binary is installed by hand.
(define-public plink
  (package
    (name "plink")
    (version "1.07")
    (source
     (origin
       (method url-fetch)
       (uri (string-append
             "http://pngu.mgh.harvard.edu/~purcell/plink/dist/plink-"
             version "-src.zip"))
       (sha256
        (base32 "0as8gxm4pjyc8dxmm1sl873rrd7wn5qs0l29nqfnl31x8i467xaa"))
       (patches (list (search-patch "plink-1.07-unclobber-i.patch")))))
    (build-system gnu-build-system)
    (arguments
     '(#:phases
       (modify-phases %standard-phases
         ;; no "configure" script
         (delete 'configure)
         ;; Copy the "plink" executable into "bin/" of the output.
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out    (assoc-ref outputs "out"))
                    (bindir (string-append out "/bin/")))
               (mkdir-p bindir)
               (copy-file "plink" (string-append bindir "plink"))
               #t))))
       #:make-flags
       (let ((lapack (assoc-ref %build-inputs "lapack")))
         (list (string-append "LIB_LAPACK=" lapack "/lib/liblapack.so")
               "WITH_LAPACK=1"
               "FORCE_DYNAMIC=1"
               ;; disable phoning home
               "WITH_WEBCHECK="))
       #:tests? #f))                    ;no "check" target
    (inputs
     `(("zlib" ,zlib)
       ("lapack" ,lapack)))
    (native-inputs
     `(("unzip" ,unzip)))
    (home-page "http://pngu.mgh.harvard.edu/~purcell/plink/")
    (synopsis "Whole genome association analysis toolset")
    (description
     "PLINK is a whole genome association analysis toolset, designed to
perform a range of basic, large-scale analyses in a computationally efficient
manner. The focus of PLINK is purely on analysis of genotype/phenotype data,
so there is no support for steps prior to this (e.g. study design and
planning, generating genotype or CNV calls from raw data). Through
integration with gPLINK and Haploview, there is some support for the
subsequent visualization, annotation and storage of results.")
    ;; Code is released under GPLv2, except for fisher.h, which is under
    ;; LGPLv2.1+
    (license (list license:gpl2 license:lgpl2.1+))))
2278
;; preseq, built from the "preseq-master" directory of the release tarball
;; and linked against the static "libbam.a" of the packaged samtools 0.1.
(define-public preseq
  (package
    (name "preseq")
    (version "1.0.2")
    (source (origin
              (method url-fetch)
              (uri
               (string-append "http://smithlabresearch.org/downloads/preseq-"
                              version ".tar.bz2"))
              (sha256
               (base32 "0r7sw07p6nv8ygvc17gd78lisbw5336v3vhs86b5wv8mw3pwqksc"))
              (patches (list (search-patch "preseq-1.0.2-install-to-PREFIX.patch")
                             (search-patch "preseq-1.0.2-link-with-libbam.patch")))
              (modules '((guix build utils)))
              (snippet
               '(begin
                  ;; Remove bundled samtools.
                  (delete-file-recursively "preseq-master/samtools")
                  ;; End the snippet with an explicit success value.
                  #t))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'enter-dir
           (lambda _
             (chdir "preseq-master")
             #t))
         ;; Point the source files at the "samtools/" header directory.
         (add-after 'enter-dir 'use-samtools-headers
           (lambda _
             (substitute* '("smithlab_cpp/SAM.cpp"
                            "smithlab_cpp/SAM.hpp")
               ;; The dot is escaped so that only a literal "sam.h" is
               ;; rewritten, not "sam" followed by an arbitrary character
               ;; and "h".
               (("sam\\.h") "samtools/sam.h"))
             #t))
         (delete 'configure))
       #:make-flags (list (string-append "PREFIX="
                                         (assoc-ref %outputs "out"))
                          (string-append "LIBBAM="
                                         (assoc-ref %build-inputs "samtools")
                                         "/lib/libbam.a"))))
    (inputs
     `(("gsl" ,gsl)
       ("samtools" ,samtools-0.1)
       ("zlib" ,zlib)))
    (home-page "http://smithlabresearch.org/software/preseq/")
    (synopsis "Program for analyzing library complexity")
    (description
     "The preseq package is aimed at predicting and estimating the complexity
of a genomic sequencing library, equivalent to predicting and estimating the
number of redundant reads from a given sequencing depth and how many will be
expected from additional sequencing using an initial sequencing experiment.
The estimates can then be used to examine the utility of further sequencing,
optimize the sequencing depth, or to screen multiple libraries to avoid low
complexity samples.")
    (license license:gpl3+)))
2334
;; SRA Toolkit, configured by hand against the installed 'ncbi-vdb' and
;; 'ngs-sdk' packages.
(define-public sra-tools
  (package
    (name "sra-tools")
    (version "2.4.5-5")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://github.com/ncbi/sra-tools/archive/"
                           version ".tar.gz"))
       (file-name (string-append name "-" version ".tar.gz"))
       (sha256
        (base32 "11nrnvz7a012f4iryf0wiwrid0h111grsfxbxa9j51h3f2xbvgns"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ; no "check" target
       #:parallel-build? #f             ; not supported
       #:phases
       (modify-phases %standard-phases
         (replace 'configure
           (lambda* (#:key inputs outputs #:allow-other-keys)
             ;; The build system expects a directory containing the sources and
             ;; raw build output of ncbi-vdb, including files that are not
             ;; installed.  Since we are building against an installed version of
             ;; ncbi-vdb, the following modifications are needed.
             (substitute* "setup/konfigure.perl"
               ;; Make the configure script look for the "ilib" directory of
               ;; "ncbi-vdb" without first checking for the existence of a
               ;; matching library in its "lib" directory.
               (("^ my \\$f = File::Spec->catdir\\(\\$libdir, \\$lib\\);")
                "my $f = File::Spec->catdir($ilibdir, $ilib);")
               ;; Look for interface libraries in ncbi-vdb's "ilib" directory.
               (("my \\$ilibdir = File::Spec->catdir\\(\\$builddir, 'ilib'\\);")
                "my $ilibdir = File::Spec->catdir($dir, 'ilib');"))

             ;; The 'configure' script doesn't recognize things like
             ;; '--enable-fast-install'.
             (let ((with-input
                    ;; Build "OPTION<directory of input LABEL>".
                    (lambda (option label)
                      (string-append option (assoc-ref inputs label)))))
               (zero? (system*
                       "./configure"
                       (string-append "--build-prefix=" (getcwd) "/build")
                       (string-append "--prefix=" (assoc-ref outputs "out"))
                       "--debug"
                       (with-input "--with-fuse-prefix=" "fuse")
                       (with-input "--with-magic-prefix=" "libmagic")
                       ;; TODO: building with libxml2 fails with linker errors
                       ;; (with-input "--with-xml2-prefix=" "libxml2")
                       (with-input "--with-ncbi-vdb-sources=" "ncbi-vdb")
                       (with-input "--with-ncbi-vdb-build=" "ncbi-vdb")
                       (with-input "--with-ngs-sdk-prefix=" "ngs-sdk")
                       (with-input "--with-hdf5-prefix=" "hdf5")))))))))
    (native-inputs `(("perl" ,perl)))
    (inputs
     `(("ngs-sdk" ,ngs-sdk)
       ("ncbi-vdb" ,ncbi-vdb)
       ("libmagic" ,file)
       ("fuse" ,fuse)
       ("hdf5" ,hdf5)
       ("zlib" ,zlib)))
    (home-page "http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software")
    (synopsis "Tools and libraries for reading and writing sequencing data")
    (description
     "The SRA Toolkit from NCBI is a collection of tools and libraries for
reading of sequencing files from the Sequence Read Archive (SRA) database and
writing files into the .sra format.")
    (license license:public-domain)))
2409
;; SeqAn library.  Nothing is compiled: the release tarball is unpacked and
;; its "include" and "share" directories are copied to the "out" and "doc"
;; outputs respectively.
(define-public seqan
  (package
    (name "seqan")
    (version "1.4.2")
    (source (origin
              (method url-fetch)
              (uri (string-append "http://packages.seqan.de/seqan-library/"
                                  "seqan-library-" version ".tar.bz2"))
              (sha256
               (base32
                "05s3wrrwn50f81aklfm65i4a749zag1vr8z03k21xm0pdxy47yvp"))))
    ;; The documentation is 7.8MB and the includes are 3.6MB heavy, so it
    ;; makes sense to split the outputs.
    (outputs '("out" "doc"))
    (build-system trivial-build-system)
    (arguments
     `(#:modules ((guix build utils))
       #:builder
       (begin
         (use-modules (guix build utils))
         (let ((tar  (assoc-ref %build-inputs "tar"))
               (bzip (assoc-ref %build-inputs "bzip2"))
               (out  (assoc-ref %outputs "out"))
               (doc  (assoc-ref %outputs "doc")))
           ;; Make "tar" and "bzip2" available to the 'system*' call below.
           (setenv "PATH" (string-append tar "/bin:" bzip "/bin"))
           ;; Abort the build when extraction fails instead of carrying on
           ;; with a missing or incomplete source tree.
           (unless (zero? (system* "tar" "xvf"
                                   (assoc-ref %build-inputs "source")))
             (error "failed to unpack the source tarball"))
           (chdir (string-append "seqan-library-" ,version))
           (copy-recursively "include" (string-append out "/include"))
           (copy-recursively "share" (string-append doc "/share"))
           ;; Finish with an explicit success value.
           #t))))
    (native-inputs
     `(("source" ,source)
       ("tar" ,tar)
       ("bzip2" ,bzip2)))
    (home-page "http://www.seqan.de")
    (synopsis "Library for nucleotide sequence analysis")
    (description
     "SeqAn is a C++ library of efficient algorithms and data structures for
the analysis of sequences with the focus on biological data. It contains
algorithms and data structures for string representation and their
manipulation, online and indexed string search, efficient I/O of
bioinformatics file formats, sequence alignment, and more.")
    (license license:bsd-3)))
2452
;; STAR aligner, built with "make STAR" inside the "source" directory of the
;; release archive; the resulting "STAR" binary is installed by hand.
(define-public star
  (package
    (name "star")
    (version "2.4.2a")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "https://github.com/alexdobin/STAR/archive/STAR_"
                    version ".tar.gz"))
              (sha256
               (base32
                "1c3rnm7r5l0kl3d04gl1g7938xqf1c2l0mla87rlplqg1hcns5mc"))
              (modules '((guix build utils)))
              (snippet
               '(begin
                  ;; Call "rm" through PATH instead of the absolute
                  ;; "/bin/rm".
                  (substitute* "source/Makefile"
                    (("/bin/rm") "rm"))
                  ;; End the snippet with an explicit success value.
                  #t))))
    (build-system gnu-build-system)
    (arguments
     '(#:tests? #f                      ;no check target
       #:make-flags '("STAR")
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'enter-source-dir
           (lambda _
             (chdir "source")
             #t))
         ;; Copy the "STAR" executable into "bin/" of the output.
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((bin (string-append (assoc-ref outputs "out") "/bin/")))
               (mkdir-p bin)
               (copy-file "STAR" (string-append bin "STAR"))
               ;; 'copy-file' returns an unspecified value; report success
               ;; explicitly.
               #t)))
         (delete 'configure))))
    (native-inputs
     `(("vim" ,vim)))                   ; for xxd
    (inputs
     `(("zlib" ,zlib)))
    (home-page "https://github.com/alexdobin/STAR")
    (synopsis "Universal RNA-seq aligner")
    (description
     "The Spliced Transcripts Alignment to a Reference (STAR) software is
based on a previously undescribed RNA-seq alignment algorithm that uses
sequential maximum mappable seed search in uncompressed suffix arrays followed
by seed clustering and stitching procedure. In addition to unbiased de novo
detection of canonical junctions, STAR can discover non-canonical splices and
chimeric (fusion) transcripts, and is also capable of mapping full-length RNA
sequences.")
    ;; STAR is licensed under GPLv3 or later; htslib is MIT-licensed.
    (license license:gpl3+)))
2500
;; Subread, built in the "src" directory with "Makefile.Linux"; the contents
;; of the sibling "bin" directory are then copied to the output.
(define-public subread
  (package
    (name "subread")
    (version "1.4.6-p2")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "mirror://sourceforge/subread/subread-"
                    version "-source.tar.gz"))
              (sha256
               (base32
                "06sv9mpcsdj6p68y15d6gi70lca3lxmzk0dn61hg0kfsa7rxmsr3"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ;no "check" target
       #:make-flags '("-f" "Makefile.Linux")
       #:phases
       (modify-phases %standard-phases
         (add-after 'unpack 'enter-dir
           (lambda _
             (chdir "src")
             #t))
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((bin (string-append (assoc-ref outputs "out") "/bin/")))
               (mkdir-p bin)
               (copy-recursively "../bin" bin)
               ;; 'copy-recursively' returns an unspecified value; report
               ;; success explicitly.
               #t)))
         ;; no "configure" script
         (delete 'configure))))
    (inputs `(("zlib" ,zlib)))
    (home-page "http://bioinf.wehi.edu.au/subread-package/")
    (synopsis "Tool kit for processing next-gen sequencing data")
    (description
     "The subread package contains the following tools: subread aligner, a
general-purpose read aligner; subjunc aligner: detecting exon-exon junctions
and mapping RNA-seq reads; featureCounts: counting mapped reads for genomic
features; exactSNP: a SNP caller that discovers SNPs by testing signals
against local background noises.")
    (license license:gpl3+)))
2539
;; VCFtools, built with plain "make"; installation directories are passed
;; through make variables and the man page install rule is patched.
(define-public vcftools
  (package
    (name "vcftools")
    (version "0.1.12b")
    (source (origin
              (method url-fetch)
              (uri (string-append
                    "mirror://sourceforge/vcftools/vcftools_"
                    version ".tar.gz"))
              (sha256
               (base32
                "148al9h7f8g8my2qdnpax51kdd2yjrivlx6frvakf4lz5r8j88wx"))))
    (build-system gnu-build-system)
    (arguments
     `(#:tests? #f                      ; no "check" target
       #:make-flags (list
                     "CFLAGS=-O2"       ; override "-m64" flag
                     (string-append "PREFIX=" (assoc-ref %outputs "out"))
                     (string-append "MANDIR=" (assoc-ref %outputs "out")
                                    "/share/man/man1"))
       #:phases
       (modify-phases %standard-phases
         ;; Make the Makefile copy the man page from the build tree
         ;; ("./cpp/") rather than from "${PREFIX}/cpp/".
         (add-after 'unpack 'patch-manpage-install
           (lambda _
             (substitute* "Makefile"
               (("cp \\$\\{PREFIX\\}/cpp/vcftools.1") "cp ./cpp/vcftools.1"))
             ;; 'substitute*' returns an unspecified value; report success
             ;; explicitly.
             #t))
         (delete 'configure))))
    (inputs
     `(("perl" ,perl)
       ("zlib" ,zlib)))
    (home-page "http://vcftools.sourceforge.net/")
    (synopsis "Tools for working with VCF files")
    (description
     "VCFtools is a program package designed for working with VCF files, such
as those generated by the 1000 Genomes Project. The aim of VCFtools is to
provide easily accessible methods for working with complex genetic variation
data in the form of VCF files.")
    ;; The license is declared as LGPLv3 in the README and
    ;; at http://vcftools.sourceforge.net/license.html
    (license license:lgpl3)))