1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2014, 2015 Ricardo Wurmus <rekado@elephly.net>
3 ;;; Copyright © 2015 Ben Woodcroft <donttrustben@gmail.com>
5 ;;; This file is part of GNU Guix.
7 ;;; GNU Guix is free software; you can redistribute it and/or modify it
8 ;;; under the terms of the GNU General Public License as published by
9 ;;; the Free Software Foundation; either version 3 of the License, or (at
10 ;;; your option) any later version.
12 ;;; GNU Guix is distributed in the hope that it will be useful, but
13 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;;; GNU General Public License for more details.
17 ;;; You should have received a copy of the GNU General Public License
18 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
20 (define-module (gnu packages bioinformatics)
21 #:use-module ((guix licenses) #:prefix license:)
22 #:use-module (guix packages)
23 #:use-module (guix utils)
24 #:use-module (guix download)
25 #:use-module (guix git-download)
26 #:use-module (guix build-system gnu)
27 #:use-module (guix build-system cmake)
28 #:use-module (guix build-system perl)
29 #:use-module (guix build-system python)
30 #:use-module (guix build-system ruby)
31 #:use-module (guix build-system trivial)
32 #:use-module (gnu packages)
33 #:use-module (gnu packages algebra)
34 #:use-module (gnu packages base)
35 #:use-module (gnu packages boost)
36 #:use-module (gnu packages compression)
37 #:use-module (gnu packages cpio)
38 #:use-module (gnu packages file)
39 #:use-module (gnu packages java)
40 #:use-module (gnu packages linux)
41 #:use-module (gnu packages machine-learning)
42 #:use-module (gnu packages maths)
43 #:use-module (gnu packages ncurses)
44 #:use-module (gnu packages perl)
45 #:use-module (gnu packages pkg-config)
46 #:use-module (gnu packages popt)
47 #:use-module (gnu packages protobuf)
48 #:use-module (gnu packages python)
49 #:use-module (gnu packages ruby)
50 #:use-module (gnu packages statistics)
51 #:use-module (gnu packages tbb)
52 #:use-module (gnu packages textutils)
53 #:use-module (gnu packages vim)
54 #:use-module (gnu packages web)
55 #:use-module (gnu packages xml)
56 #:use-module (gnu packages zip)
57 #:use-module (srfi srfi-1))
59 (define-public aragorn
66 "http://mbio-serv2.mbioekol.lu.se/ARAGORN/Downloads/aragorn"
70 "1dg7jlz1qpqy88igjxd6ncs11ccsirb36qv1z01a0np4i4jh61mb"))))
71 (build-system gnu-build-system)
73 `(#:tests? #f ; there are no tests
75 (modify-phases %standard-phases
85 (string-append "aragorn" ,version ".c")))))
87 (lambda* (#:key outputs #:allow-other-keys)
88 (let* ((out (assoc-ref outputs "out"))
89 (bin (string-append out "/bin"))
90 (man (string-append out "/share/man/man1")))
93 (string-append bin "/aragorn"))
95 (copy-file "aragorn.1"
96 (string-append man "/aragorn.1")))
98 (home-page "http://mbio-serv2.mbioekol.lu.se/ARAGORN")
99 (synopsis "Detect tRNA, mtRNA and tmRNA genes in nucleotide sequences")
101 "Aragorn identifies transfer RNA, mitochondrial RNA and
102 transfer-messenger RNA from nucleotide sequences, based on homology to known
103 tRNA consensus sequences and RNA structure. It also outputs the secondary
104 structure of the predicted RNA.")
105 (license license:gpl2)))
107 (define-public bamtools
114 "https://github.com/pezmaster31/bamtools/archive/v"
116 (file-name (string-append name "-" version ".tar.gz"))
119 "1brry29bw2xr2l9pqn240rkqwayg85b8qq78zk2zs6nlspk4d018"))))
120 (build-system cmake-build-system)
122 `(#:tests? #f ;no "check" target
124 (modify-phases %standard-phases
126 'configure 'set-ldflags
127 (lambda* (#:key outputs #:allow-other-keys)
131 (assoc-ref outputs "out") "/lib/bamtools")))))))
132 (inputs `(("zlib" ,zlib)))
133 (home-page "https://github.com/pezmaster31/bamtools")
134 (synopsis "C++ API and command-line toolkit for working with BAM data")
136 "BamTools provides both a C++ API and a command-line toolkit for handling
138 (license license:expat)))
140 (define-public bedops
146 (uri (string-append "https://github.com/bedops/bedops/archive/v"
148 (file-name (string-append name "-" version ".tar.gz"))
151 "1kqbac547wyqma81cyky9n7mkgikjpsfd3nnmcm6hpqwanqgh10v"))))
152 (build-system gnu-build-system)
155 #:make-flags (list (string-append "BINDIR=" %output "/bin"))
158 'unpack 'unpack-tarballs
160 ;; FIXME: Bedops includes tarballs of minimally patched upstream
161 ;; libraries jansson, zlib, and bzip2. We cannot just use stock
162 ;; libraries because at least one of the libraries (zlib) is
163 ;; patched to add a C++ function definition (deflateInit2cpp).
164 ;; Until the Bedops developers offer a way to link against system
165 ;; libraries we have to build the in-tree copies of these three
168 ;; See upstream discussion:
169 ;; https://github.com/bedops/bedops/issues/124
171 ;; Unpack the tarballs to benefit from shebang patching.
172 (with-directory-excursion "third-party"
173 (and (zero? (system* "tar" "xvf" "jansson-2.6.tar.bz2"))
174 (zero? (system* "tar" "xvf" "zlib-1.2.7.tar.bz2"))
175 (zero? (system* "tar" "xvf" "bzip2-1.0.6.tar.bz2"))))
176 ;; Disable unpacking of tarballs in Makefile.
177 (substitute* "system.mk/Makefile.linux"
178 (("^\tbzcat .*") "\t@echo \"not unpacking\"\n")
179 (("\\./configure") "CONFIG_SHELL=bash ./configure"))
180 (substitute* "third-party/zlib-1.2.7/Makefile.in"
181 (("^SHELL=.*$") "SHELL=bash\n")))
182 (alist-delete 'configure %standard-phases))))
183 (home-page "https://github.com/bedops/bedops")
184 (synopsis "Tools for high-performance genomic feature operations")
186 "BEDOPS is a suite of tools to address common questions raised in genomic
187 studies---mostly with regard to overlap and proximity relationships between
188 data sets. It aims to be scalable and flexible, facilitating the efficient
189 and accurate analysis and management of large-scale genomic data.
191 BEDOPS provides tools that perform highly efficient and scalable Boolean and
192 other set operations, statistical calculations, archiving, conversion and
193 other management of genomic data of arbitrary scale. Tasks can be easily
194 split by chromosome for distributing whole-genome analyses across a
195 computational cluster.")
196 (license license:gpl2+)))
198 (define-public bedtools
204 (uri (string-append "https://github.com/arq5x/bedtools2/archive/v"
206 (file-name (string-append name "-" version ".tar.gz"))
209 "0lnxrjvs3nnmb4bmskag1wg3h2hd80przz5q3xd0bvs7vyxrvpbl"))
210 (patches (list (search-patch "bedtools-32bit-compilation.patch")))))
211 (build-system gnu-build-system)
212 (native-inputs `(("python" ,python-2)))
213 (inputs `(("samtools" ,samtools)
216 '(#:test-target "test"
219 'unpack 'patch-makefile-SHELL-definition
221 ;; patch-makefile-SHELL cannot be used here as it does not
222 ;; yet patch definitions with `:='. Since changes to
223 ;; patch-makefile-SHELL result in a full rebuild, features
224 ;; of patch-makefile-SHELL are reimplemented here.
225 (substitute* "Makefile"
226 (("^SHELL := .*$") (string-append "SHELL := " (which "bash") " -e \n"))))
231 (lambda* (#:key outputs #:allow-other-keys)
232 (let ((bin (string-append (assoc-ref outputs "out") "/bin/")))
234 (for-each (lambda (file)
235 (copy-file file (string-append bin (basename file))))
236 (find-files "bin" ".*"))))
237 %standard-phases)))))
238 (home-page "https://github.com/arq5x/bedtools2")
239 (synopsis "Tools for genome analysis and arithmetic")
241 "Collectively, the bedtools utilities are a swiss-army knife of tools for
242 a wide-range of genomics analysis tasks. The most widely-used tools enable
243 genome arithmetic: that is, set theory on the genome. For example, bedtools
244 allows one to intersect, merge, count, complement, and shuffle genomic
245 intervals from multiple files in widely-used genomic file formats such as BAM,
247 (license license:gpl2)))
249 (define-public python2-pybedtools
251 (name "python2-pybedtools")
256 "https://pypi.python.org/packages/source/p/pybedtools/pybedtools-"
260 "1ldzdxw1p4y3g2ignmggsdypvqkcwqwzhdha4rbgpih048z5p4an"))))
261 (build-system python-build-system)
262 (arguments `(#:python ,python-2)) ; no Python 3 support
264 `(("python-cython" ,python2-cython)
265 ("python-matplotlib" ,python2-matplotlib)))
267 `(("bedtools" ,bedtools)
268 ("samtools" ,samtools)))
270 `(("python-pyyaml" ,python2-pyyaml)
271 ("python-nose" ,python2-nose)
272 ("python-setuptools" ,python2-setuptools)))
273 (home-page "https://pythonhosted.org/pybedtools/")
274 (synopsis "Python wrapper for BEDtools programs")
276 "pybedtools is a Python wrapper for Aaron Quinlan's BEDtools programs,
277 which are widely used for genomic interval manipulation or \"genome algebra\".
278 pybedtools extends BEDTools by offering feature-level manipulations from with
280 (license license:gpl2+)))
282 (define-public bioperl-minimal
283 (let* ((inputs `(("perl-module-build" ,perl-module-build)
284 ("perl-data-stag" ,perl-data-stag)
285 ("perl-libwww" ,perl-libwww)
286 ("perl-uri" ,perl-uri)))
288 (map (compose package-name cadr)
291 (map (compose package-transitive-target-inputs cadr) inputs))))))
293 (name "bioperl-minimal")
298 (uri (string-append "mirror://cpan/authors/id/C/CJ/CJFIELDS/BioPerl-"
302 "1l3npcvvvwjlhkna9dndpfv1hklhrgva013kw96m0n1wpd37ask1"))))
303 (build-system perl-build-system)
306 (modify-phases %standard-phases
308 'install 'wrap-programs
309 (lambda* (#:key outputs #:allow-other-keys)
310 ;; Make sure all executables in "bin" find the required Perl
311 ;; modules at runtime. As the PERL5LIB variable also contains
312 ;; the paths of native inputs, we pick the transitive target
313 ;; inputs from %build-inputs.
314 (let* ((out (assoc-ref outputs "out"))
315 (bin (string-append out "/bin/"))
317 (cons (string-append out "/lib/perl5/site_perl")
319 (assoc-ref %build-inputs name))
320 ',transitive-inputs))
322 (for-each (lambda (file)
324 `("PERL5LIB" ":" prefix (,path))))
325 (find-files bin "\\.pl$"))
329 `(("perl-test-most" ,perl-test-most)))
330 (home-page "http://search.cpan.org/dist/BioPerl")
331 (synopsis "Bioinformatics toolkit")
333 "BioPerl is the product of a community effort to produce Perl code which
334 is useful in biology. Examples include Sequence objects, Alignment objects
335 and database searching objects. These objects not only do what they are
336 advertised to do in the documentation, but they also interact - Alignment
337 objects are made from the Sequence objects, Sequence objects have access to
338 Annotation and SeqFeature objects and databases, Blast objects can be
339 converted to Alignment objects, and so on. This means that the objects
340 provide a coordinated and extensible framework to do computational biology.")
341 (license (package-license perl)))))
;; Biopython, built with python-build-system from the release tarball
;; published under biopython.org/DIST.
343 (define-public python-biopython
345 (name "python-biopython")
350 "http://biopython.org/DIST/biopython-"
354 "13m8s9jkrw40zvdp1rl709n6lmgdh4f52aann7gzr6sfp0fwhg26"))))
355 (build-system python-build-system)
357 `(("python-numpy" ,python-numpy)))
;; Use the setuptools variant that matches python-numpy above instead of the
;; Python 2 one.  python2-biopython is derived from this package through
;; package-with-python2, so this definition should only name default-Python
;; packages.
359 `(("python-setuptools" ,python-setuptools)))
360 (home-page "http://biopython.org/")
361 (synopsis "Tools for biological computation in Python")
363 "Biopython is a set of tools for biological computation including parsers
364 for bioinformatics files into Python data structures; interfaces to common
365 bioinformatics programs; a standard sequence class and tools for performing
366 common operations on them; code to perform data classification; code for
367 dealing with alignments; code making it easy to split up parallelizable tasks
368 into separate processes; and more.")
369 (license (license:non-copyleft "http://www.biopython.org/DIST/LICENSE"))))
371 (define-public python2-biopython
372 (package (inherit (package-with-python2 python-biopython))
374 `(("python2-numpy" ,python2-numpy)))))
376 (define-public blast+
383 "ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/"
384 version "/ncbi-blast-" version "+-src.tar.gz"))
387 "19gq6as4k1jrgsd26158ads6h7v4jca3h4r5dzg1y0m6ya50x5ph"))
388 (modules '((guix build utils)))
391 ;; Remove bundled bzip2 and zlib
392 (delete-file-recursively "c++/src/util/compress/bzip2")
393 (delete-file-recursively "c++/src/util/compress/zlib")
394 (substitute* "c++/src/util/compress/Makefile.in"
395 (("bzip2 zlib api") "api"))
396 ;; Remove useless msbuild directory
397 (delete-file-recursively
398 "c++/src/build-system/project_tree_builder/msbuild")
400 (build-system gnu-build-system)
402 `(;; There are three(!) tests for this massive library, and all fail with
403 ;; "unparsable timing stats".
404 ;; ERR [127] -- [util/regexp] test_pcre.sh (unparsable timing stats)
405 ;; ERR [127] -- [serial/datatool] datatool.sh (unparsable timing stats)
406 ;; ERR [127] -- [serial/datatool] datatool_xml.sh (unparsable timing stats)
409 #:parallel-build? #f ; not supported
411 (modify-phases %standard-phases
414 ;; $HOME needs to be set at some point during the configure phase
415 (lambda _ (setenv "HOME" "/tmp") #t))
418 (lambda _ (chdir "c++") #t))
420 'enter-dir 'fix-build-system
423 (cond ((string=? cmd "date")
424 ;; make call to "date" deterministic
429 (format (current-error-port)
430 "WARNING: Unable to find absolute path for ~s~%"
434 ;; Rewrite hardcoded paths to various tools
435 (substitute* (append '("src/build-system/configure.ac"
436 "src/build-system/configure"
437 "scripts/common/impl/if_diff.sh"
438 "scripts/common/impl/run_with_lock.sh"
439 "src/build-system/Makefile.configurables.real"
440 "src/build-system/Makefile.in.top"
441 "src/build-system/Makefile.meta.gmake=no"
442 "src/build-system/Makefile.meta.in"
443 "src/build-system/Makefile.meta_l"
444 "src/build-system/Makefile.meta_p"
445 "src/build-system/Makefile.meta_r"
446 "src/build-system/Makefile.mk.in"
447 "src/build-system/Makefile.requirements"
448 "src/build-system/Makefile.rules_with_autodep.in")
449 (find-files "scripts/common/check" "\\.sh$"))
450 (("(/usr/bin/|/bin/)([a-z][-_.a-z]*)" all dir cmd)
451 (or (which* cmd) all)))
453 (substitute* (find-files "src/build-system" "^config.*")
454 (("LN_S=/bin/\\$LN_S") (string-append "LN_S=" (which "ln")))
457 ;; rewrite "/var/tmp" in check script
458 (substitute* "scripts/common/check/check_make_unix.sh"
459 (("/var/tmp") "/tmp"))
462 (substitute* (find-files "scripts/common/impl/" "\\.sh$")
464 (("action=/bin/") "action=")
465 (("export PATH") ":"))
469 (lambda* (#:key inputs outputs #:allow-other-keys)
470 (let ((out (assoc-ref outputs "out"))
471 (lib (string-append (assoc-ref outputs "lib") "/lib"))
472 (include (string-append (assoc-ref outputs "include")
473 "/include/ncbi-tools++")))
474 ;; The 'configure' script doesn't recognize things like
475 ;; '--enable-fast-install'.
476 (zero? (system* "./configure.orig"
477 (string-append "--with-build-root=" (getcwd) "/build")
478 (string-append "--prefix=" out)
479 (string-append "--libdir=" lib)
480 (string-append "--includedir=" include)
481 (string-append "--with-bz2="
482 (assoc-ref inputs "bzip2"))
483 (string-append "--with-z="
484 (assoc-ref inputs "zlib"))
485 ;; Each library is built twice by default, once
486 ;; with "-static" in its name, and again
490 (outputs '("out" ; 19 MB
498 (home-page "http://blast.ncbi.nlm.nih.gov")
499 (synopsis "Basic local alignment search tool")
501 "BLAST is a popular method of performing a DNA or protein sequence
502 similarity search, using heuristics to produce results quickly. It also
503 calculates an “expect value” that estimates how many matches would have
504 occurred at a given score by chance, which can aid a user in judging how much
505 confidence to have in an alignment.")
506 ;; Most of the sources are in the public domain, with the following
509 ;; * ./c++/include/util/bitset/
510 ;; * ./c++/src/html/ncbi_menu*.js
512 ;; * ./c++/include/util/impl/floating_point_comparison.hpp
514 ;; * ./c++/include/dbapi/driver/odbc/unix_odbc/
516 ;; * ./c++/src/corelib/teamcity_*
517 (license (list license:public-domain
523 (define-public bowtie
529 (uri (string-append "https://github.com/BenLangmead/bowtie2/archive/v"
531 (file-name (string-append name "-" version ".tar.gz"))
534 "15dnbqippwvhyh9zqjhaxkabk7lm1xbh1nvar1x4b5kwm117zijn"))
535 (modules '((guix build utils)))
537 '(substitute* "Makefile"
538 (("^CC = .*$") "CC = gcc")
539 (("^CPP = .*$") "CPP = g++")
540 ;; replace BUILD_HOST and BUILD_TIME for deterministic build
541 (("-DBUILD_HOST=.*") "-DBUILD_HOST=\"\\\"guix\\\"\"")
542 (("-DBUILD_TIME=.*") "-DBUILD_TIME=\"\\\"0\\\"\"")))
543 (patches (list (search-patch "bowtie-fix-makefile.patch")))))
544 (build-system gnu-build-system)
545 (inputs `(("perl" ,perl)
546 ("perl-clone" ,perl-clone)
547 ("perl-test-deep" ,perl-test-deep)
548 ("perl-test-simple" ,perl-test-simple)
549 ("python" ,python-2)))
551 '(#:make-flags '("allall")
557 (lambda* (#:key outputs #:allow-other-keys)
558 (let ((bin (string-append (assoc-ref outputs "out") "/bin/")))
560 (for-each (lambda (file)
561 (copy-file file (string-append bin file)))
562 (find-files "." "bowtie2.*"))))
565 (lambda* (#:key outputs #:allow-other-keys)
567 "scripts/test/simple_tests.pl"
568 "--bowtie2=./bowtie2"
569 "--bowtie2-build=./bowtie2-build"))
570 %standard-phases)))))
571 (home-page "http://bowtie-bio.sourceforge.net/bowtie2/index.shtml")
572 (synopsis "Fast and sensitive nucleotide sequence read aligner")
574 "Bowtie 2 is a fast and memory-efficient tool for aligning sequencing
575 reads to long reference sequences. It is particularly good at aligning reads
576 of about 50 up to 100s or 1,000s of characters, and particularly good at
577 aligning to relatively long (e.g. mammalian) genomes. Bowtie 2 indexes the
578 genome with an FM Index to keep its memory footprint small: for the human
579 genome, its memory footprint is typically around 3.2 GB. Bowtie 2 supports
580 gapped, local, and paired-end alignment modes.")
581 (supported-systems '("x86_64-linux"))
582 (license license:gpl3+)))
590 (uri (string-append "mirror://sourceforge/bio-bwa/bwa-"
594 "1330dpqncv0px3pbhjzz1gwgg39kkcv2r9qp2xs0sixf8z8wl7bh"))))
595 (build-system gnu-build-system)
597 '(#:tests? #f ;no "check" target
601 (lambda* (#:key outputs #:allow-other-keys)
602 (let ((bin (string-append
603 (assoc-ref outputs "out") "/bin"))
605 (assoc-ref outputs "out") "/share/doc/bwa"))
607 (assoc-ref outputs "out") "/share/man/man1")))
611 (copy-file "bwa" (string-append bin "/bwa"))
612 (copy-file "README.md" (string-append doc "/README.md"))
613 (copy-file "bwa.1" (string-append man "/bwa.1"))))
614 ;; no "configure" script
615 (alist-delete 'configure %standard-phases))))
616 (inputs `(("zlib" ,zlib)))
617 (home-page "http://bio-bwa.sourceforge.net/")
618 (synopsis "Burrows-Wheeler sequence aligner")
620 "BWA is a software package for mapping low-divergent sequences against a
621 large reference genome, such as the human genome. It consists of three
622 algorithms: BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is
623 designed for Illumina sequence reads up to 100bp, while the rest two for
624 longer sequences ranged from 70bp to 1Mbp. BWA-MEM and BWA-SW share similar
625 features such as long-read support and split alignment, but BWA-MEM, which is
626 the latest, is generally recommended for high-quality queries as it is faster
627 and more accurate. BWA-MEM also has better performance than BWA-backtrack for
628 70-100bp Illumina reads.")
629 (license license:gpl3+)))
631 (define-public python2-bx-python
633 (name "python2-bx-python")
638 "https://pypi.python.org/packages/source/b/bx-python/bx-python-"
642 "0ld49idhc5zjdvbhvjq1a2qmpjj7h5v58rqr25dzmfq7g34b50xh"))
643 (modules '((guix build utils)))
645 '(substitute* "setup.py"
646 ;; remove dependency on outdated "distribute" module
647 (("^from distribute_setup import use_setuptools") "")
648 (("^use_setuptools\\(\\)") "")))))
649 (build-system python-build-system)
651 `(#:tests? #f ;tests fail because test data are not included
654 `(("python-numpy" ,python2-numpy)
657 `(("python-nose" ,python2-nose)
658 ("python-setuptools" ,python2-setuptools)))
659 (home-page "http://bitbucket.org/james_taylor/bx-python/")
660 (synopsis "Tools for manipulating biological data")
662 "bx-python provides tools for manipulating biological data, particularly
663 multiple sequence alignments.")
664 (license license:expat)))
666 (define-public clipper
673 "https://github.com/YeoLab/clipper/archive/"
677 "1q7jpimsqln7ic44i8v2rx2haj5wvik8hc1s2syd31zcn0xk1iyq"))
678 (modules '((guix build utils)))
680 ;; remove unnecessary setup dependency
681 '(substitute* "setup.py"
682 (("setup_requires = .*") "")))))
683 (build-system python-build-system)
684 (arguments `(#:python ,python-2)) ; only Python 2 is supported
687 ("python-pybedtools" ,python2-pybedtools)
688 ("python-cython" ,python2-cython)
689 ("python-scikit-learn" ,python2-scikit-learn)
690 ("python-matplotlib" ,python2-matplotlib)
691 ("python-pysam" ,python2-pysam)
692 ("python-numpy" ,python2-numpy)
693 ("python-scipy" ,python2-scipy)))
695 `(("python-mock" ,python2-mock) ; for tests
696 ("python-pytz" ,python2-pytz) ; for tests
697 ("python-setuptools" ,python2-setuptools)))
698 (home-page "https://github.com/YeoLab/clipper")
699 (synopsis "CLIP peak enrichment recognition")
701 "CLIPper is a tool to define peaks in CLIP-seq datasets.")
702 (license license:gpl2)))
704 (define-public couger
711 "http://couger.oit.duke.edu/static/assets/COUGER"
715 "04p2b14nmhzxw5h72mpzdhalv21bx4w9b87z0wpw0xzxpysyncmq"))))
716 (build-system gnu-build-system)
720 (modify-phases %standard-phases
725 (lambda* (#:key outputs #:allow-other-keys)
726 (let ((out (assoc-ref outputs "out")))
727 (copy-recursively "src" (string-append out "/src"))
728 (mkdir (string-append out "/bin"))
729 ;; Add "src" directory to module lookup path.
730 (substitute* "couger"
732 (string-append "import sys\nsys.path.append(\""
733 out "\")\nfrom argparse")))
734 (copy-file "couger" (string-append out "/bin/couger")))
737 'install 'wrap-program
738 (lambda* (#:key inputs outputs #:allow-other-keys)
739 ;; Make sure 'couger' runs with the correct PYTHONPATH.
740 (let* ((out (assoc-ref outputs "out"))
741 (path (getenv "PYTHONPATH")))
742 (wrap-program (string-append out "/bin/couger")
743 `("PYTHONPATH" ":" prefix (,path))))
746 `(("python" ,python-2)
747 ("python2-pillow" ,python2-pillow)
748 ("python2-numpy" ,python2-numpy)
749 ("python2-scipy" ,python2-scipy)
750 ("python2-matplotlib" ,python2-matplotlib)))
754 ("randomjungle" ,randomjungle)))
757 (home-page "http://couger.oit.duke.edu")
758 (synopsis "Identify co-factors in sets of genomic regions")
760 "COUGER can be applied to any two sets of genomic regions bound by
761 paralogous TFs (e.g., regions derived from ChIP-seq experiments) to identify
762 putative co-factors that provide specificity to each TF. The framework
763 determines the genomic targets uniquely-bound by each TF, and identifies a
764 small set of co-factors that best explain the in vivo binding differences
767 COUGER uses classification algorithms (support vector machines and random
768 forests) with features that reflect the DNA binding specificities of putative
769 co-factors. The features are generated either from high-throughput TF-DNA
770 binding data (from protein binding microarray experiments), or from large
771 collections of DNA motifs.")
772 (license license:gpl3+)))
774 (define-public clustal-omega
776 (name "clustal-omega")
781 "http://www.clustal.org/omega/clustal-omega-"
785 "02ibkx0m0iwz8nscg998bh41gg251y56cgh86bvyrii5m8kjgwqf"))))
786 (build-system gnu-build-system)
788 `(("argtable" ,argtable)))
789 (home-page "http://www.clustal.org/omega/")
790 (synopsis "Multiple sequence aligner for protein and DNA/RNA")
792 "Clustal-Omega is a general purpose multiple sequence alignment (MSA)
793 program for protein and DNA/RNA. It produces high quality MSAs and is capable
794 of handling data-sets of hundreds of thousands of sequences in reasonable
796 (license license:gpl2+)))
798 (define-public crossmap
804 (uri (string-append "mirror://sourceforge/crossmap/CrossMap-"
808 "163hi5gjgij6cndxlvbkp5jjwr0k4wbm9im6d2210278q7k9kpnp"))
809 ;; patch has been sent upstream already
811 (search-patch "crossmap-allow-system-pysam.patch")))
812 (modules '((guix build utils)))
813 ;; remove bundled copy of pysam
815 '(delete-file-recursively "lib/pysam"))))
816 (build-system python-build-system)
822 (lambda _ (setenv "CROSSMAP_USE_SYSTEM_PYSAM" "1"))
825 `(("python-numpy" ,python2-numpy)
826 ("python-pysam" ,python2-pysam)
829 `(("python-cython" ,python2-cython)
830 ("python-nose" ,python2-nose)
831 ("python-setuptools" ,python2-setuptools)))
832 (home-page "http://crossmap.sourceforge.net/")
833 (synopsis "Convert genome coordinates between assemblies")
835 "CrossMap is a program for conversion of genome coordinates or annotation
836 files between different genome assemblies. It supports most commonly used
837 file formats including SAM/BAM, Wiggle/BigWig, BED, GFF/GTF, VCF.")
838 (license license:gpl2+)))
840 (define-public cutadapt
847 "https://github.com/marcelm/cutadapt/archive/v"
849 (file-name (string-append name "-" version ".tar.gz"))
852 "161bp87y6gd6r5bmvjpn2b1k942i3fizfpa139f0jn6jv1wcp5h5"))))
853 (build-system python-build-system)
855 ;; tests must be run after install
856 `(#:phases (alist-cons-after
858 (lambda* (#:key inputs outputs #:allow-other-keys)
861 (getenv "PYTHONPATH")
862 ":" (assoc-ref outputs "out")
864 (string-take (string-take-right
865 (assoc-ref inputs "python") 5) 3)
867 (zero? (system* "nosetests" "-P" "tests")))
868 (alist-delete 'check %standard-phases))))
870 `(("python-cython" ,python-cython)
871 ("python-nose" ,python-nose)
872 ("python-setuptools" ,python-setuptools)))
873 (home-page "https://code.google.com/p/cutadapt/")
874 (synopsis "Remove adapter sequences from nucleotide sequencing reads")
876 "Cutadapt finds and removes adapter sequences, primers, poly-A tails and
877 other types of unwanted sequence from high-throughput sequencing reads.")
878 (license license:expat)))
880 (define-public deeptools
887 "https://github.com/fidelram/deepTools/archive/"
889 (file-name (string-append name "-" version ".tar.gz"))
892 "1kaagygcbvjs9sxd9cqmskd02wcfp9imvb735r087w7hwqpvz6fs"))))
893 (build-system python-build-system)
895 `(#:python ,python-2))
897 `(("python-scipy" ,python2-scipy)
898 ("python-numpy" ,python2-numpy)
899 ("python-matplotlib" ,python2-matplotlib)
900 ("python-bx-python" ,python2-bx-python)
901 ("python-pysam" ,python2-pysam)))
903 `(("python-mock" ,python2-mock) ;for tests
904 ("python-pytz" ,python2-pytz) ;for tests
905 ("python-setuptools" ,python2-setuptools)))
906 (home-page "https://github.com/fidelram/deepTools")
907 (synopsis "Tools for normalizing and visualizing deep-sequencing data")
909 "DeepTools addresses the challenge of handling the large amounts of data
910 that are now routinely generated from DNA sequencing centers. To do so,
911 deepTools contains useful modules to process the mapped reads data to create
912 coverage files in standard bedGraph and bigWig file formats. By doing so,
913 deepTools allows the creation of normalized coverage files or the comparison
914 between two files (for example, treatment and control). Finally, using such
915 normalized and standardized files, multiple visualizations can be created to
916 identify enrichments with functional annotations of the genome.")
917 (license license:gpl3+)))
919 (define-public diamond
926 "https://github.com/bbuchfink/diamond/archive/v"
928 (file-name (string-append name "-" version ".tar.gz"))
931 "0hfkcfv9f76h5brbyw9fyvmc0l9cmbsxrcdqk0fa9xv82zj47p15"))
933 (delete-file "bin/diamond")
935 (build-system gnu-build-system)
937 '(#:tests? #f ;no "check" target
939 (modify-phases %standard-phases
940 (add-after 'unpack 'enter-source-dir
946 (lambda* (#:key outputs #:allow-other-keys)
947 (let ((bin (string-append (assoc-ref outputs "out")
950 (copy-file "../bin/diamond"
951 (string-append bin "/diamond"))
958 (home-page "https://github.com/bbuchfink/diamond")
959 (synopsis "Accelerated BLAST compatible local sequence aligner")
961 "DIAMOND is a BLAST-compatible local aligner for mapping protein and
962 translated DNA query sequences against a protein reference database (BLASTP
963 and BLASTX alignment mode). The speedup over BLAST is up to 20,000 on short
964 reads at a typical sensitivity of 90-99% relative to BLAST depending on the
966 (license (license:non-copyleft "file://src/COPYING"
967 "See src/COPYING in the distribution."))))
969 (define-public edirect
975 ;; Note: upstream does not retain older versions at this URL.
976 (uri "ftp://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/edirect.zip")
979 "08afhz2ph66h8h381hl1mqyxkdi5nbvzsyj9gfw3jfbdijnpi4qj"))))
980 (build-system perl-build-system)
982 `(#:tests? #f ;no "check" target
984 (modify-phases %standard-phases
988 (lambda* (#:key outputs #:allow-other-keys)
989 (let ((target (string-append (assoc-ref outputs "out")
992 (copy-file "edirect.pl"
993 (string-append target "/edirect.pl"))
996 'install 'wrap-program
997 (lambda* (#:key inputs outputs #:allow-other-keys)
998 ;; Make sure 'edirect.pl' finds all perl inputs at runtime.
999 (let* ((out (assoc-ref outputs "out"))
1000 (path (getenv "PERL5LIB")))
1001 (wrap-program (string-append out "/bin/edirect.pl")
1002 `("PERL5LIB" ":" prefix (,path)))))))))
1004 `(("perl-html-parser" ,perl-html-parser)
1005 ("perl-encode-locale" ,perl-encode-locale)
1006 ("perl-file-listing" ,perl-file-listing)
1007 ("perl-html-tagset" ,perl-html-tagset)
1008 ("perl-html-tree" ,perl-html-tree)
1009 ("perl-http-cookies" ,perl-http-cookies)
1010 ("perl-http-date" ,perl-http-date)
1011 ("perl-http-message" ,perl-http-message)
1012 ("perl-http-negotiate" ,perl-http-negotiate)
1013 ("perl-lwp-mediatypes" ,perl-lwp-mediatypes)
1014 ("perl-lwp-protocol-https" ,perl-lwp-protocol-https)
1015 ("perl-net-http" ,perl-net-http)
1016 ("perl-uri" ,perl-uri)
1017 ("perl-www-robotrules" ,perl-www-robotrules)
1020 `(("unzip" ,unzip)))
1021 (home-page "http://www.ncbi.nlm.nih.gov/books/NBK179288")
1022 (synopsis "Tools for accessing the NCBI's set of databases")
1024 "Entrez Direct (EDirect) is a method for accessing the National Center
1025 for Biotechnology Information's (NCBI) set of interconnected
1026 databases (publication, sequence, structure, gene, variation, expression,
1027 etc.) from a terminal. Functions take search terms from command-line
1028 arguments. Individual operations are combined to build multi-step queries.
1029 Record retrieval and formatting normally complete the process.
1031 EDirect also provides an argument-driven function that simplifies the
1032 extraction of data from document summaries or other results that are returned
1033 in structured XML format. This can eliminate the need for writing custom
1034 software to answer ad hoc questions.")
1035 (license license:public-domain)))
;; eXpress package, built with cmake-build-system.  Tests are disabled
;; because there is no 'check' target.  A phase registered relative to
;; 'unpack rewrites CMakeLists.txt so that Boost is not linked statically
;; and the in-tree bamtools include path is replaced by a directory of the
;; 'bamtools' input; the bamtools library path in src/CMakeLists.txt is
;; redirected in the same way.
1037 (define-public express
1045 "http://bio.math.berkeley.edu/eXpress/downloads/express-"
1046 version "/express-" version "-src.tgz"))
1049 "03rczxd0gjp2l1jxcmjfmf5j94j77zqyxa6x063zsc585nj40n0c"))))
1050 (build-system cmake-build-system)
1052 `(#:tests? #f ;no "check" target
1055 'unpack 'use-shared-boost-libs-and-set-bamtools-paths
1056 (lambda* (#:key inputs #:allow-other-keys)
1057 (substitute* "CMakeLists.txt"
1058 (("set\\(Boost_USE_STATIC_LIBS ON\\)")
1059 "set(Boost_USE_STATIC_LIBS OFF)")
1060 (("\\$\\{CMAKE_CURRENT_SOURCE_DIR\\}/bamtools/include")
1061 (string-append (assoc-ref inputs "bamtools") "/include/bamtools")))
1062 (substitute* "src/CMakeLists.txt"
1063 (("\\$\\{CMAKE_CURRENT_SOURCE_DIR\\}/\\.\\./bamtools/lib")
1064 (string-append (assoc-ref inputs "bamtools") "/lib/bamtools")))
1069 ("bamtools" ,bamtools)
1070 ("protobuf" ,protobuf)
1072 (home-page "http://bio.math.berkeley.edu/eXpress")
1073 (synopsis "Streaming quantification for high-throughput genomic sequencing")
1075 "eXpress is a streaming tool for quantifying the abundances of a set of
1076 target sequences from sampled subsequences. Example applications include
1077 transcript-level RNA-Seq quantification, allele-specific/haplotype expression
1078 analysis (from RNA-Seq), transcription factor binding quantification in
1079 ChIP-Seq, and analysis of metagenomic data.")
1080 (license license:artistic2.0)))
;; FastTree package, built with gnu-build-system.  There is no 'check'
;; target.  The standard phases are modified: one phase invokes gcc twice
;; (both invocations pass -finline-functions) and must see both succeed,
;; and another phase copies the FastTree and FastTreeMP executables into
;; the bin directory of the 'out' output.
1082 (define-public fasttree
1089 "http://www.microbesonline.org/fasttree/FastTree-"
1093 "0dzqc9vr9iiiw21y159xfjl2z90vw0y7r4x6456pcaxiy5hd2wmi"))))
1094 (build-system gnu-build-system)
1096 `(#:tests? #f ; no "check" target
1098 (modify-phases %standard-phases
1102 (lambda* (#:key source #:allow-other-keys)
1103 (and (zero? (system* "gcc"
1105 "-finline-functions"
1112 (zero? (system* "gcc"
1116 "-finline-functions"
1124 (lambda* (#:key outputs #:allow-other-keys)
1125 (let ((bin (string-append (assoc-ref outputs "out")
1128 (copy-file "FastTree"
1129 (string-append bin "/FastTree"))
1130 (copy-file "FastTreeMP"
1131 (string-append bin "/FastTreeMP"))
1133 (home-page "http://www.microbesonline.org/fasttree")
1134 (synopsis "Infers approximately-maximum-likelihood phylogenetic trees")
1136 "FastTree can handle alignments with up to a million sequences in a
1137 reasonable amount of time and memory. For large alignments, FastTree is
1138 100-1,000 times faster than PhyML 3.0 or RAxML 7.")
1139 (license license:gpl2+)))
;; FASTX-Toolkit package: release tarball from GitHub, built with
;; gnu-build-system.  It references libgtextutils and pkg-config.
;; NOTE(review): the field names holding those two lists are not visible
;; here; presumably inputs and native-inputs respectively -- confirm.
1141 (define-public fastx-toolkit
1143 (name "fastx-toolkit")
1149 "https://github.com/agordon/fastx_toolkit/releases/download/"
1150 version "/fastx_toolkit-" version ".tar.bz2"))
1153 "01jqzw386873sr0pjp1wr4rn8fsga2vxs1qfmicvx1pjr72007wy"))))
1154 (build-system gnu-build-system)
1156 `(("libgtextutils" ,libgtextutils)))
1158 `(("pkg-config" ,pkg-config)))
1159 (home-page "http://hannonlab.cshl.edu/fastx_toolkit/")
1160 (synopsis "Tools for FASTA/FASTQ file preprocessing")
1162 "The FASTX-Toolkit is a collection of command line tools for Short-Reads
1163 FASTA/FASTQ files preprocessing.
1165 Next-Generation sequencing machines usually produce FASTA or FASTQ files,
1166 containing multiple short-reads sequences. The main processing of such
1167 FASTA/FASTQ files is mapping the sequences to reference genomes. However, it
1168 is sometimes more productive to preprocess the files before mapping the
1169 sequences to the genome---manipulating the sequences to produce better mapping
1170 results. The FASTX-Toolkit tools perform some of these preprocessing tasks.")
1171 (license license:agpl3+)))
;; Flexbar package, built with cmake-build-system.  FLEXBAR_BINARY_DIR is
;; derived from the 'out' output and the standard 'install phase is
;; deleted.  A custom phase sets PATH so that it starts with out/bin,
;; enters the test directory of the source tree and runs
;; flexbar_validate.sh with bash.
;; NOTE(review): the test directory name hard-codes version 2.5
;; (flexbar_v2.5_src); it presumably has to be edited on every version
;; bump -- consider deriving it from the version field.
1173 (define-public flexbar
1180 (string-append "mirror://sourceforge/flexbar/"
1181 version "/flexbar_v" version "_src.tgz"))
1184 "13jaykc3y1x8y5nn9j8ljnb79s5y51kyxz46hdmvvjj6qhyympmf"))))
1185 (build-system cmake-build-system)
1187 `(#:configure-flags (list
1188 (string-append "-DFLEXBAR_BINARY_DIR="
1189 (assoc-ref %outputs "out")
1194 (lambda* (#:key outputs #:allow-other-keys)
1195 (setenv "PATH" (string-append
1196 (assoc-ref outputs "out") "/bin:"
1198 (chdir "../flexbar_v2.5_src/test")
1199 (zero? (system* "bash" "flexbar_validate.sh")))
1200 (alist-delete 'install %standard-phases))))
1205 `(("pkg-config" ,pkg-config)
1207 (home-page "http://flexbar.sourceforge.net")
1208 (synopsis "Barcode and adapter removal tool for sequencing platforms")
1210 "Flexbar preprocesses high-throughput nucleotide sequencing data
1211 efficiently. It demultiplexes barcoded runs and removes adapter sequences.
1212 Moreover, trimming and filtering features are provided. Flexbar increases
1213 read mapping rates and improves genome and transcriptome assemblies. It
1214 supports next-generation sequencing data in fasta/q and csfasta/q format from
1215 Illumina, Roche 454, and the SOLiD platform.")
1216 (license license:gpl3)))
;; GRIT package (Python 2, python-build-system), fetched from a GitHub
;; archive.  A phase registered relative to 'unpack deletes two
;; pre-generated C files so that they are regenerated from the pyx sources,
;; rewrites Cython.Setup to Cython.Build in setup.py, and adds the numpy
;; include directory of the python-numpy input to include_dirs.
;; NOTE(review): the include path hard-codes lib/python2.7.
1225 "https://github.com/nboley/grit/archive/"
1227 (file-name (string-append name "-" version ".tar.gz"))
1230 "157in84dj70wimbind3x7sy1whs3h57qfgcnj2s6lrd38fbrb7mj"))))
1231 (build-system python-build-system)
1233 `(#:python ,python-2
1236 'unpack 'generate-from-cython-sources
1237 (lambda* (#:key inputs outputs #:allow-other-keys)
1238 ;; Delete these C files to force fresh generation from pyx sources.
1239 (delete-file "grit/sparsify_support_fns.c")
1240 (delete-file "grit/call_peaks_support_fns.c")
1241 (substitute* "setup.py"
1242 (("Cython.Setup") "Cython.Build")
1243 ;; Add numpy include path to fix compilation
1245 (string-append "pyx\", ], include_dirs = ['"
1246 (assoc-ref inputs "python-numpy")
1247 "/lib/python2.7/site-packages/numpy/core/include/"
1251 `(("python-scipy" ,python2-scipy)
1252 ("python-numpy" ,python2-numpy)
1253 ("python-pysam" ,python2-pysam)
1254 ("python-networkx" ,python2-networkx)))
1256 `(("python-cython" ,python2-cython)
1257 ("python-setuptools" ,python2-setuptools)))
1258 (home-page "http://grit-bio.org")
1259 (synopsis "Tool for integrative analysis of RNA-seq type assays")
1261 "GRIT is designed to use RNA-seq, TES, and TSS data to build and quantify
1262 full length transcript models. When none of these data sources are available,
1263 GRIT can be run by providing a candidate set of TES or TSS sites. In
1264 addition, GRIT can merge in reference junctions and gene boundaries. GRIT can
1265 also be run in quantification mode, where it uses a provided GTF file and just
1266 estimates transcript expression.")
1267 (license license:gpl3+)))
;; HISAT package, built with gnu-build-system from a zip archive (unzip is
;; listed as a dependency).  There is no check target and the 'configure
;; phase is removed.  POPCNT_CAPABILITY=0 is added to the make flags
;; depending on whether the (target) system string starts with x86_64.
;; A phase registered relative to 'unpack patches the Makefile (compiler
;; names, fixed BUILD_HOST and BUILD_TIME for deterministic builds) and
;; the /usr/bin/env references in hisat-build and hisat-inspect.  A custom
;; install step copies files selected by the hisat regexp below into
;; out/bin/.
1269 (define-public hisat
1276 "http://ccb.jhu.edu/software/hisat/downloads/hisat-"
1277 version "-beta-source.zip"))
1280 "1k381ydranqxp09yf2y7w1d0chz5d59vb6jchi89hbb0prq19lk5"))))
1281 (build-system gnu-build-system)
1283 `(#:tests? #f ;no check target
1284 #:make-flags '("allall"
1285 ;; Disable unsupported `popcnt' instructions on
1286 ;; architectures other than x86_64
1287 ,@(if (string-prefix? "x86_64"
1288 (or (%current-target-system)
1291 '("POPCNT_CAPABILITY=0")))
1294 'unpack 'patch-sources
1296 ;; XXX Cannot use snippet because zip files are not supported
1297 (substitute* "Makefile"
1298 (("^CC = .*$") "CC = gcc")
1299 (("^CPP = .*$") "CPP = g++")
1300 ;; replace BUILD_HOST and BUILD_TIME for deterministic build
1301 (("-DBUILD_HOST=.*") "-DBUILD_HOST=\"\\\"guix\\\"\"")
1302 (("-DBUILD_TIME=.*") "-DBUILD_TIME=\"\\\"0\\\"\""))
1303 (substitute* '("hisat-build" "hisat-inspect")
1304 (("/usr/bin/env") (which "env"))))
1307 (lambda* (#:key outputs #:allow-other-keys)
1308 (let ((bin (string-append (assoc-ref outputs "out") "/bin/")))
1312 (copy-file file (string-append bin file)))
1315 "hisat(-(build|align|inspect)(-(s|l)(-debug)*)*)*$"))))
1316 (alist-delete 'configure %standard-phases)))))
1318 `(("unzip" ,unzip)))
1323 (home-page "http://ccb.jhu.edu/software/hisat/index.shtml")
1324 (synopsis "Hierarchical indexing for spliced alignment of transcripts")
1326 "HISAT is a fast and sensitive spliced alignment program for mapping
1327 RNA-seq reads. In addition to one global FM index that represents a whole
1328 genome, HISAT uses a large set of small FM indexes that collectively cover the
1329 whole genome. These small indexes (called local indexes) combined with
1330 several alignment strategies enable effective alignment of RNA-seq reads, in
1331 particular, reads spanning multiple exons.")
1332 (license license:gpl3+)))
;; HMMER package, built with gnu-build-system; perl is a native input.
;; The license field lists GPLv3+ together with a non-copyleft entry for
;; the bundled easel library, which ships its own license file.
1334 (define-public hmmer
1341 "http://selab.janelia.org/software/hmmer"
1342 (version-prefix version 1) "/"
1343 version "/hmmer-" version ".tar.gz"))
1346 "0djmgc0pfli0jilfx8hql1axhwhqxqb8rxg2r5rg07aw73sfs5nx"))))
1347 (build-system gnu-build-system)
1348 (native-inputs `(("perl" ,perl)))
1349 (home-page "http://hmmer.janelia.org")
1350 (synopsis "Biosequence analysis using profile hidden Markov models")
1352 "HMMER is used for searching sequence databases for homologs of protein
1353 sequences, and for making protein sequence alignments. It implements methods
1354 using probabilistic models called profile hidden Markov models (profile
1356 (license (list license:gpl3+
1357 ;; The bundled library 'easel' is distributed
1358 ;; under The Janelia Farm Software License.
1359 (license:non-copyleft
1360 "file://easel/LICENSE"
1361 "See easel/LICENSE in the distribution.")))))
;; HTSeq package from PyPI, built with python-build-system against
;; Python 2 (the only supported version); it references python2-numpy and
;; python2-setuptools.
1363 (define-public htseq
1370 "https://pypi.python.org/packages/source/H/HTSeq/HTSeq-"
1374 "1i85ppf2j2lj12m0x690qq5nn17xxk23pbbx2c83r8ayb5wngzwv"))))
1375 (build-system python-build-system)
1376 (arguments `(#:python ,python-2)) ; only Python 2 is supported
1378 `(("python-numpy" ,python2-numpy)
1379 ("python-setuptools" ,python2-setuptools)))
1380 (home-page "http://www-huber.embl.de/users/anders/HTSeq/")
1381 (synopsis "Analysing high-throughput sequencing data with Python")
1383 "HTSeq is a Python package that provides infrastructure to process data
1384 from high-throughput sequencing assays.")
1385 (license license:gpl3+)))
;; HTSJDK package.  gnu-build-system is used only as a scaffold: one phase
;; is replaced by a procedure that sets JAVA_HOME to the 'jdk' input and
;; runs 'ant all' with -Ddist pointing at out/share/java/htsjdk/, while the
;; configure, install and check phases are removed.  The origin snippet
;; flips failifexecutionfails to false in build.xml to remove the build
;; dependency on git.
1387 (define-public htsjdk
1394 "https://github.com/samtools/htsjdk/archive/"
1396 (file-name (string-append name "-" version ".tar.gz"))
1399 "0asdk9b8jx2ij7yd6apg9qx03li8q7z3ml0qy2r2qczkra79y6fw"))
1400 (modules '((guix build utils)))
1401 ;; remove build dependency on git
1402 (snippet '(substitute* "build.xml"
1403 (("failifexecutionfails=\"true\"")
1404 "failifexecutionfails=\"false\"")))))
1405 (build-system gnu-build-system)
1407 `(#:modules ((srfi srfi-1)
1408 (guix build gnu-build-system)
1410 #:phases (alist-replace
1413 (setenv "JAVA_HOME" (assoc-ref %build-inputs "jdk"))
1414 (zero? (system* "ant" "all"
1415 (string-append "-Ddist="
1416 (assoc-ref %outputs "out")
1417 "/share/java/htsjdk/"))))
1418 (fold alist-delete %standard-phases
1419 '(configure install check)))))
1422 ("jdk" ,icedtea6 "jdk")))
1423 (home-page "http://samtools.github.io/htsjdk/")
1424 (synopsis "Java API for high-throughput sequencing data (HTS) formats")
1426 "HTSJDK is an implementation of a unified Java library for accessing
1427 common file formats, such as SAM and VCF, used for high-throughput
1428 sequencing (HTS) data. There are also a number of useful utilities for
1429 manipulating HTS data.")
1430 (license license:expat)))
;; HTSlib package, built with gnu-build-system.  A 'patch-tests phase
;; registered relative to 'unpack replaces /bin/bash in test/test.pl with
;; the bash located by 'which'.  The license field lists Expat and BSD-3
;; because files under cram/ use the modified BSD license.
1432 (define-public htslib
1439 "https://github.com/samtools/htslib/releases/download/"
1440 version "/htslib-" version ".tar.bz2"))
1443 "1c32ssscbnjwfw3dra140fq7riarp2x990qxybh34nr1p5r17nxx"))))
1444 (build-system gnu-build-system)
1447 (modify-phases %standard-phases
1449 'unpack 'patch-tests
1451 (substitute* "test/test.pl"
1452 (("/bin/bash") (which "bash")))
1458 (home-page "http://www.htslib.org")
1459 (synopsis "C library for reading/writing high-throughput sequencing data")
1461 "HTSlib is a C library for reading/writing high-throughput sequencing
1462 data. It also provides the bgzip, htsfile, and tabix utilities.")
1463 ;; Files under cram/ are released under the modified BSD license;
1464 ;; the rest is released under the Expat license
1465 (license (list license:expat license:bsd-3))))
;; IDR package, fetched from a GitHub archive and built with
;; python-build-system.  A 'wrap-program phase registered relative to
;; 'install wraps out/bin/idr so that PYTHONPATH is prefixed with a path
;; assembled from lib/python<version> directories of inputs looked up by
;; name.
;; NOTE(review): python-version is obtained by taking the last five
;; characters of the python input's path and keeping the first three; this
;; presumably assumes a version of the form X.Y.Z with single-digit
;; components -- confirm before updating python.
1474 "https://github.com/nboley/idr/archive/"
1476 (file-name (string-append name "-" version ".tar.gz"))
1479 "1k3x44biak00aiv3hpm1yd6nn4hhp7n0qnbs3zh2q9sw7qr1qj5r"))))
1480 (build-system python-build-system)
1483 (modify-phases %standard-phases
1485 'install 'wrap-program
1486 (lambda* (#:key inputs outputs #:allow-other-keys)
1487 (let* ((out (assoc-ref outputs "out"))
1488 (python-version (string-take (string-take-right
1489 (assoc-ref inputs "python") 5) 3))
1492 (string-append (assoc-ref inputs name)
1493 "/lib/python" python-version
1497 "python-matplotlib"))
1499 (wrap-program (string-append out "/bin/idr")
1500 `("PYTHONPATH" ":" prefix (,path))))
1503 `(("python-scipy" ,python-scipy)
1504 ("python-numpy" ,python-numpy)
1505 ("python-matplotlib" ,python-matplotlib)))
1507 `(("python-cython" ,python-cython)
1508 ("python-setuptools" ,python-setuptools)))
1509 (home-page "https://github.com/nboley/idr")
1510 (synopsis "Tool to measure the irreproducible discovery rate (IDR)")
1512 "The IDR (Irreproducible Discovery Rate) framework is a unified approach
1513 to measure the reproducibility of findings identified from replicate
1514 experiments and provide highly stable thresholds based on reproducibility.")
1515 (license license:gpl3+)))
;; MACS2 package (version 2.1.0.20140616) from PyPI, built with
;; python-build-system against Python 2; tests are disabled because there
;; is no test target.  It references python2-numpy and python2-setuptools.
1520 (version "2.1.0.20140616")
1524 "https://pypi.python.org/packages/source/M/MACS2/MACS2-"
1528 "11lmiw6avqhwn75sn59g4lfkrr2kk20r3rgfbx9xfqb8rg9mi2n6"))))
1529 (build-system python-build-system)
1531 `(#:python ,python-2 ; only compatible with Python 2.7
1532 #:tests? #f)) ; no test target
1534 `(("python-numpy" ,python2-numpy)))
1536 `(("python-setuptools" ,python2-setuptools)))
1537 (home-page "http://github.com/taoliu/MACS/")
1538 (synopsis "Model based analysis for ChIP-Seq data")
1540 "MACS is an implementation of a ChIP-Seq analysis algorithm for
1541 identifying transcription factor binding sites named Model-based Analysis of
1542 ChIP-Seq (MACS). MACS captures the influence of genome complexity to evaluate
1543 the significance of enriched ChIP regions and it improves the spatial
1544 resolution of binding sites through combining the information of both
1545 sequencing tag position and orientation.")
1546 (license license:bsd-3)))
;; MAFFT package (source tarball without extensions), built with
;; gnu-build-system.  There are no automated tests and the 'configure
;; phase is deleted.  PREFIX and BINDIR are passed as make flags, both
;; derived from the 'out' output.  After unpacking, the build enters the
;; core directory and patches the Makefile so that mafft-homologs.rb,
;; mafft-distance and mafft-profile are neither built nor installed, and
;; so that man pages are not installed into LIBDIR.
1548 (define-public mafft
1555 "http://mafft.cbrc.jp/alignment/software/mafft-" version
1556 "-without-extensions-src.tgz"))
1557 (file-name (string-append name "-" version ".tgz"))
1560 "0xi7klbsgi049vsrk6jiwh9wfj3b770gz3c8c7zwij448v0dr73d"))))
1561 (build-system gnu-build-system)
1563 `(#:tests? #f ; no automated tests, though there are tests in the read me
1564 #:make-flags (let ((out (assoc-ref %outputs "out")))
1565 (list (string-append "PREFIX=" out)
1566 (string-append "BINDIR="
1567 (string-append out "/bin"))))
1569 (modify-phases %standard-phases
1570 (add-after 'unpack 'enter-dir
1571 (lambda _ (chdir "core") #t))
1572 (add-after 'enter-dir 'patch-makefile
1574 ;; on advice from the MAFFT authors, there is no need to
1575 ;; distribute mafft-profile, mafft-distance, or
1576 ;; mafft-homologs.rb as they are too "specialised".
1577 (substitute* "Makefile"
1578 ;; remove mafft-homologs.rb from SCRIPTS
1579 (("^SCRIPTS = mafft mafft-homologs.rb")
1581 ;; remove mafft-distance from PROGS
1582 (("^PROGS = dvtditr dndfast7 dndblast sextet5 mafft-distance")
1583 "PROGS = dvtditr dndfast7 dndblast sextet5")
1584 ;; remove mafft-profile from PROGS; the pattern must name f2cl exactly
1584 ;; as the replacement does, otherwise it never matches
1585 (("splittbfast disttbfast tbfast mafft-profile f2cl mccaskillwrap")
1586 "splittbfast disttbfast tbfast f2cl mccaskillwrap")
1587 (("^rm -f mafft-profile mafft-profile.exe") "#")
1588 (("^rm -f mafft-distance mafft-distance.exe") "#")
1589 ;; do not install MAN pages in libexec folder
1590 (("^\t\\$\\(INSTALL\\) -m 644 \\$\\(MANPAGES\\) \
1591 \\$\\(DESTDIR\\)\\$\\(LIBDIR\\)") "#"))
1593 (delete 'configure))))
1596 (home-page "http://mafft.cbrc.jp/alignment/software/")
1597 (synopsis "Multiple sequence alignment program")
1599 "MAFFT offers a range of multiple alignment methods for nucleotide and
1600 protein sequences. For instance, it offers L-INS-i (accurate; for alignment
1601 of <~200 sequences) and FFT-NS-2 (fast; for alignment of <~30,000
1603 (license (license:non-copyleft
1604 "http://mafft.cbrc.jp/alignment/software/license.txt"
1605 "BSD-3 with different formatting"))))
;; MetaBAT package.  The build is driven by scons from a custom phase that
;; creates the output directory and hands the locations of the 'out'
;; output and of the htslib, samtools and boost inputs to scons.  Check and
;; install are carried out during that phase, so the standard 'install
;; phase is deleted.  Two phases added after 'unpack adjust the samtools
;; header paths (bam/ becomes samtools/) in SConstruct, src/BamUtils.h and
;; src/KseqReader.h, and patch the SConstruct line that installs
;; README.md so that the README is not distributed.
1607 (define-public metabat
1614 "https://bitbucket.org/berkeleylab/metabat/get/"
1615 version ".tar.bz2"))
1616 (file-name (string-append name "-" version ".tar.bz2"))
1619 "0vgrhbaxg4dkxyax2kbigak7w0arhqvw0szwp6gd9wmyilc44kfa"))))
1620 (build-system gnu-build-system)
1623 (modify-phases %standard-phases
1624 (add-after 'unpack 'fix-includes
1626 (substitute* "SConstruct"
1627 (("/include/bam/bam.h")
1628 "/include/samtools/bam.h"))
1629 (substitute* "src/BamUtils.h"
1630 (("^#include \"bam/bam\\.h\"")
1631 "#include \"samtools/bam.h\"")
1632 (("^#include \"bam/sam\\.h\"")
1633 "#include \"samtools/sam.h\""))
1634 (substitute* "src/KseqReader.h"
1635 (("^#include \"bam/kseq\\.h\"")
1636 "#include \"samtools/kseq.h\""))
1638 (add-after 'unpack 'fix-scons
1640 (substitute* "SConstruct" ; Do not distribute README
1641 (("^env\\.Install\\(idir_prefix, 'README\\.md'\\)")
1646 (lambda* (#:key inputs outputs #:allow-other-keys)
1647 (mkdir (assoc-ref outputs "out"))
1648 (zero? (system* "scons"
1651 (assoc-ref outputs "out"))
1654 (assoc-ref inputs "htslib"))
1657 (assoc-ref inputs "samtools"))
1660 (assoc-ref inputs "boost"))
1662 ;; check and install carried out during build phase
1664 (delete 'install))))
1668 ("samtools" ,samtools)
1672 `(("scons" ,scons)))
1673 (home-page "https://bitbucket.org/berkeleylab/metabat")
1675 "Reconstruction of single genomes from complex microbial communities")
1677 "Grouping large genomic fragments assembled from shotgun metagenomic
1678 sequences to deconvolute complex microbial communities, or metagenome binning,
1679 enables the study of individual organisms and their interactions. MetaBAT is
1680 an automated metagenome binning software, which integrates empirical
1681 probabilistic distances of genome abundance and tetranucleotide frequency.")
1682 (license (license:non-copyleft "file://license.txt"
1683 "See license.txt in the distribution."))))
;; MISO package from PyPI (misopy), built with python-build-system against
;; Python 2; there is no test target.  The origin snippet patches setup.py
;; to use setuptools instead of distutils.core and to compile and link
;; with gcc rather than cc.
1692 "https://pypi.python.org/packages/source/m/misopy/misopy-"
1696 "0x446867az8ir0z8c1vjqffkp0ma37wm4sylixnkhgawllzx8v5w"))
1697 (modules '((guix build utils)))
1699 '(substitute* "setup.py"
1700 ;; Use setuptools, or else the executables are not
1702 (("distutils.core") "setuptools")
1703 ;; use "gcc" instead of "cc" for compilation
1705 "cc.set_executables(
1709 linker_so='gcc -shared'); defines")))))
1710 (build-system python-build-system)
1712 `(#:python ,python-2 ; only Python 2 is supported
1713 #:tests? #f)) ; no "test" target
1715 `(("samtools" ,samtools)
1716 ("python-numpy" ,python2-numpy)
1717 ("python-pysam" ,python2-pysam)
1718 ("python-scipy" ,python2-scipy)
1719 ("python-matplotlib" ,python2-matplotlib)))
1721 `(("python-mock" ,python2-mock) ;for tests
1722 ("python-pytz" ,python2-pytz) ;for tests
1723 ("python-setuptools" ,python2-setuptools)))
1724 (home-page "http://genes.mit.edu/burgelab/miso/index.html")
1725 (synopsis "Mixture of Isoforms model for RNA-Seq isoform quantitation")
1727 "MISO (Mixture-of-Isoforms) is a probabilistic framework that quantitates
1728 the expression level of alternatively spliced genes from RNA-Seq data, and
1729 identifies differentially regulated isoforms or exons across samples. By
1730 modeling the generative process by which reads are produced from isoforms in
1731 RNA-Seq, the MISO model uses Bayesian inference to compute the probability
1732 that a read originated from a particular isoform.")
1733 (license license:gpl2)))
;; OrfM package: release tarball from GitHub, built with gnu-build-system;
;; zlib is its only listed input.
1742 "https://github.com/wwood/OrfM/releases/download/v"
1743 version "/orfm-" version ".tar.gz"))
1746 "05fmw145snk646ly076zby0fjav0k7ysbclck5d4s9pmgcfpijc2"))))
1747 (build-system gnu-build-system)
1748 (inputs `(("zlib" ,zlib)))
1749 (synopsis "Simple and not slow open reading frame (ORF) caller")
1751 "An ORF caller finds stretches of DNA that when translated are not
1752 interrupted by stop codons. OrfM finds and prints these ORFs.")
1753 (home-page "https://github.com/wwood/OrfM")
1754 (license license:lgpl3+)))
;; pbcore package for Python 2 (pbcore requires Python 2.7), fetched from a
;; GitHub archive and built with python-build-system.  It references the
;; Python 2 variants of cython, numpy, pysam, h5py and setuptools.
1756 (define-public python2-pbcore
1758 (name "python2-pbcore")
1763 "https://github.com/PacificBiosciences/pbcore/archive/"
1765 (file-name (string-append name "-" version ".tar.gz"))
1768 "1z46rwjac93jm87cbj2zgjg6qvsgs65140wkbbxsvxps7ai4pm09"))))
1769 (build-system python-build-system)
1770 (arguments `(#:python ,python-2)) ; pbcore requires Python 2.7
1772 `(("python-cython" ,python2-cython)
1773 ("python-numpy" ,python2-numpy)
1774 ("python-pysam" ,python2-pysam)
1775 ("python-h5py" ,python2-h5py)))
1777 `(("python-setuptools" ,python2-setuptools)))
1778 (home-page "http://pacificbiosciences.github.io/pbcore/")
1779 (synopsis "Library for reading and writing PacBio data files")
1781 "The pbcore package provides Python APIs for interacting with PacBio data
1782 files and writing bioinformatics applications.")
1783 (license license:bsd-3)))
;; WarpedLMM package for Python 2.7 from PyPI, built with
;; python-build-system.  A 'remove-bin-directory phase registered relative
;; to 'install deletes out/bin, which only contains wrappers for running
;; the module tests.
1785 (define-public python2-warpedlmm
1787 (name "python2-warpedlmm")
1793 "https://pypi.python.org/packages/source/W/WarpedLMM/WarpedLMM-"
1797 "1agfz6zqa8nc6cw47yh0s3y14gkpa9wqazwcj7mwwj3ffnw39p3j"))))
1798 (build-system python-build-system)
1800 `(#:python ,python-2 ; requires Python 2.7
1802 (modify-phases %standard-phases
1804 'install 'remove-bin-directory
1805 (lambda* (#:key outputs #:allow-other-keys)
1806 ;; The "bin" directory only contains wrappers for running
1807 ;; the module tests. They are not needed after the
1809 (delete-file-recursively
1810 (string-append (assoc-ref outputs "out") "/bin"))
1813 `(("python-scipy" ,python2-scipy)
1814 ("python-numpy" ,python2-numpy)
1815 ("python-matplotlib" ,python2-matplotlib)
1816 ("python-fastlmm" ,python2-fastlmm)
1817 ("python-pandas" ,python2-pandas)
1818 ("python-pysnptools" ,python2-pysnptools)))
1820 `(("python-setuptools" ,python2-setuptools)
1821 ("python-mock" ,python2-mock)
1822 ("python-nose" ,python2-nose)
1824 (home-page "https://github.com/PMBio/warpedLMM")
1825 (synopsis "Implementation of warped linear mixed models")
1827 "WarpedLMM is a Python implementation of the warped linear mixed model,
1828 which automatically learns an optimal warping function (or transformation) for
1829 the phenotype as it models the data.")
1830 (license license:asl2.0)))
;; pbtranscript-tofu package, pinned to commit c7bbd5472 of the cDNA_primer
;; repository (the commit is appended to the version string) and built
;; with python-build-system against Python 2.  Extra setup.py flags keep
;; the install phase from creating a zipped egg.  A phase registered
;; relative to 'unpack enters pbtranscript-tofu/pbtranscript/, deletes
;; bundled build artefacts, eggs, archives, setup.cfg and all .so files,
;; and sets mode 755 on every .py file so that they are writable for the
;; install phase.
;; NOTE(review): the file-name ends in .tar.gz although the source is
;; fetched from a git URL -- presumably a checkout, not a tarball; confirm.
1832 (define-public pbtranscript-tofu
1833 (let ((commit "c7bbd5472"))
1835 (name "pbtranscript-tofu")
1836 (version (string-append "0.4.1." commit))
1840 (url "https://github.com/PacificBiosciences/cDNA_primer.git")
1842 (file-name (string-append name "-" version ".tar.gz"))
1845 "148xkzi689c49g6fdhckp6mnmj2qhjdf1j4wifm6ja7ij95d7fxx"))))
1846 (build-system python-build-system)
1848 `(#:python ,python-2
1849 ;; With standard flags, the install phase attempts to create a zip'd
1850 ;; egg file, and fails with an error: 'ZIP does not support timestamps
1852 #:configure-flags '("--single-version-externally-managed"
1853 "--record=pbtranscript-tofu.txt")
1856 'unpack 'enter-directory-and-clean-up
1858 (chdir "pbtranscript-tofu/pbtranscript/")
1860 (delete-file-recursively "dist/")
1861 (delete-file-recursively "build/")
1862 (delete-file-recursively "setuptools_cython-0.2.1-py2.6.egg/")
1863 (delete-file-recursively "pbtools.pbtranscript.egg-info")
1864 (delete-file "Cython-0.20.1.tar.gz")
1865 (delete-file "setuptools_cython-0.2.1-py2.7.egg")
1866 (delete-file "setuptools_cython-0.2.1.tar.gz")
1867 (delete-file "setup.cfg")
1868 (for-each delete-file
1869 (find-files "." "\\.so$"))
1870 ;; files should be writable for install phase
1871 (for-each (lambda (f) (chmod f #o755))
1872 (find-files "." "\\.py$")))
1875 `(("python-cython" ,python2-cython)
1876 ("python-numpy" ,python2-numpy)
1877 ("python-bx-python" ,python2-bx-python)
1878 ("python-networkx" ,python2-networkx)
1879 ("python-scipy" ,python2-scipy)
1880 ("python-pbcore" ,python2-pbcore)))
1882 `(("python-nose" ,python2-nose)
1883 ("python-setuptools" ,python2-setuptools)))
1884 (home-page "https://github.com/PacificBiosciences/cDNA_primer")
1885 (synopsis "Analyze transcriptome data generated with the Iso-Seq protocol")
1887 "pbtranscript-tofu contains scripts to analyze transcriptome data
1888 generated using the PacBio Iso-Seq protocol.")
1889 (license license:bsd-3))))
;; Prodigal package, fetched from a GitHub archive and built with
;; gnu-build-system.  There is no check target and the 'configure phase is
;; deleted; INSTALLDIR is passed to make and is derived from the 'out'
;; output.
1891 (define-public prodigal
1898 "https://github.com/hyattpd/Prodigal/archive/v"
1900 (file-name (string-append name "-" version ".tar.gz"))
1903 "0m8sb0fg6lmxrlpzna0am6svbnlmd3dckrhgzxxgb3gxr5fyj284"))))
1904 (build-system gnu-build-system)
1906 `(#:tests? #f ;no check target
1907 #:make-flags (list (string-append "INSTALLDIR="
1908 (assoc-ref %outputs "out")
1911 (modify-phases %standard-phases
1912 (delete 'configure))))
1913 (home-page "http://prodigal.ornl.gov")
1914 (synopsis "Protein-coding gene prediction for Archaea and Bacteria")
1916 "Prodigal runs smoothly on finished genomes, draft genomes, and
1917 metagenomes, providing gene predictions in GFF3, Genbank, or Sequin table
1918 format. It runs quickly, in an unsupervised fashion, handles gaps, handles
1919 partial genes, and identifies translation initiation sites.")
1920 (license license:gpl3+)))
;; RSEM package, built with gnu-build-system (no 'check' target, no
;; configure script).  The origin applies rsem-makefile.patch and deletes
;; the bundled boost copy.  The Makefile is patched so that the bundled
;; samtools library is not built; the samtools-0.1 package is referenced
;; instead.  The install procedure copies the rsem-* files into out/bin/
;; and rsem_perl_utils.pm into out/lib/perl5/site_perl, and a
;; 'wrap-program phase registered relative to 'install prefixes PERL5LIB
;; with that directory for the listed scripts.
1930 (string-append "http://deweylab.biostat.wisc.edu/rsem/src/rsem-"
1933 (base32 "0nzdc0j0hjllhsd5f2xli95dafm3nawskigs140xzvjk67xh0r9q"))
1934 (patches (list (search-patch "rsem-makefile.patch")))
1935 (modules '((guix build utils)))
1938 ;; remove bundled copy of boost
1939 (delete-file-recursively "boost")
1941 (build-system gnu-build-system)
1943 `(#:tests? #f ;no "check" target
1945 (modify-phases %standard-phases
1946 ;; No "configure" script.
1947 ;; Do not build bundled samtools library.
1950 (substitute* "Makefile"
1951 (("^all : sam/libbam.a") "all : "))
1954 (lambda* (#:key outputs #:allow-other-keys)
1955 (let* ((out (assoc-ref outputs "out"))
1956 (bin (string-append out "/bin/"))
1957 (perl (string-append out "/lib/perl5/site_perl")))
1960 (for-each (lambda (file)
1962 (string-append bin (basename file))))
1963 (find-files "." "rsem-.*"))
1964 (copy-file "rsem_perl_utils.pm"
1965 (string-append perl "/rsem_perl_utils.pm")))
1968 'install 'wrap-program
1969 (lambda* (#:key outputs #:allow-other-keys)
1970 (let ((out (assoc-ref outputs "out")))
1971 (for-each (lambda (prog)
1972 (wrap-program (string-append out "/bin/" prog)
1973 `("PERL5LIB" ":" prefix
1974 (,(string-append out "/lib/perl5/site_perl")))))
1975 '("rsem-plot-transcript-wiggles"
1976 "rsem-calculate-expression"
1977 "rsem-generate-ngvector"
1979 "rsem-prepare-reference")))
1983 ("ncurses" ,ncurses)
1986 ("samtools" ,samtools-0.1)
1988 (home-page "http://deweylab.biostat.wisc.edu/rsem/")
1989 (synopsis "Estimate gene expression levels from RNA-Seq data")
1991 "RSEM is a software package for estimating gene and isoform expression
1992 levels from RNA-Seq data. The RSEM package provides a user-friendly
1993 interface, supports threads for parallel computation of the EM algorithm,
1994 single-end and paired-end read data, quality scores, variable-length reads and
1995 RSPD estimation. In addition, it provides posterior mean and 95% credibility
1996 interval estimates for expression levels. For visualization, it can generate
1997 BAM and Wiggle files in both transcript-coordinate and genomic-coordinate.")
1998 (license license:gpl3+)))
;; RSeQC package from SourceForge, built with python-build-system against
;; Python 2.  The origin snippet deletes the bundled pysam copy and
;; patches setup.py: the distribute_setup bootstrap lines are blanked and
;; have_pysam is forced to True so that the bundled pysam is not used.
2000 (define-public rseqc
2008 (string-append "mirror://sourceforge/rseqc/"
2009 version "/RSeQC-" version ".tar.gz"))
2011 (base32 "15ly0254yi032qzkdplg00q144qfdsd986gh62829rl5bkxhj330"))
2012 (modules '((guix build utils)))
2015 ;; remove bundled copy of pysam
2016 (delete-file-recursively "lib/pysam")
2017 (substitute* "setup.py"
2018 ;; remove dependency on outdated "distribute" module
2019 (("^from distribute_setup import use_setuptools") "")
2020 (("^use_setuptools\\(\\)") "")
2021 ;; do not use bundled copy of pysam
2022 (("^have_pysam = False") "have_pysam = True"))))))
2023 (build-system python-build-system)
2024 (arguments `(#:python ,python-2))
2026 `(("python-cython" ,python2-cython)
2027 ("python-pysam" ,python2-pysam)
2028 ("python-numpy" ,python2-numpy)
2029 ("python-setuptools" ,python2-setuptools)
2032 `(("python-nose" ,python2-nose)))
2033 (home-page "http://rseqc.sourceforge.net/")
2034 (synopsis "RNA-seq quality control package")
2036 "RSeQC provides a number of modules that can comprehensively evaluate
2037 high throughput sequence data, especially RNA-seq data. Some basic modules
2038 inspect sequence quality, nucleotide composition bias, PCR bias and GC bias,
2039 while RNA-seq specific modules evaluate sequencing saturation, mapped reads
2040 distribution, coverage uniformity, strand specificity, etc.")
2041 (license license:gpl3+)))
;; Samtools package from SourceForge, built with gnu-build-system; the
;; 'configure phase is removed.  Make flags select -lncurses and set
;; prefix to the 'out' output.  Extra phases patch test/test.pl (bash
;; location taken from the bash input, and test_usage_subcommand skipped
;; for the stats subcommand), install libbam.a into out/lib, and copy the
;; .h files of the current directory into out/include/samtools/.
;; NOTE(review): in the #:tests? expression (%current-system) comes first
;; inside 'or', so (%current-target-system) is presumably never consulted;
;; confirm that this ordering is intended.
2043 (define-public samtools
2051 (string-append "mirror://sourceforge/samtools/"
2052 version "/samtools-" version ".tar.bz2"))
2055 "1akdqb685pk9xk1nb6sa9aq8xssjjhvvc06kp4cpdqvz2157l3j2"))))
2056 (build-system gnu-build-system)
2058 `(;; There are 87 test failures when building on non-64-bit architectures
2059 ;; due to invalid test data. This has since been fixed upstream (see
2060 ;; <https://github.com/samtools/samtools/pull/307>), but as there has
2061 ;; not been a new release we disable the tests for all non-64-bit
2063 #:tests? ,(string=? (or (%current-system) (%current-target-system))
2065 #:modules ((ice-9 ftw)
2067 (guix build gnu-build-system)
2069 #:make-flags (list "LIBCURSES=-lncurses"
2070 (string-append "prefix=" (assoc-ref %outputs "out")))
2075 (lambda* (#:key inputs #:allow-other-keys)
2076 (let ((bash (assoc-ref inputs "bash")))
2077 (substitute* "test/test.pl"
2078 ;; The test script calls out to /bin/bash
2080 (string-append bash "/bin/bash"))
2081 ;; There are two failing tests upstream relating to the "stats"
2082 ;; subcommand in test_usage_subcommand ("did not have Usage"
2083 ;; and "usage did not mention samtools stats"), so we disable
2085 (("(test_usage_subcommand\\(.*\\);)" cmd)
2086 (string-append "unless ($subcommand eq 'stats') {" cmd "};")))))
2088 'install 'install-library
2089 (lambda* (#:key outputs #:allow-other-keys)
2090 (let ((lib (string-append (assoc-ref outputs "out") "/lib")))
2092 (copy-file "libbam.a" (string-append lib "/libbam.a"))))
2094 'install 'install-headers
2095 (lambda* (#:key outputs #:allow-other-keys)
2096 (let ((include (string-append (assoc-ref outputs "out")
2097 "/include/samtools/")))
2099 (for-each (lambda (file)
2100 (copy-file file (string-append include
2102 (scandir "." (lambda (name) (string-match "\\.h$" name))))
2104 (alist-delete 'configure %standard-phases))))))
2105 (native-inputs `(("pkg-config" ,pkg-config)))
2106 (inputs `(("ncurses" ,ncurses)
2110 (home-page "http://samtools.sourceforge.net")
2111 (synopsis "Utilities to efficiently manipulate nucleotide sequence alignments")
2113 "Samtools implements various utilities for post-processing nucleotide
2114 sequence alignments in the SAM, BAM, and CRAM formats, including indexing,
2115 variant calling (in conjunction with bcftools), and a simple alignment
2117 (license license:expat)))
;; Samtools 0.1.x package, inheriting from samtools.  The inherited
;; arguments are adjusted: tests are disabled (no 'check' target), a phase
;; copies the samtools executable into out/bin, and the inherited
;; 'patch-tests phase is deleted.
2119 (define-public samtools-0.1
2120 ;; This is the most recent version of the 0.1 line of samtools. Its input
2121 ;; and output formats differ greatly from those used and produced by
2122 ;; samtools 1.x, and it is still used in many bioinformatics pipelines.
2123 (package (inherit samtools)
2129 (string-append "mirror://sourceforge/samtools/"
2130 version "/samtools-" version ".tar.bz2"))
2132 (base32 "1m33xsfwz0s8qi45lylagfllqg7fphf4dr0780rsvw75av9wk06h"))))
2134 (substitute-keyword-arguments (package-arguments samtools)
2135 ((#:tests? tests) #f) ;no "check" target
2137 `(modify-phases ,phases
2139 (lambda* (#:key outputs #:allow-other-keys)
2140 (let ((bin (string-append
2141 (assoc-ref outputs "out") "/bin")))
2143 (copy-file "samtools"
2144 (string-append bin "/samtools")))))
2145 (delete 'patch-tests)))))))
;; NGS SDK package, fetched from a GitHub archive and built with
;; gnu-build-system.  Parallel builds are not supported and there is no
;; 'check' target.  The configure procedure runs ./configure directly with
;; only --build-prefix (a build directory under the current directory) and
;; --prefix (the 'out' output), because the script does not recognize
;; options such as --enable-fast-install.  Another phase enters the
;; ngs-sdk subdirectory.
2147 (define-public ngs-sdk
2155 (string-append "https://github.com/ncbi/ngs/archive/"
2157 (file-name (string-append name "-" version ".tar.gz"))
2160 "1x58gpm574n0xmk2a98gmikbgycq78ia0bvnb42k5ck34fmd5v8y"))))
2161 (build-system gnu-build-system)
2163 `(#:parallel-build? #f ; not supported
2164 #:tests? #f ; no "check" target
2168 (lambda* (#:key outputs #:allow-other-keys)
2169 (let ((out (assoc-ref outputs "out")))
2170 ;; The 'configure' script doesn't recognize things like
2171 ;; '--enable-fast-install'.
2172 (zero? (system* "./configure"
2173 (string-append "--build-prefix=" (getcwd) "/build")
2174 (string-append "--prefix=" out)))))
2177 (lambda _ (chdir "ngs-sdk") #t)
2178 %standard-phases))))
2179 (native-inputs `(("perl" ,perl)))
2180 (home-page "https://github.com/ncbi/ngs")
2181 (synopsis "API for accessing Next Generation Sequencing data")
2183 "NGS is a domain-specific API for accessing reads, alignments and pileups
2184 produced from Next Generation Sequencing. The API itself is independent from
2185 any particular back-end implementation, and supports use of multiple back-ends
2187 (license license:public-domain)))
;; Java bindings for the NGS SDK, inheriting from ngs-sdk.  The inherited
;; arguments are adjusted so that the 'enter-dir phase enters ngs-java,
;; and a 'fix-java-symlink-installation phase registered relative to
;; 'enter-dir rewrites a $(subst ...) call on a VERSION variable in
;; Makefile.java into a $(patsubst ...) call, so that only the version
;; suffix is replaced.  It references the jdk output of icedtea6 and
;; ngs-sdk.
2189 (define-public ngs-java
2190 (package (inherit ngs-sdk)
2193 `(,@(substitute-keyword-arguments
2194 `(#:modules ((guix build gnu-build-system)
2198 ,@(package-arguments ngs-sdk))
2201 'enter-dir 'fix-java-symlink-installation
2203 ;; Only replace the version suffix, not the version number in
2204 ;; the directory name. Reported here:
2205 ;; https://github.com/ncbi/ngs/pull/4
2206 (substitute* "Makefile.java"
2207 (((string-append "\\$\\(subst "
2208 "(\\$\\(VERSION[^\\)]*\\)),"
2209 "(\\$\\([^\\)]+\\)),"
2210 "(\\$\\([^\\)]+\\)|\\$\\@)"
2212 _ pattern replacement target)
2213 (string-append "$(patsubst "
2218 'enter-dir (lambda _ (chdir "ngs-java") #t)
2221 `(("jdk" ,icedtea6 "jdk")
2222 ("ngs-sdk" ,ngs-sdk)))
2223 (synopsis "Java bindings for NGS SDK")))
;; ncbi-vdb: "Database engine for genetic information", built with
;; gnu-build-system from an archive under
;; https://github.com/ncbi/ncbi-vdb/archive/.  Parallel builds and tests are
;; switched off.
;;
;; The configure lambda patches the sources before configuring:
;;   * "setup/konfigure.perl": a "$(subst <VERSION var>,...)" call is rewritten
;;     to start with "$(patsubst " so that only the version suffix is replaced
;;     (fixed upstream in commit 4dbba5c6a809, not yet released);
;;   * "setup/package.prl": the libmagic include path '/usr/include' is
;;     replaced by a path built from the "libmagic" input;
;;   * "build/Makefile.install": the kdf5 static and shared libraries are
;;     added to LIBRARIES_TO_INSTALL because sra-tools needs them.
;; It then assembles the options --build-prefix, --prefix, --debug and the
;; --with-{xml2,ngs-sdk,ngs-java,hdf5}-prefix settings from the matching
;; inputs (the configure script does not recognize options such as
;; '--enable-fast-install').
;;
;; 'install-interfaces (anchored on 'install) creates "<out>/ilib", copies
;; the interface libraries from "build/ncbi-vdb/linux/gcc/<arch>", where
;; <arch> comes from system->linux-architecture, and copies the "interfaces"
;; header tree to a location under the "out" output.
;;
;; perl is a native input; the license is public domain.
2225 (define-public ncbi-vdb
2233 (string-append "https://github.com/ncbi/ncbi-vdb/archive/"
2235 (file-name (string-append name "-" version ".tar.gz"))
2238 "1cj8nk6if8sqagv20vx36v566fdvhcaadf0x1ycnbgql6chbs6vy"))))
2239 (build-system gnu-build-system)
2241 `(#:parallel-build? #f ; not supported
2242 #:tests? #f ; no "check" target
2246 (lambda* (#:key inputs outputs #:allow-other-keys)
2247 (let ((out (assoc-ref outputs "out")))
2248 ;; Only replace the version suffix, not the version number in the
2249 ;; directory name; fixed in commit 4dbba5c6a809 (no release yet).
2250 (substitute* "setup/konfigure.perl"
2251 (((string-append "\\$\\(subst "
2252 "(\\$\\(VERSION[^\\)]*\\)),"
2253 "(\\$\\([^\\)]+\\)),"
2254 "(\\$\\([^\\)]+\\)|\\$\\@)"
2256 _ pattern replacement target)
2257 (string-append "$(patsubst "
2262 ;; Override include path for libmagic
2263 (substitute* "setup/package.prl"
2264 (("name => 'magic', Include => '/usr/include'")
2265 (string-append "name=> 'magic', Include => '"
2266 (assoc-ref inputs "libmagic")
2269 ;; Install kdf5 library (needed by sra-tools)
2270 (substitute* "build/Makefile.install"
2271 (("LIBRARIES_TO_INSTALL =")
2272 "LIBRARIES_TO_INSTALL = kdf5.$(VERSION_LIBX) kdf5.$(VERSION_SHLX)"))
2274 ;; The 'configure' script doesn't recognize things like
2275 ;; '--enable-fast-install'.
2278 (string-append "--build-prefix=" (getcwd) "/build")
2279 (string-append "--prefix=" (assoc-ref outputs "out"))
2280 (string-append "--debug")
2281 (string-append "--with-xml2-prefix="
2282 (assoc-ref inputs "libxml2"))
2283 (string-append "--with-ngs-sdk-prefix="
2284 (assoc-ref inputs "ngs-sdk"))
2285 (string-append "--with-ngs-java-prefix="
2286 (assoc-ref inputs "ngs-java"))
2287 (string-append "--with-hdf5-prefix="
2288 (assoc-ref inputs "hdf5"))))))
2290 'install 'install-interfaces
2291 (lambda* (#:key outputs #:allow-other-keys)
2292 ;; Install interface libraries. On i686 the interface libraries
2293 ;; are installed to "linux/gcc/i386", so we need to use the Linux
2294 ;; architecture name ("i386") instead of the target system prefix
2296 (mkdir (string-append (assoc-ref outputs "out") "/ilib"))
2297 (copy-recursively (string-append "build/ncbi-vdb/linux/gcc/"
2298 ,(system->linux-architecture
2299 (or (%current-target-system)
2302 (string-append (assoc-ref outputs "out")
2304 ;; Install interface headers
2305 (copy-recursively "interfaces"
2306 (string-append (assoc-ref outputs "out")
2308 %standard-phases))))
2310 `(("libxml2" ,libxml2)
2311 ("ngs-sdk" ,ngs-sdk)
2312 ("ngs-java" ,ngs-java)
2315 (native-inputs `(("perl" ,perl)))
2316 (home-page "https://github.com/ncbi/ncbi-vdb")
2317 (synopsis "Database engine for genetic information")
2319 "The NCBI-VDB library implements a highly compressed columnar data
2320 warehousing engine that is most often used to store genetic information.
2321 Databases are stored in a portable image within the file system, and can be
2322 accessed/downloaded on demand across HTTP.")
2323 (license license:public-domain)))
;; plink: "Whole genome association analysis toolset", built with
;; gnu-build-system from the upstream "plink-<version>-src.zip" archive with
;; "plink-1.07-unclobber-i.patch" applied.
;;   * Tests are switched off (there is no "check" target).
;;   * The make flags set LIB_LAPACK to "/lib/liblapack.so" inside the
;;     "lapack" build input; a further flag disables phoning home.
;;   * There is no "configure" script, so the standard phases are adjusted
;;     with modify-phases.
;;   * A custom lambda copies the built "plink" executable to a path derived
;;     from the "out" output.
;;   * lapack and unzip appear among the declared inputs.
;;   * Licenses: gpl2 and lgpl2.1+ (the latter applies to fisher.h only).
2325 (define-public plink
2333 "http://pngu.mgh.harvard.edu/~purcell/plink/dist/plink-"
2334 version "-src.zip"))
2336 (base32 "0as8gxm4pjyc8dxmm1sl873rrd7wn5qs0l29nqfnl31x8i467xaa"))
2337 (patches (list (search-patch "plink-1.07-unclobber-i.patch")))))
2338 (build-system gnu-build-system)
2340 '(#:tests? #f ;no "check" target
2341 #:make-flags (list (string-append "LIB_LAPACK="
2342 (assoc-ref %build-inputs "lapack")
2343 "/lib/liblapack.so")
2346 ;; disable phoning home
2349 (modify-phases %standard-phases
2350 ;; no "configure" script
2353 (lambda* (#:key outputs #:allow-other-keys)
2354 (let ((bin (string-append (assoc-ref outputs "out")
2357 (copy-file "plink" (string-append bin "plink"))
2361 ("lapack" ,lapack)))
2363 `(("unzip" ,unzip)))
2364 (home-page "http://pngu.mgh.harvard.edu/~purcell/plink/")
2365 (synopsis "Whole genome association analysis toolset")
2367 "PLINK is a whole genome association analysis toolset, designed to
2368 perform a range of basic, large-scale analyses in a computationally efficient
2369 manner. The focus of PLINK is purely on analysis of genotype/phenotype data,
2370 so there is no support for steps prior to this (e.g. study design and
2371 planning, generating genotype or CNV calls from raw data). Through
2372 integration with gPLINK and Haploview, there is some support for the
2373 subsequent visualization, annotation and storage of results.")
2374 ;; Code is released under GPLv2, except for fisher.h, which is under
2376 (license (list license:gpl2 license:lgpl2.1+))))
;; preseq: "Program for analyzing library complexity", built with
;; gnu-build-system from "preseq-<version>.tar.bz2" on smithlabresearch.org.
;;   * Source preparation: two patches are applied
;;     ("preseq-1.0.2-install-to-PREFIX.patch" and
;;     "preseq-1.0.2-link-with-libbam.patch"), and a snippet deletes the
;;     bundled copy in "preseq-master/samtools".
;;   * Tests are switched off (there is no "check" target).
;;   * Phases: the build changes into "preseq-master"; 'use-samtools-headers
;;     (anchored on 'enter-dir) rewrites references to "sam.h" into
;;     "samtools/sam.h" in smithlab_cpp/SAM.cpp and smithlab_cpp/SAM.hpp; the
;;     'configure phase is deleted.
;;   * Make flags: PREFIX is the "out" output and LIBBAM is derived from the
;;     "samtools" build input.
;;   * The input named "samtools" is bound to samtools-0.1.
;;   * License: gpl3+.
2378 (define-public preseq
2385 (string-append "http://smithlabresearch.org/downloads/preseq-"
2386 version ".tar.bz2"))
2388 (base32 "0r7sw07p6nv8ygvc17gd78lisbw5336v3vhs86b5wv8mw3pwqksc"))
2389 (patches (list (search-patch "preseq-1.0.2-install-to-PREFIX.patch")
2390 (search-patch "preseq-1.0.2-link-with-libbam.patch")))
2391 (modules '((guix build utils)))
2393 ;; Remove bundled samtools.
2394 '(delete-file-recursively "preseq-master/samtools"))))
2395 (build-system gnu-build-system)
2397 `(#:tests? #f ;no "check" target
2399 (modify-phases %standard-phases
2403 (chdir "preseq-master")
2406 'enter-dir 'use-samtools-headers
2408 (substitute* '("smithlab_cpp/SAM.cpp"
2409 "smithlab_cpp/SAM.hpp")
2410 (("sam.h") "samtools/sam.h"))
2412 (delete 'configure))
2413 #:make-flags (list (string-append "PREFIX="
2414 (assoc-ref %outputs "out"))
2415 (string-append "LIBBAM="
2416 (assoc-ref %build-inputs "samtools")
2420 ("samtools" ,samtools-0.1)
2422 (home-page "http://smithlabresearch.org/software/preseq/")
2423 (synopsis "Program for analyzing library complexity")
2425 "The preseq package is aimed at predicting and estimating the complexity
2426 of a genomic sequencing library, equivalent to predicting and estimating the
2427 number of redundant reads from a given sequencing depth and how many will be
2428 expected from additional sequencing using an initial sequencing experiment.
2429 The estimates can then be used to examine the utility of further sequencing,
2430 optimize the sequencing depth, or to screen multiple libraries to avoid low
2431 complexity samples.")
2432 (license license:gpl3+)))
;; sra-tools: "Tools and libraries for reading and writing sequencing data",
;; built with gnu-build-system from an archive under
;; https://github.com/ncbi/sra-tools/archive/.  Parallel builds and tests are
;; switched off.
;;
;; The upstream build expects the source tree and raw build output of
;; ncbi-vdb; since this package builds against an installed ncbi-vdb, the
;; configure lambda patches "setup/konfigure.perl" so that
;;   * the existence check uses ($ilibdir, $ilib) instead of ($libdir, $lib);
;;   * $ilibdir is computed from $dir rather than $builddir, i.e. interface
;;     libraries are looked up in ncbi-vdb's "ilib" directory.
;; It then assembles the options --build-prefix, --prefix, --debug,
;; --with-fuse-prefix, --with-magic-prefix, --with-ncbi-vdb-sources and
;; --with-ncbi-vdb-build (both set to the "ncbi-vdb" input),
;; --with-ngs-sdk-prefix and --with-hdf5-prefix.  The libxml2 option is left
;; commented out because linking against it fails (see the TODO).
;;
;; perl is a native input; ngs-sdk and ncbi-vdb are among the inputs; the
;; license is public domain.
2434 (define-public sra-tools
2442 (string-append "https://github.com/ncbi/sra-tools/archive/"
2444 (file-name (string-append name "-" version ".tar.gz"))
2447 "11nrnvz7a012f4iryf0wiwrid0h111grsfxbxa9j51h3f2xbvgns"))))
2448 (build-system gnu-build-system)
2450 `(#:parallel-build? #f ; not supported
2451 #:tests? #f ; no "check" target
2455 (lambda* (#:key inputs outputs #:allow-other-keys)
2456 ;; The build system expects a directory containing the sources and
2457 ;; raw build output of ncbi-vdb, including files that are not
2458 ;; installed. Since we are building against an installed version of
2459 ;; ncbi-vdb, the following modifications are needed.
2460 (substitute* "setup/konfigure.perl"
2461 ;; Make the configure script look for the "ilib" directory of
2462 ;; "ncbi-vdb" without first checking for the existence of a
2463 ;; matching library in its "lib" directory.
2464 (("^ my \\$f = File::Spec->catdir\\(\\$libdir, \\$lib\\);")
2465 "my $f = File::Spec->catdir($ilibdir, $ilib);")
2466 ;; Look for interface libraries in ncbi-vdb's "ilib" directory.
2467 (("my \\$ilibdir = File::Spec->catdir\\(\\$builddir, 'ilib'\\);")
2468 "my $ilibdir = File::Spec->catdir($dir, 'ilib');"))
2470 ;; The 'configure' script doesn't recognize things like
2471 ;; '--enable-fast-install'.
2474 (string-append "--build-prefix=" (getcwd) "/build")
2475 (string-append "--prefix=" (assoc-ref outputs "out"))
2476 (string-append "--debug")
2477 (string-append "--with-fuse-prefix="
2478 (assoc-ref inputs "fuse"))
2479 (string-append "--with-magic-prefix="
2480 (assoc-ref inputs "libmagic"))
2481 ;; TODO: building with libxml2 fails with linker errors
2482 ;; (string-append "--with-xml2-prefix="
2483 ;; (assoc-ref inputs "libxml2"))
2484 (string-append "--with-ncbi-vdb-sources="
2485 (assoc-ref inputs "ncbi-vdb"))
2486 (string-append "--with-ncbi-vdb-build="
2487 (assoc-ref inputs "ncbi-vdb"))
2488 (string-append "--with-ngs-sdk-prefix="
2489 (assoc-ref inputs "ngs-sdk"))
2490 (string-append "--with-hdf5-prefix="
2491 (assoc-ref inputs "hdf5")))))
2493 (native-inputs `(("perl" ,perl)))
2495 `(("ngs-sdk" ,ngs-sdk)
2496 ("ncbi-vdb" ,ncbi-vdb)
2501 (home-page "http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software")
2502 (synopsis "Tools and libraries for reading and writing sequencing data")
2504 "The SRA Toolkit from NCBI is a collection of tools and libraries for
2505 reading of sequencing files from the Sequence Read Archive (SRA) database and
2506 writing files into the .sra format.")
2507 (license license:public-domain)))
;; seqan: "Library for nucleotide sequence analysis" (a C++ library),
;; packaged with trivial-build-system from
;; "seqan-library-<version>.tar.bz2" on packages.seqan.de.
;;   * Two outputs, "out" and "doc", because the documentation (7.8MB) and
;;     the includes (3.6MB) are both large.
;;   * The builder imports (guix build utils), sets PATH to the bin
;;     directories of the "tar" and "bzip2" inputs, unpacks the "source"
;;     input with tar, enters "seqan-library-<version>", then copies
;;     "include" to "<out>/include" and "share" to "<doc>/share".
;;   * NOTE(review): the value returned by the (system* "tar" ...) call is
;;     discarded, so a failed extraction is not reported at that point --
;;     confirm whether that is intended.
;;   * License: bsd-3.
2509 (define-public seqan
2515 (uri (string-append "http://packages.seqan.de/seqan-library/"
2516 "seqan-library-" version ".tar.bz2"))
2519 "05s3wrrwn50f81aklfm65i4a749zag1vr8z03k21xm0pdxy47yvp"))))
2520 ;; The documentation is 7.8MB and the includes are 3.6MB heavy, so it
2521 ;; makes sense to split the outputs.
2522 (outputs '("out" "doc"))
2523 (build-system trivial-build-system)
2525 `(#:modules ((guix build utils))
2528 (use-modules (guix build utils))
2529 (let ((tar (assoc-ref %build-inputs "tar"))
2530 (bzip (assoc-ref %build-inputs "bzip2"))
2531 (out (assoc-ref %outputs "out"))
2532 (doc (assoc-ref %outputs "doc")))
2533 (setenv "PATH" (string-append tar "/bin:" bzip "/bin"))
2534 (system* "tar" "xvf" (assoc-ref %build-inputs "source"))
2535 (chdir (string-append "seqan-library-" ,version))
2536 (copy-recursively "include" (string-append out "/include"))
2537 (copy-recursively "share" (string-append doc "/share"))))))
2539 `(("source" ,source)
2542 (home-page "http://www.seqan.de")
2543 (synopsis "Library for nucleotide sequence analysis")
2545 "SeqAn is a C++ library of efficient algorithms and data structures for
2546 the analysis of sequences with the focus on biological data. It contains
2547 algorithms and data structures for string representation and their
2548 manipulation, online and indexed string search, efficient I/O of
2549 bioinformatics file formats, sequence alignment, and more.")
2550 (license license:bsd-3)))
;; STAR ("Universal RNA-seq aligner"): the source is an archive under
;; https://github.com/alexdobin/STAR/archive/ and is built with
;; gnu-build-system.
;;   * A source snippet replaces "/bin/rm" with "rm" in "source/Makefile".
;;   * Tests are switched off and make is invoked with the single flag "STAR".
;;   * 'enter-source-dir (anchored on 'unpack) changes into "source".
;;     NOTE(review): this lambda returns the value of chdir and has no
;;     explicit #t -- confirm the phase result is treated as success.
;;   * A custom lambda copies the "STAR" executable to "<out>/bin/STAR".
;;   * vim is an input because it provides xxd.
;;   * STAR is GPLv3 or later; htslib is MIT-licensed.  The license field
;;     records gpl3+.
2559 "https://github.com/alexdobin/STAR/archive/STAR_"
2563 "1c3rnm7r5l0kl3d04gl1g7938xqf1c2l0mla87rlplqg1hcns5mc"))
2564 (modules '((guix build utils)))
2566 '(substitute* "source/Makefile"
2567 (("/bin/rm") "rm")))))
2568 (build-system gnu-build-system)
2570 '(#:tests? #f ;no check target
2571 #:make-flags '("STAR")
2574 'unpack 'enter-source-dir (lambda _ (chdir "source"))
2577 (lambda* (#:key outputs #:allow-other-keys)
2578 (let ((bin (string-append (assoc-ref outputs "out") "/bin/")))
2580 (copy-file "STAR" (string-append bin "STAR"))))
2582 'configure %standard-phases)))))
2584 `(("vim" ,vim))) ; for xxd
2587 (home-page "https://github.com/alexdobin/STAR")
2588 (synopsis "Universal RNA-seq aligner")
2590 "The Spliced Transcripts Alignment to a Reference (STAR) software is
2591 based on a previously undescribed RNA-seq alignment algorithm that uses
2592 sequential maximum mappable seed search in uncompressed suffix arrays followed
2593 by seed clustering and stitching procedure. In addition to unbiased de novo
2594 detection of canonical junctions, STAR can discover non-canonical splices and
2595 chimeric (fusion) transcripts, and is also capable of mapping full-length RNA
2597 ;; STAR is licensed under GPLv3 or later; htslib is MIT-licensed.
2598 (license license:gpl3+)))
;; subread: "Tool kit for processing next-gen sequencing data", version
;; 1.4.6-p2, built with gnu-build-system from
;; "subread-<version>-source.tar.gz" on the SourceForge mirror.
;;   * Tests are switched off (there is no "check" target).
;;   * make is invoked with "-f Makefile.Linux".
;;   * One lambda changes into the "src" directory and returns #t.
;;   * A custom lambda copies the sibling "../bin" directory recursively to
;;     "<out>/bin/".
;;   * The 'configure phase is removed from %standard-phases because there is
;;     no "configure" script.
;;   * zlib is the only input; the license is gpl3+.
2600 (define-public subread
2603 (version "1.4.6-p2")
2607 "mirror://sourceforge/subread/subread-"
2608 version "-source.tar.gz"))
2611 "06sv9mpcsdj6p68y15d6gi70lca3lxmzk0dn61hg0kfsa7rxmsr3"))))
2612 (build-system gnu-build-system)
2614 `(#:tests? #f ;no "check" target
2615 #:make-flags '("-f" "Makefile.Linux")
2619 (lambda _ (chdir "src") #t)
2622 (lambda* (#:key outputs #:allow-other-keys)
2623 (let ((bin (string-append (assoc-ref outputs "out") "/bin/")))
2625 (copy-recursively "../bin" bin)))
2626 ;; no "configure" script
2627 (alist-delete 'configure %standard-phases)))))
2628 (inputs `(("zlib" ,zlib)))
2629 (home-page "http://bioinf.wehi.edu.au/subread-package/")
2630 (synopsis "Tool kit for processing next-gen sequencing data")
2632 "The subread package contains the following tools: subread aligner, a
2633 general-purpose read aligner; subjunc aligner: detecting exon-exon junctions
2634 and mapping RNA-seq reads; featureCounts: counting mapped reads for genomic
2635 features; exactSNP: a SNP caller that discovers SNPs by testing signals
2636 against local background noises.")
2637 (license license:gpl3+)))
;; vcftools: "Tools for working with VCF files", built with gnu-build-system
;; from a "vcftools_..." archive on the SourceForge mirror.
;;   * Tests are switched off (there is no "check" target).
;;   * Make flags: "CFLAGS=-O2" overrides the upstream "-m64" flag; PREFIX is
;;     the "out" output and MANDIR is a path below it.
;;   * 'patch-manpage-install (anchored on 'unpack) edits "Makefile" so the
;;     manual page is copied from "./cpp/vcftools.1" rather than from
;;     "${PREFIX}/cpp/vcftools.1".
;;   * The 'configure phase is removed from %standard-phases.
;;   * License: lgpl3, as declared in the README and on the project's license
;;     page.
2639 (define-public vcftools
2646 "mirror://sourceforge/vcftools/vcftools_"
2650 "148al9h7f8g8my2qdnpax51kdd2yjrivlx6frvakf4lz5r8j88wx"))))
2651 (build-system gnu-build-system)
2653 `(#:tests? #f ; no "check" target
2655 "CFLAGS=-O2" ; override "-m64" flag
2656 (string-append "PREFIX=" (assoc-ref %outputs "out"))
2657 (string-append "MANDIR=" (assoc-ref %outputs "out")
2661 'unpack 'patch-manpage-install
2663 (substitute* "Makefile"
2664 (("cp \\$\\{PREFIX\\}/cpp/vcftools.1") "cp ./cpp/vcftools.1")))
2665 (alist-delete 'configure %standard-phases))))
2669 (home-page "http://vcftools.sourceforge.net/")
2670 (synopsis "Tools for working with VCF files")
2672 "VCFtools is a program package designed for working with VCF files, such
2673 as those generated by the 1000 Genomes Project. The aim of VCFtools is to
2674 provide easily accessible methods for working with complex genetic variation
2675 data in the form of VCF files.")
2676 ;; The license is declared as LGPLv3 in the README and
2677 ;; at http://vcftools.sourceforge.net/license.html
2678 (license license:lgpl3)))
;; bio-locus: "Tool for fast querying of genome locations".  The source is
;; the "bio-locus" gem, located with rubygems-uri, and the package is built
;; with ruby-build-system.  ruby-rspec is the one dependency visible in the
;; definition (presumably needed for the test suite -- confirm).  The license
;; is expat.
2680 (define-public bio-locus
2687 (uri (rubygems-uri "bio-locus" version))
2690 "02vmrxyimkj9sahsp4zhfhnmbvz6dbbqz1y01vglf8cbwvkajfl0"))))
2691 (build-system ruby-build-system)
2693 `(("ruby-rspec" ,ruby-rspec)))
2694 (synopsis "Tool for fast querying of genome locations")
2696 "Bio-locus is a tabix-like tool for fast querying of genome
2697 locations. Many file formats in bioinformatics contain records that
2698 start with a chromosome name and a position for a SNP, or a start-end
2699 position for indels. Bio-locus allows users to store this chr+pos or
2700 chr+pos+alt information in a database.")
2701 (home-page "https://github.com/pjotrp/bio-locus")
2702 (license license:expat)))
;; bioruby: "Ruby library, shell and utilities for bioinformatics".  The
;; source is the "bio" gem, located with rubygems-uri, and the package is
;; built with ruby-build-system.
;;   * Dependencies visible in the definition: ruby-libxml, and which (required for the
;;     test phase).
;;   * 'patch-test-command runs before 'build and rewrites
;;     "test/functional/bio/test_command.rb" so that "/bin/sh", "/bin/ls",
;;     "which" and "/bin/echo" are replaced by the paths that (which ...)
;;     finds at build time; "/bin/echo" is also replaced in
;;     "test/data/command/echoarg2.sh".
;;   * NOTE(review): the file list of the last substitute* has a comma after
;;     its first string.  The reader treats "," as unquote, so this only
;;     yields the intended two-element list if the enclosing form is
;;     quasiquoted -- confirm, and consider dropping the comma.
;;   * Licenses: ruby for the code, lgpl2.1+ for setup, gpl2+ for the scripts
;;     in samples.
2704 (define-public bioruby
2711 (uri (rubygems-uri "bio" version))
2714 "01k2fyjl5fpx4zn8g6gqiqvsg2j1fgixrs9p03vzxckynxdq3wmc"))))
2715 (build-system ruby-build-system)
2717 `(("ruby-libxml" ,ruby-libxml)))
2719 `(("which" ,which))) ; required for test phase
2722 (modify-phases %standard-phases
2723 (add-before 'build 'patch-test-command
2725 (substitute* '("test/functional/bio/test_command.rb")
2726 (("/bin/sh") (which "sh")))
2727 (substitute* '("test/functional/bio/test_command.rb")
2728 (("/bin/ls") (which "ls")))
2729 (substitute* '("test/functional/bio/test_command.rb")
2730 (("which") (which "which")))
2731 (substitute* '("test/functional/bio/test_command.rb",
2732 "test/data/command/echoarg2.sh")
2733 (("/bin/echo") (which "echo")))
2735 (synopsis "Ruby library, shell and utilities for bioinformatics")
2736 (description "BioRuby comes with a comprehensive set of Ruby development
2737 tools and libraries for bioinformatics and molecular biology. BioRuby has
2738 components for sequence analysis, pathway analysis, protein modelling and
2739 phylogenetic analysis; it supports many widely used data formats and provides
2740 easy access to databases, external programs and public web services, including
2741 BLAST, KEGG, GenBank, MEDLINE and GO.")
2742 (home-page "http://bioruby.org/")
2743 ;; Code is released under Ruby license, except for setup
2744 ;; (LGPLv2.1+) and scripts in samples (which have GPL2 and GPL2+)
2745 (license (list license:ruby license:lgpl2.1+ license:gpl2+ ))))