Skip to content

Commit 6bc3cb9

Browse files
authored
Merge pull request #169 from databio/dev
Dev
2 parents 3291788 + 9c6322d commit 6bc3cb9

File tree

18 files changed

+740
-125
lines changed

18 files changed

+740
-125
lines changed

.gitignore

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ _site
2020
.sass-cache
2121
_site/
2222
/_site/
23+
_build/
2324
.sass-cache/
2425
.jekyll-metadata
2526

@@ -32,3 +33,17 @@ anno/mm9_annotations.bed.gz
3233
# Tutorial files
3334
examples/data/tutorial_r1.fastq.gz
3435
examples/data/tutorial_r2.fastq.gz
36+
examples/gold_atac/metadata/distinct.bed
37+
examples/gold_atac/metadata/distinct_only.bed
38+
examples/gold_atac/metadata/gold_fseq.yaml
39+
examples/gold_atac/metadata/gold_genrich.yaml
40+
examples/gold_atac/metadata/gold_hmmratac.yaml
41+
examples/gold_atac/metadata/gold_homer.yaml
42+
examples/gold_atac/metadata/gold_picard_dedup.yaml
43+
examples/gold_atac/metadata/gold_samtools_dedup.yaml
44+
examples/test_project/test_bwa.yaml
45+
examples/test_project/test_fseq.yaml
46+
examples/test_project/test_genrich.yaml
47+
examples/test_project/test_hmmratac.yaml
48+
examples/test_project/test_homer.yaml
49+
examples/test_project/test_macs.yaml

PEP_schema.yaml

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
description: "Schema for a minimal PEP"
2+
version: "2.0.0"
3+
properties:
4+
name:
5+
type: string
6+
pattern: "^\\S*$"
7+
description: "Project name with no whitespace"
8+
config:
9+
pep_version:
10+
description: "Version of the PEP Schema this PEP follows"
11+
type: string
12+
sample_table:
13+
type: string
14+
description: "Path to the sample annotation table with one row per sample"
15+
subsample_table:
16+
type: string
17+
description: "Path to the subsample annotation table with one row per subsample and sample_name attribute matching an entry in the sample table"
18+
sample_modifiers:
19+
type: object
20+
properties:
21+
append:
22+
type: object
23+
duplicate:
24+
type: object
25+
imply:
26+
type: array
27+
items:
28+
type: object
29+
properties:
30+
if:
31+
type: object
32+
then:
33+
type: object
34+
derive:
35+
type: object
36+
properties:
37+
attributes:
38+
type: array
39+
items:
40+
type: string
41+
sources:
42+
type: object
43+
project_modifiers:
44+
type: object
45+
properties:
46+
amend:
47+
description: "Object overwriting original project attributes"
48+
type: object
49+
import:
50+
description: "List of external PEP project config files to import"
51+
type: array
52+
items:
53+
type: string
54+
required:
55+
- pep_version
56+
samples:
57+
type: array
58+
items:
59+
type: object
60+
properties:
61+
sample_name:
62+
type: string
63+
pattern: "^\\S*$"
64+
description: "Unique name of the sample with no whitespace"
65+
required:
66+
- sample_name
67+
required:
68+
- samples

docs/changelog.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@
22
All notable changes to this project will be documented in this file.
33

44

5+
## [0.9.14] -- 2021-02-05
6+
7+
### Changed
8+
- Update F-Seq to F-Seq2
9+
- Add the option to install using conda
10+
- Add script for easing installation of seqOutBias
11+
- Use https for PEP schema; list alternative local PEP schema approach
12+
513
## [0.9.13] -- 2020-12-02
614

715
### Changed

docs/files/examples/gold/gold_reports/fastqc_report_r1.html

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -226,11 +226,14 @@
226226
<body>
227227
<div id="top"></div>
228228
<div class="container">
229-
230-
<p><b>No objects to display for: <code>FastQC report r1</code></b><p>
231-
232-
<!-- Expects a 2 lists of lists: figures and links -->
233-
229+
230+
<h5>FastQC r1 reports</h5>
231+
232+
<a href='../results_pipeline/gold1/fastqc/gold1_R1_trim_fastqc.html' class="list-group-item">gold1 FastQC report r1</a>
233+
<a href='../results_pipeline/gold2/fastqc/gold2_R1_trim_fastqc.html' class="list-group-item">gold2 FastQC report r1</a>
234+
<a href='../results_pipeline/gold3/fastqc/gold3_R1_trim_fastqc.html' class="list-group-item">gold3 FastQC report r1</a>
235+
<a href='../results_pipeline/gold4/fastqc/gold4_R1_trim_fastqc.html' class="list-group-item">gold4 FastQC report r1</a>
236+
<a href='../results_pipeline/gold5/fastqc/gold5_R1_trim_fastqc.html' class="list-group-item">gold5 FastQC report r1</a>
234237

235238
</div>
236239
</body>

docs/files/examples/gold/gold_reports/fastqc_report_r2.html

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,9 +227,13 @@
227227
<div id="top"></div>
228228
<div class="container">
229229

230-
<p><b>No objects to display for: <code>FastQC report r2</code></b><p>
231-
232-
<!-- Expects a 2 lists of lists: figures and links -->
230+
<h5>FastQC r2 reports</h5>
231+
232+
<a href='../results_pipeline/gold1/fastqc/gold1_R2_trim_fastqc.html' class="list-group-item">gold1 FastQC report r2</a>
233+
<a href='../results_pipeline/gold2/fastqc/gold2_R2_trim_fastqc.html' class="list-group-item">gold2 FastQC report r2</a>
234+
<a href='../results_pipeline/gold3/fastqc/gold3_R2_trim_fastqc.html' class="list-group-item">gold3 FastQC report r2</a>
235+
<a href='../results_pipeline/gold4/fastqc/gold4_R2_trim_fastqc.html' class="list-group-item">gold4 FastQC report r2</a>
236+
<a href='../results_pipeline/gold5/fastqc/gold5_R2_trim_fastqc.html' class="list-group-item">gold5 FastQC report r2</a>
233237

234238

235239
</div>

docs/files/examples/gold/results_pipeline/gold1/fastqc/gold1_R1_trim_fastqc.html

Lines changed: 187 additions & 0 deletions
Large diffs are not rendered by default.

docs/files/examples/gold/results_pipeline/gold3/fastqc/gold3_R1_trim_fastqc.html

Lines changed: 187 additions & 0 deletions
Large diffs are not rendered by default.

docs/install.md

Lines changed: 51 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -6,84 +6,89 @@
66
git clone https://github.com/databio/pepatac.git
77
```
88

9-
## 2: Download `refgenie` assets
9+
## 2: Install required software
1010

11-
PEPATAC uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. If you haven't already, initialize a refgenie config file like this:
12-
13-
```console
14-
pip install --user refgenie
15-
export REFGENIE=your_genome_folder/genome_config.yaml
16-
refgenie init -c $REFGENIE
17-
```
11+
You have two options for software prerequisites: 1) use containers, or 2) install all prerequisites natively. If you want to use containers, you need the [multi-container environment manager, `bulker`](https://bulker.databio.org/en/latest/), and either `docker` or `singularity` -- please see instructions in [how to run PEPATAC with containers](run-container.md). Otherwise, follow these instructions to install the requirements natively:
1812

19-
Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists.
13+
### Tools
2014

21-
Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command:
15+
You will need some common bioinformatics tools installed: [bedtools (v2.25.0+)](http://bedtools.readthedocs.io/en/latest/), [bowtie2 (v2.2.9+)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [preseq (v2.0+)](http://smithlabresearch.org/software/preseq/), [samblaster (v0.1.24+)](https://github.com/GregoryFaust/samblaster), [samtools (v1.7+)](http://www.htslib.org/), [skewer (v0.1.126+)](https://github.com/relipmoc/skewer), [UCSC tools](http://hgdownload.soe.ucsc.edu/admin/exe/) (wigToBigWig, bigWigCat, bedToBigBed), [pigz (v2.3.4+)](https://zlib.net/pigz/).
2216

23-
```console
24-
refgenie pull hg38/bowtie2_index refgene_anno feat_annotation
25-
```
17+
Optionally, `PEPATAC` can report on fastq quality ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc)) and utilize swappable tools for adapter removal ([trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic)), deduplication ([picard](https://broadinstitute.github.io/picard/)), and signal track generation ([seqOutBias](https://github.com/guertinlab/seqOutBias), [bedGraphToBigWig](http://hgdownload.soe.ucsc.edu/admin/exe/), and [bigWigMerge](http://hgdownload.soe.ucsc.edu/admin/exe/)).
2618

27-
PEPATAC also requires `bowtie2_index` for any pre-alignment genomes:
19+
The easiest and preferred way is to use `conda` to install all the tools in a single command, but be prepared for this initial installation to take more than an hour to complete.
2820

29-
```console
30-
refgenie pull rCRSd/bowtie2_index
31-
refgenie pull human_repeats/bowtie2_index
21+
From the `pepatac/` directory:
22+
```{bash}
23+
conda env create -f requirements-conda.yml
3224
```
3325

34-
## 3: Install required software
35-
36-
You have two options for software prerequisites: 1) use a container, or 2) install all prerequisites natively. If you want to use containers, you need our [multi-container environment manager, `bulker`](https://bulker.databio.org/en/latest/), and either `docker` or `singularity` -- please see instructions in [how to run PEPATAC with containers](run-container.md). Otherwise, follow these instructions to install the requirements natively:
26+
Note: The subsequent steps all assume you have installed using `conda`. Alternatively, you can follow instructions to install each individual program natively. If you need additional direction with this approach, see the [detailed installation instructions](detailed-install.md).
3727

3828
### Python packages
3929

40-
`PEPATAC` uses several packages under the hood. From the `pepatac/` directory:
30+
`PEPATAC` uses several Python packages under the hood. Not all of these are available through `conda`, so we'll install them ourselves into the `pepatac` `conda` environment. From the `pepatac/` directory:
4131

4232
```{bash}
43-
pip install --user -r requirements.txt
33+
conda activate pepatac
34+
unset PYTHONPATH
35+
python -m pip install --ignore-installed --upgrade -r requirements.txt
4436
```
4537

4638
### R packages
4739

48-
`PEPATAC` uses `R` to generate quality control and read/peak annotation plots, so you'll need to have R functional if you want these outputs. We have packaged all the `R` code into a supporting package called [PEPATACr](https://github.com/databio/pepatac/tree/dev/PEPATACr). The `PEPATAC` package relies on a few additional packages which can be installed at the command line as follows:
40+
`PEPATAC` uses `R` to generate quality control and read/peak annotation plots. We have packaged the `pepatac`-specific `R` code into a supporting package called [PEPATACr](https://github.com/databio/pepatac/tree/dev/PEPATACr). The `PEPATACr` package relies on a few additional packages which can be installed to the `conda` environment.
4941

50-
```
51-
Rscript -e 'install.packages("devtools")'
52-
Rscript -e 'devtools::install_github("pepkit/pepr")'
53-
Rscript -e 'install.packages("BiocManager")'
54-
Rscript -e 'BiocManager::install("GenomicRanges")'
55-
Rscript -e 'devtools::install_github("databio/GenomicDistributions")'
56-
Rscript -e 'BiocManager::install(c("BSgenome", "GenomicFeatures", "ensembldb"))'
57-
Rscript -e 'install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.1.tar.gz", repos=NULL)'
42+
To ensure these packages are installed to the `pepatac` `conda` environment, make sure to point your `R_LIBS` environment variable to the `conda` environment `R` library. For example:
43+
```{bash}
44+
conda activate pepatac
45+
unset R_LIBS
46+
export R_LIBS="$CONDA_PREFIX/lib/R/library"
5847
```
5948

60-
Then, install the `PEPATAC` package. From the `pepatac/` directory:
61-
```
62-
Rscript -e 'devtools::install(file.path("PEPATACr/"), dependencies=TRUE, repos="https://cloud.r-project.org/")'
49+
From the `pepatac/` directory, open `R` and install the following packages:
50+
```{R}
51+
install.packages("optigrab")
52+
devtools::install_github("databio/GenomicDistributions")
53+
install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL)
54+
devtools::install(file.path("PEPATACr/"), dependencies=TRUE, repos="https://cloud.r-project.org/")
6355
```
6456

65-
### Tools
57+
## 3: Download `refgenie` assets
6658

67-
We will need some common bioinformatics tools installed: [bedtools (v2.25.0+)](http://bedtools.readthedocs.io/en/latest/), [bowtie2 (v2.2.9+)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [preseq (v2.0+)](http://smithlabresearch.org/software/preseq/), [samblaster (v0.1.24+)](https://github.com/GregoryFaust/samblaster), [samtools (v1.7+)](http://www.htslib.org/), [skewer (v0.1.126+)](https://github.com/relipmoc/skewer), [UCSC tools](http://hgdownload.soe.ucsc.edu/admin/exe/) (wigToBigWig, bigWigCat, bedToBigBed), [pigz (v2.3.4+)](https://zlib.net/pigz/). Optionally, `PEPATAC` can report on fastq quality ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc)) and utilize swappable tools for adapter removal ([trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic)), deduplication ([picard](https://broadinstitute.github.io/picard/)), and signal track generation ([seqOutBias](https://github.com/guertinlab/seqOutBias), [bedGraphToBigWig](http://hgdownload.soe.ucsc.edu/admin/exe/), and [bigWigMerge](http://hgdownload.soe.ucsc.edu/admin/exe/)).
59+
PEPATAC uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. If you haven't already, initialize a refgenie config file like this:
6860

69-
You should follow instructions to install each individual program. If you need help installing these, see the [detailed installation instructions](detailed-install.md).
61+
```console
62+
export REFGENIE=/path/to/your_genome_folder/genome_config.yaml
63+
refgenie init -c $REFGENIE
64+
```
7065

71-
## 4: Run an example project through `PEPATAC`
66+
Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists.
7267

73-
Start by running the example project (test_config.yaml) in the `examples/test_project/` folder. `PEPATAC` uses a project management tool called `looper` to run the pipeline across samples in a project. Let's use the `-d` argument to do a dry run, which will create job scripts for every sample in a project, but will not execute them:
68+
Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command:
7469

70+
```console
71+
refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb
72+
refgenie build hg38/feat_annotation
7573
```
76-
cd pepatac
77-
looper run -d examples/test_project/test_config.yaml
74+
75+
PEPATAC also requires a `bowtie2_index` asset for any pre-alignment genomes:
76+
77+
```console
78+
refgenie pull rCRSd/bowtie2_index
79+
refgenie pull human_repeats/bowtie2_index
7880
```
7981

80-
If the looper executable is not in your $PATH, add the following line to your .bashrc or .profile:
82+
## 4: Run an example project through `PEPATAC`
83+
84+
Start by running the example project (`test_config.yaml`) in the `examples/test_project/` folder. `PEPATAC` uses a project management tool called `looper` to run the pipeline across samples in a project. Let's use the `-d` argument to first try a dry run, which will create job scripts for every sample in a project, but will not execute them:
8185

86+
From the `pepatac/` folder:
8287
```
83-
export PATH=$PATH:~/.local/bin
88+
looper run -d examples/test_project/test_config.yaml
8489
```
8590

86-
If that worked, let's actually run the example by taking out the -d flag:
91+
If that looked good, let's actually run the example by taking out the `-d` flag:
8792
```
8893
looper run examples/test_project/test_config.yaml
8994
```
@@ -95,11 +100,11 @@ bulker activate databio/pepatac
95100
looper run examples/test_project/test_config.yaml
96101
```
97102

98-
There are lots of other cool things you can do with looper, like dry runs, summarize results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [looper docs](http://looper.databio.org/).
103+
There are lots of other cool things you can do with looper, like dry runs, report results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [looper docs](http://looper.databio.org/).
99104

100105
## 5: Configure your project files
101106

102-
To run your own samples, you'll need to organize them in **PEP format**, which is explained in [how to create a PEP](https://pepkit.github.io/docs/home/) and is universal to all pipelines that read PEPs, including `PEPATAC`. To get you started, there are multiple examples you can adapt in the `examples/` folder (*e.g.* [example test PEP](https://github.com/databio/pepatac/tree/master/examples/test_project)). In short, you need two files for your project:
107+
To run your own samples, you'll need to organize them in **PEP format**, which is explained in [how to create a PEP](http://pep.databio.org/en/latest/simple_example/) and is universal to all pipelines that read PEPs, including `PEPATAC`. To get you started, there are multiple examples you can adapt in the `examples/` folder (*e.g.* [example test PEP](https://github.com/databio/pepatac/tree/master/examples/test_project)). In short, you need two files for your project:
103108

104109
1. project config file -- describes output locations, pointers to data, etc.
105110
2. sample annotation file -- comma-separated value (CSV) list of your samples.

docs/usage.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V]
2222
[--motif] [--sob] [--no-scale] [--prioritize] [--keep]
2323
[--noFIFO] [--lite] [--skipqc] [-V]
2424
25-
PEPATAC version 0.9.13
25+
PEPATAC version 0.9.14
2626
2727
optional arguments:
2828
-h, --help show this help message and exit

examples/test_project/test_config.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,17 @@ pep_version: 2.0.0
55
sample_table: test_annotation.csv # sheet listing all samples in the project
66

77
looper: # relative paths are relative to this config file
8-
output_dir: "$PROCESSED/pepatac_test" # ABSOLUTE PATH to the parent, shared space where project results go
9-
pipeline_interfaces: ["$CODE/pepatac/project_pipeline_interface.yaml"] # ABSOLUTE PATH to the directory where looper will find the pipeline repository
8+
output_dir: pepatac_test
9+
pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the project pipeline interface file (relative to this config file).
1010

1111
sample_modifiers:
1212
append:
13-
pipeline_interfaces: ["$CODE/pepatac/sample_pipeline_interface.yaml"]
13+
pipeline_interfaces: ../../sample_pipeline_interface.yaml
1414
derive:
1515
attributes: [read1, read2]
1616
sources:
17-
test_data_R1: "$CODE/pepatac/examples/data/{sample_name}_r1.fastq.gz"
18-
test_data_R2: "$CODE/pepatac/examples/data/{sample_name}_r2.fastq.gz"
17+
test_data_R1: "examples/data/{sample_name}_r1.fastq.gz"
18+
test_data_R2: "examples/data/{sample_name}_r2.fastq.gz"
1919
imply:
2020
- if:
2121
organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"]
@@ -29,4 +29,4 @@ sample_modifiers:
2929
peak_type: fixed # Default. [options: variable]
3030
extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream.
3131
frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run
32-
blacklist: $GENOMES/hg38/blacklist/default/hg38_blacklist.bed.gz
32+

0 commit comments

Comments
 (0)