From 6f7e510a67b3a06c6097e5e2b49b317965565e56 Mon Sep 17 00:00:00 2001
From: Laura Cook <l.cook2@student.unimelb.edu.au>
Date: Thu, 3 Sep 2020 10:31:05 +1000
Subject: [PATCH] created separate snakefiles for histone marks as I couldn't
 get the wildcards to work, so I had to hardcode one of the rules

---
 mouse/{Snakefile => Snakefile_H3K27ac} | 114 +++--
 mouse/Snakefile_H3K4me3                | 663 +++++++++++++++++++++++++
 2 files changed, 730 insertions(+), 47 deletions(-)
 rename mouse/{Snakefile => Snakefile_H3K27ac} (87%)
 create mode 100644 mouse/Snakefile_H3K4me3

diff --git a/mouse/Snakefile b/mouse/Snakefile_H3K27ac
similarity index 87%
rename from mouse/Snakefile
rename to mouse/Snakefile_H3K27ac
index f0e2d17..55a1eea 100644
--- a/mouse/Snakefile
+++ b/mouse/Snakefile_H3K27ac
@@ -12,7 +12,7 @@
     # 10. Present QC for raw read, alignment, peak-calling in MultiQC
 
 
-configfile: "envs/config.yaml"
+configfile: "configs/config_H3K427ac.yaml"
 # Contains sample and genome information
 # calls the SRR.txt file that contains the samples as either IP or control
 
@@ -40,7 +40,7 @@ unique_marks = list(set(MARK))
 unique_stages = list(set(STAGE))
 
 ## combine all samples
-all_samples = IPS + unique_inputs
+all_samples = IPS + INPUTS
 
 # ===============================================================================================
 #        Output targets
@@ -80,17 +80,19 @@ rule all:
         "logs/input_E13.5_H3K27ac.mergeBAM",
         "logs/input_E14.5_H3K27ac.mergeBAM",
         "logs/input_E15.5_H3K27ac.mergeBAM",
-        "results/qc/multibamsum.npz",
-        "results/qc/multibamsum.tab",
-        "results/qc/pearsoncor_multibamsum.png",
-        "results/qc/pearsoncor_multibamsum_matrix.txt",
-        expand("results/qc/{sample}_{stage}_{mark}.SeqDepthNorm.bw", zip, sample=all_samples, stage=STAGE, mark=MARK),
-        "results/qc/multiBAM_fingerprint.png",
-        "results/qc/multiBAM_fingerprint_metrics.txt",
-        "results/qc/multiBAM_fingerprint_rawcounts.txt",
-        "results/qc/plot_coverage.png",
-        "results/qc/plot_coverage_rawcounts.tab",
-        expand("results/qc/{sample}_{stage}_{mark}.pbc.qc", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        # "results/qc/multibamsum.npz",
+        # "results/qc/multibamsum.tab",
+        # "results/qc/pearsoncor_multibamsum.png",
+        # "results/qc/pearsoncor_multibamsum_matrix.txt",
+        # expand("results/qc/{sample}_{stage}_{mark}.SeqDepthNorm.bw", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        # "results/qc/multiBAM_fingerprint.png",
+        # "results/qc/multiBAM_fingerprint_metrics.txt",
+        # "results/qc/multiBAM_fingerprint_rawcounts.txt",
+        # "results/qc/plot_coverage.png",
+        # "results/qc/plot_coverage_rawcounts.tab",
+        #expand("results/bwa/{sample}_{stage}_{mark}_q30.dupmark.tmp.bam", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        #expand("logs/{sample}_{stage}_{mark}.pbc.sort", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        #expand("results/qc/{sample}_{stage}_{mark}.pbc.qc", zip, sample=all_samples, stage=STAGE, mark=MARK),
         expand("results/qc/{sample}_{stage}_{mark}_filt_15Mreads.SE.cc.qc",zip, sample=all_samples, stage=STAGE, mark=MARK),
         expand("results/qc/{sample}_{stage}_{mark}_filt_15Mreads.SE.cc.plot.pdf", zip, sample=all_samples, stage=STAGE, mark=MARK),
         expand("results/macs2/{case}_vs_{control}_{stage}_{mark}_macs2_peaks.narrowPeak", zip, case=IPS, control=INPUTS, stage=STAGE, mark=MARK),
@@ -100,14 +102,13 @@ rule all:
         expand("results/bwa/{case}_{stage}_{mark}_q30.sorted.dedup.bed", zip, case=IPS, stage=STAGE, mark=MARK),
         expand("logs/{case}_{stage}_{mark}.bamToBed", zip, case=IPS, stage=STAGE, mark=MARK),
         expand("results/qc/{case}_vs_{control}_{stage}_{mark}.frip.txt", zip, case=IPS, control=INPUTS, stage=STAGE, mark=MARK),
-        directory("results/macs2/pooled"),
-        # "results/macs2/H3K27ac_E10.5_overlap.narrowPeak",
-        # "results/macs2/H3K27ac_E11.5_overlap.narrowPeak",
-        # "results/macs2/H3K27ac_E12.5_overlap.narrowPeak",
-        # "results/macs2/H3K27ac_E13.5_overlap.narrowPeak",
-        # "results/macs2/H3K27ac_E14.5_overlap.narrowPeak",
-        # "results/macs2/H3K27ac_E15.5_overlap.narrowPeak"
-        # expand("results/macs2/{stage}_H3K27ac_overlap.narrowPeak", zip, stage= STAGE, mark = MARK)
+        directory("results/macs2/pooled/"),
+        "results/macs2/H3K27ac_E10.5_overlap.narrowPeak",
+        "results/macs2/H3K27ac_E11.5_overlap.narrowPeak",
+        "results/macs2/H3K27ac_E12.5_overlap.narrowPeak",
+        "results/macs2/H3K27ac_E13.5_overlap.narrowPeak",
+        "results/macs2/H3K27ac_E14.5_overlap.narrowPeak",
+        "results/macs2/H3K27ac_E15.5_overlap.narrowPeak"
         # directory("results/multiqc/multiqc_report_data/"),
         # "results/multiqc/multiqc_report.html"
 
@@ -267,22 +268,41 @@ rule mappingStats:
         shell("samtools flagstat {input.c} > {output.c}")
         shell("samtools flagstat {input.d} > {output.d}")
 
-rule estimate_lib_complexity:
-    input:
-        "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam"
-    output:
-        qc = "results/qc/{sample}_{stage}_{mark}.pbc.qc",
-    log:
-        "logs/{sample}_{stage}_{mark}.pbc"
-    shell:
-        """
-        bedtools bamtobed -bedpe -i {input} \
-        | awk 'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}' \
-        | sort | uniq -c \
-        | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \
-        {{m0=m0+1}} {{mt=mt+$1}} END{{printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n" ,mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \
-        > {output.qc}
-        """
+
+# rule sort_name:
+#     input:
+#         "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam"
+#     output:
+#         tmp = "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.tmp.bam"
+#     log:
+#         "logs/{sample}_{stage}_{mark}.pbc.sort"
+#     run:
+#         shell("samtools sort -n {input} -o {output.tmp} 2> {log}")
+#
+# rule estimate_lib_complexity:
+#     input:
+#         "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.tmp.bam"
+#     output:
+#         qc = "results/qc/{sample}_{stage}_{mark}.pbc.qc",
+#     log:
+#         "logs/{sample}_{stage}_{mark}.pbc"
+#     shell:
+#         """
+#         bedtools bamtobed -i {input} \
+#         | awk 'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6}}' \
+#         | sort | uniq -c \
+#         | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \
+#         {{m0=m0+1}} {{mt=mt+$1}} END{{printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n" ,mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \
+#         > {output.qc}
+#         """
+#
+## convert to bedPE
+## print read 1 scaffold, read 1 start coordinate, read 2 scaffold, read 2 end coordinate, strand read 1, strand read 2
+## remove mitochondrial genome
+## sort by position and obtain uniq reads
+## Format of file
+## TotalReadPairs [tab] DistinctReadPairs [tab] OneReadPair [tab] TwoReadPairs [tab] NRF=Distinct/Total [tab] PBC1=OnePair/Distinct [tab] PBC2=OnePair/TwoPair
+
 
 # ===============================================================================================
 #  5. deepTools
@@ -470,23 +490,23 @@ rule call_peaks_macs2_pooled_replicates:
         E12C = "results/bwa/input_E12.5_H3K27ac_q30.sorted.dedup.bam",
         E13C = "results/bwa/input_E13.5_H3K27ac_q30.sorted.dedup.bam",
         E14C = "results/bwa/input_E14.5_H3K27ac_q30.sorted.dedup.bam",
-        E15C = "results/bwa/input_E15.5_H3K27ac_q30.sorted.dedup.bam",
+        E15C = "results/bwa/input_E15.5_H3K27ac_q30.sorted.dedup.bam"
     output:
-        directory("results/macs2/pooled")
+        directory("results/macs2/pooled/")
     log:
         E10 = "results/qc/E10.5_H3K27ac.pooled.macs2",
         E11 = "results/qc/E11.5_H3K27ac.pooled.macs2",
         E12 = "results/qc/E12.5_H3K27ac.pooled.macs2",
         E13 = "results/qc/E13.5_H3K27ac.pooled.macs2",
         E14 = "results/qc/E14.5_H3K27ac.pooled.macs2",
-        E15 = "results/qc/E15.5_H3K27ac.pooled.macs2",
+        E15 = "results/qc/E15.5_H3K27ac.pooled.macs2"
     params:
         E10 = "E10.5_H3K27ac.pooled.macs2",
         E11 = "E11.5_H3K27ac.pooled.macs2",
         E12 = "E12.5_H3K27ac.pooled.macs2",
         E13 = "E13.5_H3K27ac.pooled.macs2",
         E14 = "E14.5_H3K27ac.pooled.macs2",
-        E15 = "E15.5_H3K27ac.pooled.macs2",
+        E15 = "E15.5_H3K27ac.pooled.macs2"
     run:
         shell("macs2 callpeak -f BAM -t {input.E10} \
         -c {input.E10C} --keep-dup all \
@@ -556,12 +576,12 @@ rule frip:
 
 rule overlap_peaks:
     input:
-        E10= expand(["results/macs2/{case}_vs_{control}_E10.5_H3K27ac_macs2_peaks.narrowPeak"], zip, case=IPS, control=INPUTS, mark=MARK),
-        E11= expand(["results/macs2/{case}_vs_{control}_E11.5_H3K27ac_macs2_peaks.narrowPeak"], zip, case=IPS, control=INPUTS, mark=MARK),
-        E12= expand(["results/macs2/{case}_vs_{control}_E12.5_H3K27ac_macs2_peaks.narrowPeak"], zip, case=IPS, control=INPUTS, mark=MARK),
-        E13= expand(["results/macs2/{case}_vs_{control}_E13.5_H3K27ac_macs2_peaks.narrowPeak"], zip, case=IPS, control=INPUTS, mark=MARK),
-        E14= expand(["results/macs2/{case}_vs_{control}_E14.5_H3K27ac_macs2_peaks.narrowPeak"], zip, case=IPS, control=INPUTS, mark=MARK),
-        E15= expand(["results/macs2/{case}_vs_{control}_E15.5_H3K27ac_macs2_peaks.narrowPeak"], zip, case=IPS, control=INPUTS, mark=MARK),
+        E10= ["results/macs2/ENCFF213EBC_vs_ENCFF157KEH_E10.5_H3K27ac_macs2_peaks.narrowPeak", "results/macs2/ENCFF548BRR_vs_ENCFF825AVI_E10.5_H3K27ac_macs2_peaks.narrowPeak"],
+        E11= ["results/macs2/ENCFF512SFE_vs_ENCFF184CUE_E11.5_H3K27ac_macs2_peaks.narrowPeak", "results/macs2/ENCFF515PKL_vs_ENCFF376FGM_E11.5_H3K27ac_macs2_peaks.narrowPeak"],
+        E12= ["results/macs2/ENCFF011NFM_vs_ENCFF058AUT_E12.5_H3K27ac_macs2_peaks.narrowPeak", "results/macs2/ENCFF394TZN_vs_ENCFF203JQV_E12.5_H3K27ac_macs2_peaks.narrowPeak"],
+        E13= ["results/macs2/ENCFF194ORC_vs_ENCFF117QRC_E13.5_H3K27ac_macs2_peaks.narrowPeak", "results/macs2/ENCFF290ZNF_vs_ENCFF248PGK_E13.5_H3K27ac_macs2_peaks.narrowPeak"],
+        E14= ["results/macs2/ENCFF902HAR_vs_ENCFF002HZV_E14.5_H3K27ac_macs2_peaks.narrowPeak", "results/macs2/ENCFF327VAO_vs_ENCFF784ORI_E14.5_H3K27ac_macs2_peaks.narrowPeak"],
+        E15= ["results/macs2/ENCFF707WKL_vs_ENCFF182XFG_E15.5_H3K27ac_macs2_peaks.narrowPeak", "results/macs2/ENCFF584JFB_vs_ENCFF727QTS_E15.5_H3K27ac_macs2_peaks.narrowPeak"],
         E10_pooled="results/macs2/E10.5_H3K27ac.pooled.macs2_peaks.narrowPeak",
         E11_pooled="results/macs2/E11.5_H3K27ac.pooled.macs2_peaks.narrowPeak",
         E12_pooled="results/macs2/E12.5_H3K27ac.pooled.macs2_peaks.narrowPeak",
diff --git a/mouse/Snakefile_H3K4me3 b/mouse/Snakefile_H3K4me3
new file mode 100644
index 0000000..6e37aac
--- /dev/null
+++ b/mouse/Snakefile_H3K4me3
@@ -0,0 +1,663 @@
+## Author: Laura E Cook, University of Melbourne
+## Purpose: ChIP-seq pipeline for histone marks
+## This workflow only works for single-end reads
+    # 1. FastQC on raw reads
+    # 2. Alignment (downloaded from ENCODE)
+    # 3. Filtering
+    # 4. Alignment QC & Library Complexity
+    # 5. deepTools
+    # 6. Cross-correlation (SPP)
+    # 7. Call narrow peaks (MACS2)
+    # 8. Peak QC
+    # 9. Create consensus peaksets
+    # 10. Present QC for raw reads, alignment and peak calling in MultiQC
+
+
+configfile: "configs/config_H3K4me3.yaml"
+# Contains sample and genome information
+# calls the SRR.txt file that contains the samples as either IP or control
+
+import csv
+import os
+
+IPS = []
+INPUTS = []
+MARK = []
+STAGE = []
+
+with open(config['SAMPLES'], "r") as f:
+    reader = csv.reader(f, delimiter = "\t")
+    # skip the header
+    header = next(reader)
+    for row in reader:
+        STAGE.append(row[2])
+        MARK.append(row[1])
+        IPS.append(row[3])
+        INPUTS.append(row[4])
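+
+## For reference, config['SAMPLES'] is expected to point to a tab-separated sheet
+## with a header row and at least the columns used above; the example row below is
+## illustrative only (it pairs accessions that also appear in rule overlap_peaks):
+##   <id>    <mark>      <stage>   <IP accession>   <input accession>
+##   ...     H3K4me3     E10.5     ENCFF124UYX      ENCFF157KEH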
+
+## multiple samples may use the same control input files
+unique_inputs = list(set(INPUTS))
+unique_marks = list(set(MARK))
+unique_stages = list(set(STAGE))
+
+## combine all samples
+all_samples = IPS + INPUTS
+
+# ===============================================================================================
+#        Output targets
+# ===============================================================================================
+
+rule all:
+    input:
+        expand("results/bwa/{sample}_{stage}_{mark}_q30.sorted.bam", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        expand("results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        expand("results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        expand("results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        expand("results/qc/{sample}_{stage}_{mark}.dedup.flagstat.qc", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        expand("results/qc/{sample}_{stage}_{mark}.dupmark.flagstat.qc", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        expand("results/qc/{sample}_{stage}_{mark}.q30.flagstat.qc", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        expand("results/qc/{sample}_{stage}_{mark}.unfiltered.flagstat.qc", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        "results/bwa/E10.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        "results/bwa/E11.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        "results/bwa/E12.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        "results/bwa/E13.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        "results/bwa/E14.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        "results/bwa/E15.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        "results/bwa/input_E10.5_H3K4me3_q30.sorted.dedup.bam",
+        "results/bwa/input_E11.5_H3K4me3_q30.sorted.dedup.bam",
+        "results/bwa/input_E12.5_H3K4me3_q30.sorted.dedup.bam",
+        "results/bwa/input_E13.5_H3K4me3_q30.sorted.dedup.bam",
+        "results/bwa/input_E14.5_H3K4me3_q30.sorted.dedup.bam",
+        "results/bwa/input_E15.5_H3K4me3_q30.sorted.dedup.bam",
+        "logs/E10.5_H3K4me3.mergeBAM",
+        "logs/E11.5_H3K4me3.mergeBAM",
+        "logs/E12.5_H3K4me3.mergeBAM",
+        "logs/E13.5_H3K4me3.mergeBAM",
+        "logs/E14.5_H3K4me3.mergeBAM",
+        "logs/E15.5_H3K4me3.mergeBAM",
+        "logs/input_E10.5_H3K4me3.mergeBAM",
+        "logs/input_E11.5_H3K4me3.mergeBAM",
+        "logs/input_E12.5_H3K4me3.mergeBAM",
+        "logs/input_E13.5_H3K4me3.mergeBAM",
+        "logs/input_E14.5_H3K4me3.mergeBAM",
+        "logs/input_E15.5_H3K4me3.mergeBAM",
+        # "results/qc/multibamsum.npz",
+        # "results/qc/multibamsum.tab",
+        # "results/qc/pearsoncor_multibamsum.png",
+        # "results/qc/pearsoncor_multibamsum_matrix.txt",
+        # expand("results/qc/{sample}_{stage}_{mark}.SeqDepthNorm.bw", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        # "results/qc/multiBAM_fingerprint.png",
+        # "results/qc/multiBAM_fingerprint_metrics.txt",
+        # "results/qc/multiBAM_fingerprint_rawcounts.txt",
+        # "results/qc/plot_coverage.png",
+        # "results/qc/plot_coverage_rawcounts.tab",
+        #expand("results/bwa/{sample}_{stage}_{mark}_q30.dupmark.tmp.bam", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        #expand("logs/{sample}_{stage}_{mark}.pbc.sort", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        #expand("results/qc/{sample}_{stage}_{mark}.pbc.qc", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        expand("results/qc/{sample}_{stage}_{mark}_filt_15Mreads.SE.cc.qc",zip, sample=all_samples, stage=STAGE, mark=MARK),
+        expand("results/qc/{sample}_{stage}_{mark}_filt_15Mreads.SE.cc.plot.pdf", zip, sample=all_samples, stage=STAGE, mark=MARK),
+        expand("results/macs2/{case}_vs_{control}_{stage}_{mark}_macs2_peaks.narrowPeak", zip, case=IPS, control=INPUTS, stage=STAGE, mark=MARK),
+        expand("results/macs2/{case}_vs_{control}_{stage}_{mark}_macs2_peaks.xls", zip, case=IPS, control=INPUTS, stage=STAGE, mark=MARK),
+        expand("results/macs2/{case}_vs_{control}_{stage}_{mark}_macs2_summits.bed", zip, case=IPS, control=INPUTS, stage=STAGE, mark=MARK),
+        expand("logs/{case}_vs_{control}_{stage}_{mark}_call_peaks_macs2.log", zip, case=IPS, control=INPUTS, stage=STAGE, mark=MARK),
+        expand("results/bwa/{case}_{stage}_{mark}_q30.sorted.dedup.bed", zip, case=IPS, stage=STAGE, mark=MARK),
+        expand("logs/{case}_{stage}_{mark}.bamToBed", zip, case=IPS, stage=STAGE, mark=MARK),
+        expand("results/qc/{case}_vs_{control}_{stage}_{mark}.frip.txt", zip, case=IPS, control=INPUTS, stage=STAGE, mark=MARK),
+        directory("results/macs2/pooled/"),
+        "results/macs2/H3K4me3_E10.5_overlap.narrowPeak",
+        "results/macs2/H3K4me3_E11.5_overlap.narrowPeak",
+        "results/macs2/H3K4me3_E12.5_overlap.narrowPeak",
+        "results/macs2/H3K4me3_E13.5_overlap.narrowPeak",
+        "results/macs2/H3K4me3_E14.5_overlap.narrowPeak",
+        "results/macs2/H3K4me3_E15.5_overlap.narrowPeak"
+        # directory("results/multiqc/multiqc_report_data/"),
+        # "results/multiqc/multiqc_report.html"
+
+# ===============================================================================================
+#  1. FASTQC
+# ===============================================================================================
+
+## Done separately because the fastq and BAM files don't share file prefixes,
+## so it becomes very messy without renaming everything
+
+# ===============================================================================================
+#  2. ALIGNMENT
+#   > ENCODE alignment downloaded from ENCODE database
+# ===============================================================================================
+
+## Single-end 36-50 nt reads
+## bwa aln was used for aligning reads to the mouse genome (mm10 assembly)
+## Alignment command: bwa aln -q 5 -l 32 -k 2 <reference_file> <read_file>
+## bwa version 0.7.7
+
+## -q = quality threshold for read trimming
+## -l = seed length
+## -k = maximum edit distance in the seed
+
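+## For reference only: a sketch of an equivalent single-end bwa aln rule. Alignment
+## is not run in this workflow (BAMs are downloaded from ENCODE), so the rule is left
+## commented out; the fastq path and the BWA_INDEX config key are hypothetical.
+# rule bwa_aln_se:
+#     input:
+#         "fastq/{sample}_{stage}_{mark}.fastq.gz"            # hypothetical path
+#     output:
+#         bam = "results/bwa/{sample}_{stage}_{mark}.bam",
+#         sai = temp("results/bwa/{sample}_{stage}_{mark}.sai")
+#     params:
+#         index = config['BWA_INDEX']                         # hypothetical config key (bwa index prefix)
+#     log:
+#         "logs/{sample}_{stage}_{mark}.align"
+#     run:
+#         shell("bwa aln -q 5 -l 32 -k 2 {params.index} {input} > {output.sai} 2> {log}")
+#         shell("bwa samse {params.index} {output.sai} {input} | samtools view -Sb - > {output.bam} 2>> {log}")
+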
+# ===============================================================================================
+#  3. FILTERING
+#   > remove unmapped, mate unmapped
+#   > remove low MAPQ reads (-q 30)
+#   > -F 1804
+#           -Read unmapped
+#           -Mate unmapped
+#           -Not primary alignment
+#           -Read fails platform/vendor quality checks
+#           -Read is PCR or optical duplicate
+#   > mark duplicates & remove duplicates (picard)
+#   > re-filter, sort and index final BAM
+# ===============================================================================================
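+
+## Quick sanity check of the -F 1804 value above: it is just the sum of the SAM flag
+## bits listed in this header (a sketch, not used by any rule).
+# SAM_FLAGS = {
+#     "read_unmapped": 0x4,
+#     "mate_unmapped": 0x8,
+#     "not_primary_alignment": 0x100,
+#     "fails_vendor_qc": 0x200,
+#     "pcr_or_optical_duplicate": 0x400,
+# }
+# assert sum(SAM_FLAGS.values()) == 1804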
+
+rule filter:
+    input:
+        "results/bwa/{sample}_{stage}_{mark}.bam"
+    output:
+        "results/bwa/{sample}_{stage}_{mark}_q30.sorted.bam"
+    log:
+        "logs/{sample}_{stage}_{mark}.filter"
+    shell:
+        "samtools view -b -F 1804 -q 30 {input} | samtools sort -o {output} - 2> {log}"
+
+rule markDups:
+    input:
+        "results/bwa/{sample}_{stage}_{mark}_q30.sorted.bam"
+    output:
+        bam="results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam",
+        dupQC="results/bwa/{sample}_{stage}_{mark}.dupmark.qc"
+    log:
+        "logs/{sample}_{stage}_{mark}.dedup"
+    shell:
+        "picard MarkDuplicates I={input} O={output.bam} \
+        METRICS_FILE={output.dupQC} REMOVE_DUPLICATES=false ASSUME_SORTED=true 2> {log}"
+
+rule dedup:
+    input:
+        "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam"
+    output:
+        "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam"
+    log:
+        "logs/{sample}_{stage}_{mark}.dedup"
+    shell:
+        "samtools view -F 1804 -b {input} | samtools sort -o {output} - 2> {log}"
+
+rule indexBam:
+    input:
+        "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam"
+    output:
+        "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai"
+    log:
+        "logs/{sample}_{stage}_{mark}.indexBam"
+    shell:
+        "samtools index {input} {output} 2> {log}"
+
+rule mergeBAMreplicates:
+    input:
+        E10 = ["results/bwa/ENCFF124UYX_E10.5_{mark}_q30.sorted.dedup.bam", "results/bwa/ENCFF045IPK_E10.5_{mark}_q30.sorted.dedup.bam"],
+        E11 = ["results/bwa/ENCFF760QYZ_E11.5_{mark}_q30.sorted.dedup.bam", "results/bwa/ENCFF717QDV_E11.5_{mark}_q30.sorted.dedup.bam"],
+        E12 = ["results/bwa/ENCFF182ZPF_E12.5_{mark}_q30.sorted.dedup.bam", "results/bwa/ENCFF941QJZ_E12.5_{mark}_q30.sorted.dedup.bam"],
+        E13 = ["results/bwa/ENCFF485UDC_E13.5_{mark}_q30.sorted.dedup.bam", "results/bwa/ENCFF124TAB_E13.5_{mark}_q30.sorted.dedup.bam"],
+        E14 = ["results/bwa/ENCFF724DMU_E14.5_{mark}_q30.sorted.dedup.bam", "results/bwa/ENCFF665QBJ_E14.5_{mark}_q30.sorted.dedup.bam"],
+        E15 = ["results/bwa/ENCFF258KCR_E15.5_{mark}_q30.sorted.dedup.bam", "results/bwa/ENCFF401BKM_E15.5_{mark}_q30.sorted.dedup.bam"],
+        E10C = ["results/bwa/ENCFF157KEH_E10.5_H3K4me3_q30.sorted.dedup.bam", "results/bwa/ENCFF825AVI_E10.5_H3K4me3_q30.sorted.dedup.bam"],
+        E11C = ["results/bwa/ENCFF184CUE_E11.5_H3K4me3_q30.sorted.dedup.bam", "results/bwa/ENCFF376FGM_E11.5_H3K4me3_q30.sorted.dedup.bam"],
+        E12C = ["results/bwa/ENCFF203JQV_E12.5_H3K4me3_q30.sorted.dedup.bam", "results/bwa/ENCFF058AUT_E12.5_H3K4me3_q30.sorted.dedup.bam"],
+        E13C = ["results/bwa/ENCFF117QRC_E13.5_H3K4me3_q30.sorted.dedup.bam", "results/bwa/ENCFF248PGK_E13.5_H3K4me3_q30.sorted.dedup.bam"],
+        E14C = ["results/bwa/ENCFF784ORI_E14.5_H3K4me3_q30.sorted.dedup.bam", "results/bwa/ENCFF002HZV_E14.5_H3K4me3_q30.sorted.dedup.bam"],
+        E15C = ["results/bwa/ENCFF727QTS_E15.5_H3K4me3_q30.sorted.dedup.bam", "results/bwa/ENCFF182XFG_E15.5_H3K4me3_q30.sorted.dedup.bam"],
+    output:
+        E10 = "results/bwa/E10.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E11 = "results/bwa/E11.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E12 = "results/bwa/E12.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E13 = "results/bwa/E13.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E14 = "results/bwa/E14.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E15 = "results/bwa/E15.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E10C = "results/bwa/input_E10.5_H3K4me3_q30.sorted.dedup.bam",
+        E11C = "results/bwa/input_E11.5_H3K4me3_q30.sorted.dedup.bam",
+        E12C = "results/bwa/input_E12.5_H3K4me3_q30.sorted.dedup.bam",
+        E13C = "results/bwa/input_E13.5_H3K4me3_q30.sorted.dedup.bam",
+        E14C = "results/bwa/input_E14.5_H3K4me3_q30.sorted.dedup.bam",
+        E15C = "results/bwa/input_E15.5_H3K4me3_q30.sorted.dedup.bam"
+    log:
+        E10 = "logs/E10.5_H3K4me3.mergeBAM",
+        E11 = "logs/E11.5_H3K4me3.mergeBAM",
+        E12 = "logs/E12.5_H3K4me3.mergeBAM",
+        E13 = "logs/E13.5_H3K4me3.mergeBAM",
+        E14 = "logs/E14.5_H3K4me3.mergeBAM",
+        E15 = "logs/E15.5_H3K4me3.mergeBAM",
+        E10C = "logs/input_E10.5_H3K4me3.mergeBAM",
+        E11C = "logs/input_E11.5_H3K4me3.mergeBAM",
+        E12C = "logs/input_E12.5_H3K4me3.mergeBAM",
+        E13C = "logs/input_E13.5_H3K4me3.mergeBAM",
+        E14C = "logs/input_E14.5_H3K4me3.mergeBAM",
+        E15C = "logs/input_E15.5_H3K4me3.mergeBAM"
+    run:
+        shell("samtools merge {output.E10} {input.E10} 2> {log.E10}")
+        shell("samtools merge {output.E11} {input.E11} 2> {log.E11}")
+        shell("samtools merge {output.E12} {input.E12} 2> {log.E12}")
+        shell("samtools merge {output.E13} {input.E13} 2> {log.E13}")
+        shell("samtools merge {output.E14} {input.E14} 2> {log.E14}")
+        shell("samtools merge {output.E15} {input.E15} 2> {log.E15}")
+        shell("samtools merge {output.E10C} {input.E10C} 2> {log.E10C}")
+        shell("samtools merge {output.E11C} {input.E11C} 2> {log.E11C}")
+        shell("samtools merge {output.E12C} {input.E12C} 2> {log.E12C}")
+        shell("samtools merge {output.E13C} {input.E13C} 2> {log.E13C}")
+        shell("samtools merge {output.E14C} {input.E14C} 2> {log.E14C}")
+        shell("samtools merge {output.E15C} {input.E15C} 2> {log.E15C}")
+
+
+# ===============================================================================================
+#  4. ALIGNMENT QC
+#   > SAMtools flagstat statistics
+#   > CollectMultipleMetrics (picard)
+#   > Compute Library Complexity (PreSeq)
+# ===============================================================================================
+
+rule mappingStats:
+    input:
+        a="results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam",
+        b="results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam",
+        c="results/bwa/{sample}_{stage}_{mark}_q30.sorted.bam",
+        d="results/bwa/{sample}_{stage}_{mark}.bam"
+    output:
+        a="results/qc/{sample}_{stage}_{mark}.dedup.flagstat.qc",
+        b="results/qc/{sample}_{stage}_{mark}.dupmark.flagstat.qc",
+        c="results/qc/{sample}_{stage}_{mark}.q30.flagstat.qc",
+        d="results/qc/{sample}_{stage}_{mark}.unfiltered.flagstat.qc",
+    run:
+        shell("samtools flagstat {input.a} > {output.a}")
+        shell("samtools flagstat {input.b} > {output.b}")
+        shell("samtools flagstat {input.c} > {output.c}")
+        shell("samtools flagstat {input.d} > {output.d}")
+
+
+# rule sort_name:
+#     input:
+#         "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam"
+#     output:
+#         tmp = "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.tmp.bam"
+#     log:
+#         "logs/{sample}_{stage}_{mark}.pbc.sort"
+#     run:
+#         shell("samtools sort -n {input} -o {output.tmp} 2> {log}")
+#
+# rule estimate_lib_complexity:
+#     input:
+#         "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.tmp.bam"
+#     output:
+#         qc = "results/qc/{sample}_{stage}_{mark}.pbc.qc",
+#     log:
+#         "logs/{sample}_{stage}_{mark}.pbc"
+#     shell:
+#         """
+#         bedtools bamtobed -i {input} \
+#         | awk 'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6}}' \
+#         | sort | uniq -c \
+#         | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \
+#         {{m0=m0+1}} {{mt=mt+$1}} END{{printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n" ,mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \
+#         > {output.qc}
+#         """
+#
+## convert to bedPE
+## print read 1 scaffold, read 1 start coordinate, read 2 scaffold, read 2 end coordinate, strand read 1, strand read 2
+## remove mitochondrial genome
+## sort by position and obtain uniq reads
+## Format of file
+## TotalReadPairs [tab] DistinctReadPairs [tab] OneReadPair [tab] TwoReadPairs [tab] NRF=Distinct/Total [tab] PBC1=OnePair/Distinct [tab] PBC2=OnePair/TwoPair
+
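+## Illustrative helper (not called by any rule): parse a *.pbc.qc line written in the
+## format above and compare it to the usual ENCODE-style thresholds; treat the exact
+## cutoffs (NRF > 0.9, PBC1 > 0.9, PBC2 > 3) as an assumption, not a pipeline setting.
+# def parse_pbc_qc(path):
+#     with open(path) as fh:
+#         fields = fh.readline().rstrip("\n").split("\t")
+#     total, distinct, one_pair, two_pair = (int(x) for x in fields[:4])
+#     nrf, pbc1, pbc2 = (float(x) for x in fields[4:7])
+#     return {"TotalReadPairs": total, "DistinctReadPairs": distinct,
+#             "OneReadPair": one_pair, "TwoReadPairs": two_pair,
+#             "NRF": nrf, "PBC1": pbc1, "PBC2": pbc2,
+#             "passes": nrf > 0.9 and pbc1 > 0.9 and pbc2 > 3}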
+
+# ===============================================================================================
+#  5. deepTools
+#   > multiBAMsummary
+#   > plotCorrelation
+#   > plotFingerprint
+#   > bamCoverage (read depth normalised bigWig files)
+# ===============================================================================================
+
+rule deeptools_summary:
+    input:
+        expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam"], zip, sample=all_samples, stage=STAGE, mark = MARK)
+    output:
+        sum="results/qc/multibamsum.npz",
+        counts="results/qc/multibamsum.tab"
+    threads: 32
+    log:
+        "logs/multisummary.deepTools"
+    shell:
+        " multiBamSummary bins \
+        -p {threads} \
+        -b {input} \
+        --centerReads \
+        -out {output.sum} \
+        --outRawCounts {output.counts} 2> {log}"
+
+rule deeptools_correlation:
+    input: "results/qc/multibamsum.npz"
+    output:
+        fig="results/qc/pearsoncor_multibamsum.png",
+        matrix="results/qc/pearsoncor_multibamsum_matrix.txt"
+    log:
+        "logs/correlation.deepTools"
+    shell:
+        "plotCorrelation \
+        --corData {input} \
+        --plotFile {output.fig} \
+        --outFileCorMatrix {output.matrix} \
+        --corMethod pearson \
+        --whatToPlot heatmap \
+        --skipZeros \
+        --plotNumbers \
+        --colorMap RdYlBu 2> {log}"
+
+rule deeptools_coverage:
+    input:
+        bam = "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam",
+        bai = "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai"
+    output:
+        "results/qc/{sample}_{stage}_{mark}.SeqDepthNorm.bw"
+    log:
+        "logs/{sample}_{stage}_{mark}_coverage.deepTools"
+    shell:
+        "bamCoverage --bam {input.bam} --binSize 10 --normalizeUsing RPGC --effectiveGenomeSize 2308125349 --numberOfProcessors 4 -o {output} 2> {log}"
+
+rule deeptools_fingerprint:
+    input:
+        bam = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam"], zip, sample=all_samples, stage=STAGE, mark = MARK),
+        bai = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai"], zip, sample=all_samples, stage=STAGE, mark = MARK)
+    output:
+        fig="results/qc/multiBAM_fingerprint.png",
+        metrics="results/qc/multiBAM_fingerprint_metrics.txt",
+        rawcounts="results/qc/multiBAM_fingerprint_rawcounts.txt"
+    threads: 32
+    log:
+        "logs/fingerprint.deepTools"
+    shell:
+        "plotFingerprint -p {threads} \
+        -b {input.bam} \
+        --plotFile {output.fig} \
+        --outQualityMetrics {output.metrics} \
+        --outRawCounts {output.rawcounts} \
+        --minMappingQuality 20 \
+        --skipZeros \
+        --centerReads 2> {log}"
+
+rule deeptools_plotCoverage:
+    input:
+        bam = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam"], zip, sample=all_samples, stage=STAGE, mark=MARK),
+        bai = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai"], zip, sample=all_samples, stage=STAGE, mark=MARK)
+    output:
+        fig="results/qc/plot_coverage.png",
+        rawcounts="results/qc/plot_coverage_rawcounts.tab"
+    log:
+        "logs/plotCoverage.deepTools"
+    shell:
+        "plotCoverage --bamfiles {input.bam} --plotFile {output.fig} \
+        -n 1000000 \
+        --outRawCounts {output.rawcounts} \
+        --ignoreDuplicates 2> {log}"
+
+
+# ===============================================================================================
+#  6. Cross-correlation scores - following ENCODE code
+# ===============================================================================================
+
+rule bamtobed_crossC:
+    input:
+        "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam"
+    output:
+        tagAlign = "results/bed/{sample}_{stage}_{mark}_q30_SE.tagAlign.gz"
+    shell:
+        """
+        bedtools bamtobed -i {input} | \
+        awk 'BEGIN{{OFS="\\t"}}{{$4="N";$5="1000";print $0}}' |\
+        gzip -nc > {output.tagAlign}
+        """
+
+## Subsample 15 million reads for cross-correlation analysis
+## Estimate read length from first 100 reads
+rule subsample_aligned_reads:
+    input:
+        "results/bed/{sample}_{stage}_{mark}_q30_SE.tagAlign.gz"
+    output:
+        subsample = "results/bed/{sample}_{stage}_{mark}.filt.sample.15Mreads.SE.tagAlign.gz",
+        tmp = "results/bed/{sample}_{stage}_{mark}_R1_trimmed_q30_SE.tagAlign.tmp"
+    params:
+        nreads= 15000000
+    run:
+        shell("""zcat {input} | shuf -n {params.nreads} --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {input} | wc -c) -nosalt </dev/zero 2>/dev/null) | gzip -nc > {output.subsample}""")
+        shell("zcat {input} > {output.tmp}")
+
+# Determine the exclusion range for fragment-length estimation.
+## From bamPEFragmentSize (deepTools) the average fragment length is ~220 bp
+## (see bamPEFragmentSize.histogram.png)
+# The lower bound (EXCLUSION_RANGE_MIN) is fixed at -500.
+# The upper bound (EXCLUSION_RANGE_MAX) for histone ChIP-seq is max(read_len + 10, 100).
+
+
+rule cross_correlation_SSP:
+    input:
+        "results/bed/{sample}_{stage}_{mark}.filt.sample.15Mreads.SE.tagAlign.gz"
+    output:
+        CC_SCORES_FILE="results/qc/{sample}_{stage}_{mark}_filt_15Mreads.SE.cc.qc",
+        CC_PLOT_FILE="results/qc/{sample}_{stage}_{mark}_filt_15Mreads.SE.cc.plot.pdf"
+    log:
+        "logs/{sample}_{stage}_{mark}_filt_15Mreads.SE.spp.log"
+    params:
+        EXCLUSION_RANGE_MIN=-500,
+        EXCLUSION_RANGE_MAX=85
+    shell:
+        "Rscript scripts/run_spp.R -c={input} -savp={output.CC_PLOT_FILE} -out={output.CC_SCORES_FILE} -x={params.EXCLUSION_RANGE_MIN}:{params.EXCLUSION_RANGE_MAX} 2> {log}"
+
+
+# CC_SCORE FILE format
+# Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab> PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag
+
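+## Illustrative helper (not called by any rule): pull the fragment-length estimate and
+## the NSC/RSC coefficients out of a *.cc.qc line in the format above.
+# def parse_cc_qc(path):
+#     with open(path) as fh:
+#         f = fh.readline().rstrip("\n").split("\t")
+#     return {"file": f[0],
+#             "numReads": int(f[1]),
+#             "estFragLen": f[2],      # may hold several comma-separated candidates
+#             "NSC": float(f[8]),      # phantomPeakCoef
+#             "RSC": float(f[9]),      # relPhantomPeakCoef
+#             "QualityTag": int(f[10])}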
+
+# ===============================================================================================
+#  7. Call peaks (MACS2)
+# ===============================================================================================
+
+rule call_peaks_macs2:
+    input:
+        control = "results/bwa/{control}_{stage}_{mark}_q30.sorted.dedup.bam",
+        case = "results/bwa/{case}_{stage}_{mark}_q30.sorted.dedup.bam"
+    output:
+        "results/macs2/{case}_vs_{control}_{stage}_{mark}_macs2_peaks.xls",
+        "results/macs2/{case}_vs_{control}_{stage}_{mark}_macs2_summits.bed",
+        "results/macs2/{case}_vs_{control}_{stage}_{mark}_macs2_peaks.narrowPeak",
+    log:
+        "logs/{case}_vs_{control}_{stage}_{mark}_call_peaks_macs2.log"
+    params:
+        name = "{case}_vs_{control}_{stage}_{mark}_macs2",
+    shell:
+        " macs2 callpeak -f BAM -t {input.case} \
+        -c {input.control} --keep-dup all \
+        --outdir results/macs2/ -p 0.01 \
+        -n {params.name} \
+        -g mm 2> {log} "
+
+rule call_peaks_macs2_pooled_replicates:
+    input:
+        E10 = "results/bwa/E10.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E11 = "results/bwa/E11.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E12 = "results/bwa/E12.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E13 = "results/bwa/E13.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E14 = "results/bwa/E14.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E15 = "results/bwa/E15.5_H3K4me3_q30.sorted.pooled.dedup.bam",
+        E10C = "results/bwa/input_E10.5_H3K4me3_q30.sorted.dedup.bam",
+        E11C = "results/bwa/input_E11.5_H3K4me3_q30.sorted.dedup.bam",
+        E12C = "results/bwa/input_E12.5_H3K4me3_q30.sorted.dedup.bam",
+        E13C = "results/bwa/input_E13.5_H3K4me3_q30.sorted.dedup.bam",
+        E14C = "results/bwa/input_E14.5_H3K4me3_q30.sorted.dedup.bam",
+        E15C = "results/bwa/input_E15.5_H3K4me3_q30.sorted.dedup.bam"
+    output:
+        directory("results/macs2/pooled/")
+    log:
+        E10 = "results/qc/E10.5_H3K4me3.pooled.macs2",
+        E11 = "results/qc/E11.5_H3K4me3.pooled.macs2",
+        E12 = "results/qc/E12.5_H3K4me3.pooled.macs2",
+        E13 = "results/qc/E13.5_H3K4me3.pooled.macs2",
+        E14 = "results/qc/E14.5_H3K4me3.pooled.macs2",
+        E15 = "results/qc/E15.5_H3K4me3.pooled.macs2"
+    params:
+        E10 = "E10.5_H3K4me3.pooled.macs2",
+        E11 = "E11.5_H3K4me3.pooled.macs2",
+        E12 = "E12.5_H3K4me3.pooled.macs2",
+        E13 = "E13.5_H3K4me3.pooled.macs2",
+        E14 = "E14.5_H3K4me3.pooled.macs2",
+        E15 = "E15.5_H3K4me3.pooled.macs2"
+    run:
+        shell("macs2 callpeak -f BAM -t {input.E10} \
+        -c {input.E10C} --keep-dup all \
+        --outdir results/macs2/ -p 0.01 \
+        -n {params.E10} \
+        -g mm 2> {log.E10}" )
+        shell("macs2 callpeak -f BAM -t {input.E11} \
+        -c {input.E11C} --keep-dup all \
+        --outdir results/macs2/ -p 0.01 \
+        -n {params.E11} \
+        -g mm 2> {log.E11}" )
+        shell("macs2 callpeak -f BAM -t {input.E12} \
+        -c {input.E12C} --keep-dup all \
+        --outdir results/macs2/ -p 0.01 \
+        -n {params.E12} \
+        -g mm 2> {log.E12}" )
+        shell("macs2 callpeak -f BAM -t {input.E13} \
+        -c {input.E13C} --keep-dup all \
+        --outdir results/macs2/ -p 0.01 \
+        -n {params.E13} \
+        -g mm 2> {log.E13}" )
+        shell("macs2 callpeak -f BAM -t {input.E14} \
+        -c {input.E14C} --keep-dup all \
+        --outdir results/macs2/ -p 0.01 \
+        -n {params.E14} \
+        -g mm 2> {log.E14}" )
+        shell("macs2 callpeak -f BAM -t {input.E15} \
+        -c {input.E15C} --keep-dup all \
+        --outdir results/macs2/ -p 0.01 \
+        -n {params.E15} \
+        -g mm 2> {log.E15}" )
+
+# ===============================================================================================
+#  8. Peak QC
+#   > narrow peak counts
+#   > fraction of reads in peaks (convert BAM to bed first then do intersect with peaks)
+# ===============================================================================================
+
+## Convert BAM to BED for calculating the FRiP QC metric (fraction of reads in peaks)
+rule bamToBed:
+    input:
+        "results/bwa/{case}_{stage}_{mark}_q30.sorted.dedup.bam"
+    output:
+        "results/bwa/{case}_{stage}_{mark}_q30.sorted.dedup.bed"
+    log:
+        "logs/{case}_{stage}_{mark}.bamToBed"
+    shell:
+        "samtools sort -n {input} | bedtools bamtobed -i - > {output}"
+
+## Fraction of reads in peaks
+rule frip:
+    input:
+        bed = "results/bwa/{case}_{stage}_{mark}_q30.sorted.dedup.bed",
+        peak = "results/macs2/{case}_vs_{control}_{stage}_{mark}_macs2_peaks.narrowPeak"
+    output:
+        "results/qc/{case}_vs_{control}_{stage}_{mark}.frip.txt"
+    shell:
+        "python2.7 scripts/encode_frip.py {input.bed} {input.peak} > {output}"
+
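+## scripts/encode_frip.py is not included in this patch; a minimal sketch of the FRiP
+## calculation it is assumed to perform (reads overlapping peaks / total reads), with
+## placeholder file names:
+#   total=$(wc -l < reads.q30.sorted.dedup.bed)
+#   in_peaks=$(bedtools intersect -u -a reads.q30.sorted.dedup.bed -b peaks.narrowPeak | wc -l)
+#   awk -v a=$in_peaks -v b=$total 'BEGIN{print a/b}' > sample.frip.txt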
+
+# ===============================================================================================
+#  9. Create consensus peaksets for replicates
+# ===============================================================================================
+
+# Based on ENCODE's `overlap_peaks.py`, which is recommended for histone marks.
+# Peaks must first be called on the pooled replicates for each stage.
+
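+## scripts/overlap_peaks.py is likewise not included here; a sketch of the naive-overlap
+## criterion such ENCODE-style scripts typically apply (an assumption, shown for one stage
+## with placeholder file names): keep pooled peaks that overlap each replicate by >=50%
+## of either peak's length.
+#   intersectBed -wo -a pooled.narrowPeak -b rep1.narrowPeak \
+#     | awk 'BEGIN{FS=OFS="\t"} {s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) print $0}' \
+#     | cut -f 1-10 | sort | uniq \
+#     | intersectBed -wo -a stdin -b rep2.narrowPeak \
+#     | awk 'BEGIN{FS=OFS="\t"} {s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) print $0}' \
+#     | cut -f 1-10 | sort | uniq > overlap.narrowPeak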
+rule overlap_peaks:
+    input:
+        E10= ["results/macs2/ENCFF124UYX_vs_ENCFF157KEH_E10.5_H3K4me3_macs2_peaks.narrowPeak", "results/macs2/ENCFF045IPK_vs_ENCFF825AVI_E10.5_H3K4me3_macs2_peaks.narrowPeak"],
+        E11= ["results/macs2/ENCFF760QYZ_vs_ENCFF184CUE_E11.5_H3K4me3_macs2_peaks.narrowPeak", "results/macs2/ENCFF717QDV_vs_ENCFF376FGM_E11.5_H3K4me3_macs2_peaks.narrowPeak"],
+        E12= ["results/macs2/ENCFF941QJZ_vs_ENCFF058AUT_E12.5_H3K4me3_macs2_peaks.narrowPeak", "results/macs2/ENCFF182ZPF_vs_ENCFF203JQV_E12.5_H3K4me3_macs2_peaks.narrowPeak"],
+        E13= ["results/macs2/ENCFF485UDC_vs_ENCFF117QRC_E13.5_H3K4me3_macs2_peaks.narrowPeak", "results/macs2/ENCFF124TAB_vs_ENCFF248PGK_E13.5_H3K4me3_macs2_peaks.narrowPeak"],
+        E14= ["results/macs2/ENCFF665QBJ_vs_ENCFF002HZV_E14.5_H3K4me3_macs2_peaks.narrowPeak", "results/macs2/ENCFF724DMU_vs_ENCFF784ORI_E14.5_H3K4me3_macs2_peaks.narrowPeak"],
+        E15= ["results/macs2/ENCFF401BKM_vs_ENCFF182XFG_E15.5_H3K4me3_macs2_peaks.narrowPeak", "results/macs2/ENCFF258KCR_vs_ENCFF727QTS_E15.5_H3K4me3_macs2_peaks.narrowPeak"],
+        E10_pooled="results/macs2/E10.5_H3K4me3.pooled.macs2_peaks.narrowPeak",
+        E11_pooled="results/macs2/E11.5_H3K4me3.pooled.macs2_peaks.narrowPeak",
+        E12_pooled="results/macs2/E12.5_H3K4me3.pooled.macs2_peaks.narrowPeak",
+        E13_pooled="results/macs2/E13.5_H3K4me3.pooled.macs2_peaks.narrowPeak",
+        E14_pooled="results/macs2/E14.5_H3K4me3.pooled.macs2_peaks.narrowPeak",
+        E15_pooled="results/macs2/E15.5_H3K4me3.pooled.macs2_peaks.narrowPeak",
+    output:
+        E10= "results/macs2/H3K4me3_E10.5_overlap.narrowPeak",
+        E11= "results/macs2/H3K4me3_E11.5_overlap.narrowPeak",
+        E12= "results/macs2/H3K4me3_E12.5_overlap.narrowPeak",
+        E13= "results/macs2/H3K4me3_E13.5_overlap.narrowPeak",
+        E14= "results/macs2/H3K4me3_E14.5_overlap.narrowPeak",
+        E15= "results/macs2/H3K4me3_E15.5_overlap.narrowPeak"
+    run:
+        shell("python2.7 scripts/overlap_peaks.py {input.E10} {input.E10_pooled} {output.E10}")
+        shell("python2.7 scripts/overlap_peaks.py {input.E11} {input.E11_pooled} {output.E11}")
+        shell("python2.7 scripts/overlap_peaks.py {input.E12} {input.E12_pooled} {output.E12}")
+        shell("python2.7 scripts/overlap_peaks.py {input.E13} {input.E13_pooled} {output.E13}")
+        shell("python2.7 scripts/overlap_peaks.py {input.E14} {input.E14_pooled} {output.E14}")
+        shell("python2.7 scripts/overlap_peaks.py {input.E15} {input.E15_pooled} {output.E15}")
+
+# ===============================================================================================
+#  10. Combine all QC into multiqc output
+# ===============================================================================================
+#
+# rule multiqc:
+#     input:
+        # fastqc
+    #     expand("results/fastqc/{sample}_R1_fastqc.html", sample=all_samples),
+    #     expand("results/fastqc/{sample}_R2_fastqc.html", sample=all_samples),
+    #     expand("results/fastqc/{sample}_R1_fastqc.zip", sample=all_samples),
+    #     expand("results/fastqc/{sample}_R2_fastqc.zip", sample=all_samples),
+    #     # bwa
+    #     expand("logs/{sample}.align", sample=all_samples),
+    #     expand("logs/{sample}.flagstat.qc", sample=all_samples),
+    #     # preseq
+    #     expand("results/preseq/{sample}.ccurve.txt", sample=all_samples),
+    #     # picard
+    #     expand("results/picard/{sample}_est_lib_complex_metrics.txt", sample=all_samples),
+    #     # deepTools
+    #     "results/deeptools/multibamsum.npz",
+    #     "results/deeptools/multibamsum.tab",
+    #     "results/deeptools/pearsoncor_multibamsum.png",
+    #     "results/deeptools/pearsoncor_multibamsum_matrix.txt",
+    #     expand("results/deeptools/{sample}.SeqDepthNorm.bw", sample=all_samples),
+    #     "results/deeptools/multiBAM_fingerprint.png",
+    #     "results/deeptools/multiBAM_fingerprint_metrics.txt",
+    #     "results/deeptools/multiBAM_fingerprint_rawcounts.txt",
+    #     "results/deeptools/plot_coverage.png",
+    #     "results/deeptools/plot_coverage_rawcounts.tab",
+    #     "results/deeptools/bamPEFragmentSize_hist.png",
+    #     "results/deeptools/bamPEFragmentSize_rawcounts.tab",
+    #     # phantomPeaks
+    #     expand("results/phantomPeaks/{sample}.spp.pdf", sample = IPS),
+    #     expand("results/phantomPeaks/{sample}.spp.Rdata", sample = IPS),
+    #     expand("results/phantomPeaks/{sample}.spp.out", sample = IPS),
+    #     # macs2
+    #     expand("results/macs2/{case}_vs_{control}_macs2_peaks.narrowPeak", zip, case=IPS, control=INPUTS),
+    #     expand("results/macs2/{case}_vs_{control}_macs2_peaks.xls", zip, case=IPS, control=INPUTS),
+    #     expand("results/macs2/{case}_vs_{control}_macs2_summits.bed", zip, case=IPS, control=INPUTS),
+    #     expand("results/macs2/{case}-vs-{control}-narrowpeak-count_mqc.json", zip, case=IPS, control=INPUTS),
+    #     expand("results/frip/{case}.frip.txt", case=IPS)
+    # params:
+    #     "results/fastqc/",
+    #     "results/bwa/",
+    #     "logs/",
+    #     "results/deeptools/",
+    #     "results/macs2/",
+    #     "results/phantomPeaks/",
+    #     "results/frip/",
+    #     "results/picard/"
+    # output:
+    #     directory("results/multiqc/multiqc_report_data/"),
+    #     "results/multiqc/multiqc_report.html",
+    # log:
+    #     "logs/multiqc.log"
+    # shell:
+    #     "multiqc -f {params} -m fastqc -m samtools -m picard -m preseq -m deeptools -m phantompeakqualtools -m macs2 -o results/multiqc/ \
+    #     -n multiqc_report.html 2> {log}"
-- 
GitLab