From 47ee15b9be95ffb2b2f32b260b6cbc5bbc52ea55 Mon Sep 17 00:00:00 2001
From: Laura Cook <l.cook2@student.unimelb.edu.au>
Date: Fri, 7 Aug 2020 11:56:52 +1000
Subject: [PATCH] added max and min fragment length for spp cross correlation

---
 dunnart/Snakefile | 192 ++++++++++++++++++++++------------------------
 1 file changed, 92 insertions(+), 100 deletions(-)

diff --git a/dunnart/Snakefile b/dunnart/Snakefile
index 41cccb9..0ae59b3 100644
--- a/dunnart/Snakefile
+++ b/dunnart/Snakefile
@@ -57,15 +57,19 @@ rule all:
         expand("results/bowtie2/{sample}_PPq30.sorted.dupmark.bam", sample=all_samples),
         expand("results/bowtie2/{sample}_PPq30.sorted.dedup.bam", sample=all_samples),
         expand("results/bowtie2/{sample}_PPq30.sorted.dedup.bai", sample=all_samples),
-        expand("results/qc/{sample}.sorted.flagstat.qc", sample=all_samples),
+        expand("results/qc/{sample}.unfiltered.flagstat.qc", sample=all_samples),
         expand("results/qc/{sample}.dedup.flagstat.qc", sample=all_samples),
         expand("results/qc/{sample}.dupmark.flagstat.qc", sample=all_samples),
         expand("results/qc/{sample}.PPq30.flagstat.qc", sample=all_samples),
         expand("results/qc/{sample}.ccurve.txt", sample=all_samples),
+        expand("results/qc/{sample}.extrap.txt", sample=all_samples),
+        expand("logs/{sample}.ccurve.preseq", sample=all_samples),
+        expand("logs/{sample}.extrap.preseq", sample=all_samples),
         expand("results/qc/{sample}_est_lib_complex_metrics.txt", sample=all_samples),
         expand("logs/{sample}.picardLibComplexity", sample=all_samples),
         expand("results/qc/{sample}.pbc.qc", sample=all_samples),
         expand("results/bowtie2/{sample}_PPq30.sorted.tmp.bam",sample=all_samples),
+        expand("results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.bam",sample=all_samples),
         "results/qc/multibamsum.npz",
         "results/qc/multibamsum.tab",
         "results/qc/pearsoncor_multibamsum.png",
@@ -76,21 +80,22 @@ rule all:
         "results/qc/multiBAM_fingerprint_rawcounts.txt",
         "results/qc/bamPEFragmentSize_hist.png",
         "results/qc/bamPEFragmentSize_rawcounts.tab",
-        expand("results/qc/{sample}_R1_trimmed.flagstat.qc", sample=all_samples),
         expand("results/bowtie2/{sample}_R1_trimmed_q30.bam", sample=all_samples),
-        expand("{sample}_filt_15Mreads.SE.cc.qc", sample=all_samples),
-        expand("{sample}_filt_15Mreads.SE.cc.plot.pdf", sample=all_samples)
-        # expand("results/macs2/{case}_vs_{control}_macs2_peaks.narrowPeak", zip, case=IPS, control=INPUTS),
-        # expand("results/macs2/{case}_vs_{control}_macs2_peaks.xls", zip, case=IPS, control=INPUTS),
-        # expand("results/macs2/{case}_vs_{control}_macs2_summits.bed", zip, case=IPS, control=INPUTS),
-        # expand("results/qxc{case}-vs-{control}-narrowpeak-count_mqc.json", zip, case=IPS, control=INPUTS),
-        # expand("results/bowtie2/{case}.bedpe", case=IPS),
-        # expand("logs/{case}.bamToBed", case=IPS),
-        # expand("results/qc/{case}_vs_{control}.frip.txt", case=IPS, control=INPUTS)
-        # "results/macs2/H3K4me3_pooled_macs2_peaks.narrowPeak",
-        # "results/macs2/H3K27ac_pooled_macs2_peaks.narrowPeak",
-        # "results/macs2/H3K4me3_overlap.narrowPeak",
-        # "results/macs2/H3K27ac_overlap.narrowPeak",
+        expand("logs/{sample}_filt_15Mreads.SE.spp.log", sample=all_samples),
+        # expand("results/qc/{sample}_readlength.txt", sample=all_samples),
+        expand("results/qc/{sample}_filt_15Mreads.SE.cc.qc", sample=all_samples),
+        expand("results/qc/{sample}_filt_15Mreads.SE.cc.plot.pdf", sample=all_samples),
+        expand("results/macs2/{case}_vs_{control}_macs2_peaks.narrowPeak", zip, case=IPS, control=INPUTS),
+        expand("results/macs2/{case}_vs_{control}_macs2_peaks.xls", zip, case=IPS, control=INPUTS),
+        expand("results/macs2/{case}_vs_{control}_macs2_summits.bed", zip, case=IPS, control=INPUTS),
+        expand("results/qc/{case}-vs-{control}-narrowpeak-count_mqc.json", zip, case=IPS, control=INPUTS),
+        expand("results/bowtie2/{case}.bedpe", case=IPS),
+        expand("logs/{case}.bamToBed", case=IPS),
+        expand("results/qc/{case}_vs_{control}.frip.txt", case=IPS, control=INPUTS),
+        "results/macs2/H3K4me3_pooled_macs2_peaks.narrowPeak",
+        "results/macs2/H3K27ac_pooled_macs2_peaks.narrowPeak",
+        "results/macs2/H3K4me3_overlap.narrowPeak",
+        "results/macs2/H3K27ac_overlap.narrowPeak",
         # directory("results/multiqc/multiqc_report_data/"),
         # "results/multiqc/multiqc_report.html"
 # ===============================================================================================
@@ -129,19 +134,18 @@ rule align:
         "bowtie2 --threads 8 -q -X 2000 --very-sensitive -x {params.index} -1 {input.R1} -2 {input.R2} \
         |  samtools view -u -h  - |  samtools sort -o {output}  - 2> {log}"
 
-
 # ===============================================================================================
 #  3. FILTERING
 #   > remove unmapped, mate unmapped
 #   > remove low MAPQ reads (-q 30 (ENCODE))
 #   > keep proper pairs (-f)
+#   > mark duplicates (picard)
 #   > -F 1804
 #           -Read unmapped
 #           -Mate unmapped
 #           -Not primary alignment
 #           -Read fails platform/vendor quality checks
 #           -Read is PCR or optical duplicate
-#   > mark duplicates & remove duplicates (picard)
 #   > index final position sorted BAM
 # ===============================================================================================
 
@@ -153,17 +157,7 @@ rule filter:
     log:
         "logs/{sample}.dedup"
     shell:
-        "samtools view -b -q 30 -F 1804 -f 2 {input} | samtools sort -o {output} - 2> {log}"
-
-rule indexUnfiltBam:
-    input:
-        "results/bowtie2/{sample}.sorted.bam"
-    output:
-        "results/bowtie2/{sample}.sorted.bai"
-    log:
-        "logs/{sample}.indexUnfiltBam"
-    shell:
-        "samtools index -c {input} {output} 2> {log}"
+        "samtools view -b -F 1804 -q 30 -f 2 {input} | samtools sort -o {output} - 2> {log}"
 
 rule markDups:
     input:
@@ -175,7 +169,7 @@ rule markDups:
         "logs/{sample}.dupmark"
     shell:
         "picard MarkDuplicates I={input} O={output.bam} \
-        METRICS_FILE={output.dupQC} REMOVE_DUPLICATES=false ASSUME_SORTED=true 2> {log}"
+        METRICS_FILE={output.dupQC} REMOVE_DUPLICATES=FALSE ASSUME_SORTED=true 2> {log}"
 
 rule dedup:
     input:
@@ -202,6 +196,7 @@ rule indexBam:
 #   > SAMtools flagstat statistics
 #   > Compute Library Complexity (PreSeq)
 #   > picard library complexity
+#   > ENCODE library complexity
 # ===============================================================================================
 
 rule mappingStats:
@@ -221,31 +216,52 @@ rule mappingStats:
         shell("samtools flagstat {input.c} > {output.c}")
         shell("samtools flagstat {input.d} > {output.d}")
 
-rule preseq:
+rule downsample_bam:
     input:
-        "results/bowtie2/{sample}.sorted.bam"
+        "results/bowtie2/{sample}_PPq30.sorted.dupmark.bam"
     output:
-        "results/qc/{sample}.ccurve.txt"
+        "results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.bam"
     log:
-        "logs/{sample}.preseq"
+        "logs/{sample}.downsample"
     shell:
-        "preseq lc_extrap -v -output {output} -pe -bam {input} 2> {log}"
+        "picard DownsampleSam I={input} O={output} P=0.35 \
+         2> {log}"
+
+# A-1_dupmark = 87165286
+# B-1_dupmark = 72303316
+# A-2_dupmark = 62678360
+# A-3_dupmark = 86503078
+# B-2_dupmark = 67048994
+# B-3_dupmark = 66615704
+
+rule preseq:
+    input:
+        "results/bowtie2/{sample}_PPq30.sorted.dupmark.bam"
+    output:
+        ccurve = "results/qc/{sample}.ccurve.txt",
+        extrap = "results/qc/{sample}.extrap.txt"
+    log:
+        ccurve = "logs/{sample}.ccurve.preseq",
+        extrap = "logs/{sample}.extrap.preseq"
+    run:
+        shell("preseq lc_extrap -v -output {output.extrap} -pe -bam {input} 2> {log.extrap}")
+        shell("preseq c_curve -v -output {output.ccurve} -pe -bam {input} 2> {log.ccurve}")
 
 rule get_picard_complexity_metrics:
     input:
-        "results/bowtie2/{sample}.sorted.bam"
+        "results/bowtie2/{sample}_PPq30.sorted.dupmark.bam"
     output:
         "results/qc/{sample}_est_lib_complex_metrics.txt"
     log:
-        "logs/{sample}.picardLibComplexity"
+        "logs/{sample}.downSampled.picardLibComplexity"
     shell:
         "picard -Xmx6G EstimateLibraryComplexity INPUT={input} OUTPUT={output} USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE VERBOSITY=ERROR"
 
 rule sort_name:
     input:
-        "results/bowtie2/{sample}_PPq30.sorted.dupmark.bam"
+        "results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.bam"
     output:
-        tmp = "results/bowtie2/{sample}_PPq30.sorted.dupmark.tmp.bam"
+        tmp = "results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.tmp.bam"
     log:
         "logs/{sample}.pbc.sort"
     run:
@@ -253,7 +269,7 @@ rule sort_name:
 
 rule estimate_lib_complexity:
     input:
-        "results/bowtie2/{sample}_PPq30.sorted.dupmark.tmp.bam"
+        "results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.tmp.bam"
     output:
         qc = "results/qc/{sample}.pbc.qc",
     log:
@@ -262,7 +278,7 @@ rule estimate_lib_complexity:
         """
         bedtools bamtobed -bedpe -i {input} \
         | awk 'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}' \
-        | grep -v 'chrM' | sort | uniq -c \
+        | sort | uniq -c \
         | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \
         {{m0=m0+1}} {{mt=mt+$1}} END{{printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n" ,mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \
         > {output.qc}
@@ -271,7 +287,7 @@ rule estimate_lib_complexity:
 ## convert to bedPE
 ## print read 1 scaffold, read 1 start coordinate, read 2 scaffold, read 2 end coordinate, strand read 1, strand read 2
 ## remove mitochondrial genome
-## sort by position and obtain uniq counts
+## sort by position and obtain uniq reads
 ## Format of file
 ## TotalReadPairs [tab] DistinctReadPairs [tab] OneReadPair [tab] TwoReadPairs [tab] NRF=Distinct/Total [tab] PBC1=OnePair/Distinct [tab] PBC2=OnePair/TwoPair
 
@@ -323,8 +339,8 @@ rule deeptools_correlation:
 
 rule deeptools_coverage:
     input:
-        bam ="results/bowtie2/{sample}.sorted.bam",
-        bai ="results/bowtie2/{sample}.sorted.bai"
+        bam ="results/bowtie2/{sample}_PPq30.sorted.dedup.bam",
+        bai ="results/bowtie2/{sample}_PPq30.sorted.dedup.bai"
     output:
         "results/qc/{sample}.SeqDepthNorm.bw"
     log:
@@ -411,12 +427,10 @@ rule filter_sort_trimmed_alignment:
     input:
         "results/bowtie2/{sample}_R1_trimmed.bam"
     output:
-        flagstat = "results/qc/{sample}_R1_trimmed.flagstat.qc",
         bam = "results/bowtie2/{sample}_R1_trimmed_q30.bam"
     log:
         "logs/{sample}_align_trimmed_read1_filter.log"
     run:
-        shell("samtools sort -n {input} -O SAM  | SAMstats  --sorted_sam_file - --outf {output.flagstat}")
         shell("samtools view -F 1804 -q 30 -b {input} -o {output.bam}")
 
 rule bamtobed_crossC:
@@ -442,9 +456,8 @@ rule subsample_aligned_reads:
     params:
         nreads= 15000000
     run:
-        shell("""zcat {input} | grep -v 'chrM' | shuf -n {params.nreads} --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {input} | wc -c) -nosalt </dev/zero 2>/dev/null) | gzip -nc > {output.subsample}""")
+        shell("""zcat {input} | shuf -n {params.nreads} --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {input} | wc -c) -nosalt </dev/zero 2>/dev/null) | gzip -nc > {output.subsample}""")
         shell("zcat {input} > {output.tmp}")
-        shell("""READ_LEN=\$(head -n 100 {output.tmp} | awk 'function abs(v) {{return v < 0 ? -v : v}} BEGIN{{sum=0}} {{sum+=abs($3-$2)}} END{{print int(sum/NR)}}')""")
 
 # Determine exclusion range for fragment length estimation.
 # Use a fixed lowerbound at -500.
@@ -457,20 +470,24 @@ rule cross_correlation_SSP:
     input:
         "results/bed/{sample}.filt.sample.15Mreads.SE.tagAlign.gz"
     output:
-        CC_SCORES_FILE="{sample}_filt_15Mreads.SE.cc.qc",
-        CC_PLOT_FILE="{sample}_filt_15Mreads.SE.cc.plot.pdf"
+        CC_SCORES_FILE="results/qc/{sample}_filt_15Mreads.SE.cc.qc",
+        CC_PLOT_FILE="results/qc/{sample}_filt_15Mreads.SE.cc.plot.pdf"
+    log:
+        "logs/{sample}_filt_15Mreads.SE.spp.log"
     params:
         EXCLUSION_RANGE_MIN=-500,
-        EXCLUSION_RANGE_MAX=110
-    run:
-        shell("Rscript scripts/run_spp.R -c={input} -filtchr=chrM -savp={output.CC_PLOT_FILE} -out={output.CC_SCORES_FILE} -x={params.EXCLUSION_RANGE_MIN}:{params.EXCLUSION_RANGE_MAX} 2> {log}")
+        EXCLUSION_RANGE_MAX=230
+    shell:
+        "Rscript scripts/run_spp.R -c={input} -savp={output.CC_PLOT_FILE} -out={output.CC_SCORES_FILE} -x={params.EXCLUSION_RANGE_MIN}:{params.EXCLUSION_RANGE_MAX} 2> {log}"
 
-# # CC_SCORE FILE format
-# # Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab> PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag
 #
-# # ===============================================================================================
-# #  7. Call peaks (MACS2)
-# # ===============================================================================================
+# CC_SCORE FILE format
+# Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab> PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag
+
+
+# ===============================================================================================
+#  7. Call peaks (MACS2)
+# ===============================================================================================
 
 # peak calling for pair-end peaks
 # effective genome size for dunnart genome calculated with khmer program (see README.md)
@@ -494,11 +511,11 @@ rule call_peaks_macs2:
         -n {params.name} \
         -g 2740338543 -B 2> {log} "
 
-# # # ===============================================================================================
-# # #  8. Peak QC
-# # #   > narrow peak counts
-# # #   > fraction of reads in peaks (convert BAM to bed first then do intersect with peaks)
-# # # ===============================================================================================
+# ===============================================================================================
+#  8. Peak QC
+#   > narrow peak counts
+#   > fraction of reads in peaks (convert BAM to bed first then do intersect with peaks)
+# ===============================================================================================
 
 # peak counts in a format that multiqc can handle
 rule get_narrow_peak_counts_for_multiqc:
@@ -537,9 +554,9 @@ rule frip:
         "python2.7 scripts/encode_frip.py {input.bed} {input.peak} > {output}"
 
 
-# # ===============================================================================================
-# #  9. Create consensus peaksets for replicates
-# # ===============================================================================================
+# ===============================================================================================
+#  9. Create consensus peaksets for replicates
+# ===============================================================================================
 
 # Based on ENCODE `overlap_peaks.py` - recommended for histone marks.
 # Need to pool peaks first for each replicate
@@ -577,48 +594,23 @@ rule overlap_peaks_H3K27ac:
     shell:
         "python2.7 scripts/overlap_peaks.py {input.peak1} {input.peak2} {input.pooled} {output}"
 
-#rule consensus_peaks:
-
-
-#rule enhancer_promoter_peaks:
-# enhancer = peaks with only H3K27ac
-# Promoter = peaks with both H3K27ac & H3K4me3, or just H3K4me3
-# Make a bargraph with the amounts for each
-
-
-# # ===============================================================================================
-# #  10. QC on replicate peaks
-# # ===============================================================================================
-#
-# # FRiP for overlapped peaks
-# rule frip:
-#     input:
-#         bed = "results/bowtie2/{case}.bedpe",
-#         peak = "results/macs2/{}_macs2_peaks.narrowPeak"
-#     output:
-#         "results/qc/{case}.frip.txt"
-#     shell:
-#         "python2.7 scripts/encode_frip.py {input.bed} {input.peak} > {output}"
-#
-#
-# # ===============================================================================================
-# #  11. Combine all QC into multiqc output
-# # ===============================================================================================
+# ===============================================================================================
+#  10. Combine all QC into multiqc output
+# ===============================================================================================
 #
 # rule multiqc:
 #     input:
 #         # fastqc
-#         expand("results/fastqc/{sample}_R1_fastqc.html", sample=all_samples),
-#         expand("results/fastqc/{sample}_R2_fastqc.html", sample=all_samples),
-#         expand("results/fastqc/{sample}_R1_fastqc.zip", sample=all_samples),
-#         expand("results/fastqc/{sample}_R2_fastqc.zip", sample=all_samples),
+#         expand("results/qc/{sample}_R1_fastqc.html", sample=all_samples),
+#         expand("results/qc/{sample}_R2_fastqc.html", sample=all_samples),
+#         expand("results/qc/{sample}_R1_fastqc.zip", sample=all_samples),
+#         expand("results/qc/{sample}_R2_fastqc.zip", sample=all_samples),
 #         # bowtie2
 #         expand("logs/{sample}.align", sample=all_samples),
-#         expand("logs/{sample}.flagstat.qc", sample=all_samples),
+#         expand("results/qc/{sample}.flagstat.qc", sample=all_samples),
 #         # preseq
-#         expand("results/preseq/{sample}.ccurve.txt", sample=all_samples),
-#         # picard
-#         expand("results/picard/{sample}_est_lib_complex_metrics.txt", sample=all_samples),
+#         expand("results/qc/{sample}.ccurve.txt", sample=all_samples),
+#         expand("results/qc/{sample}.extrap.txt", sample=all_samples),
 #         # deepTools
 #         "results/deeptools/multibamsum.npz",
 #         "results/deeptools/multibamsum.tab",
-- 
GitLab