From 47ee15b9be95ffb2b2f32b260b6cbc5bbc52ea55 Mon Sep 17 00:00:00 2001 From: Laura Cook <l.cook2@student.unimelb.edu.au> Date: Fri, 7 Aug 2020 11:56:52 +1000 Subject: [PATCH] added max and min fragment length for spp cross correlation --- dunnart/Snakefile | 192 ++++++++++++++++++++++------------------------ 1 file changed, 92 insertions(+), 100 deletions(-) diff --git a/dunnart/Snakefile b/dunnart/Snakefile index 41cccb9..0ae59b3 100644 --- a/dunnart/Snakefile +++ b/dunnart/Snakefile @@ -57,15 +57,19 @@ rule all: expand("results/bowtie2/{sample}_PPq30.sorted.dupmark.bam", sample=all_samples), expand("results/bowtie2/{sample}_PPq30.sorted.dedup.bam", sample=all_samples), expand("results/bowtie2/{sample}_PPq30.sorted.dedup.bai", sample=all_samples), - expand("results/qc/{sample}.sorted.flagstat.qc", sample=all_samples), + expand("results/qc/{sample}.unfiltered.flagstat.qc", sample=all_samples), expand("results/qc/{sample}.dedup.flagstat.qc", sample=all_samples), expand("results/qc/{sample}.dupmark.flagstat.qc", sample=all_samples), expand("results/qc/{sample}.PPq30.flagstat.qc", sample=all_samples), expand("results/qc/{sample}.ccurve.txt", sample=all_samples), + expand("results/qc/{sample}.extrap.txt", sample=all_samples), + expand("logs/{sample}.ccurve.preseq", sample=all_samples), + expand("logs/{sample}.extrap.preseq", sample=all_samples), expand("results/qc/{sample}_est_lib_complex_metrics.txt", sample=all_samples), expand("logs/{sample}.picardLibComplexity", sample=all_samples), expand("results/qc/{sample}.pbc.qc", sample=all_samples), expand("results/bowtie2/{sample}_PPq30.sorted.tmp.bam",sample=all_samples), + expand("results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.bam",sample=all_samples), "results/qc/multibamsum.npz", "results/qc/multibamsum.tab", "results/qc/pearsoncor_multibamsum.png", @@ -76,21 +80,22 @@ rule all: "results/qc/multiBAM_fingerprint_rawcounts.txt", "results/qc/bamPEFragmentSize_hist.png", "results/qc/bamPEFragmentSize_rawcounts.tab", - expand("results/qc/{sample}_R1_trimmed.flagstat.qc", sample=all_samples), expand("results/bowtie2/{sample}_R1_trimmed_q30.bam", sample=all_samples), - expand("{sample}_filt_15Mreads.SE.cc.qc", sample=all_samples), - expand("{sample}_filt_15Mreads.SE.cc.plot.pdf", sample=all_samples) - # expand("results/macs2/{case}_vs_{control}_macs2_peaks.narrowPeak", zip, case=IPS, control=INPUTS), - # expand("results/macs2/{case}_vs_{control}_macs2_peaks.xls", zip, case=IPS, control=INPUTS), - # expand("results/macs2/{case}_vs_{control}_macs2_summits.bed", zip, case=IPS, control=INPUTS), - # expand("results/qxc{case}-vs-{control}-narrowpeak-count_mqc.json", zip, case=IPS, control=INPUTS), - # expand("results/bowtie2/{case}.bedpe", case=IPS), - # expand("logs/{case}.bamToBed", case=IPS), - # expand("results/qc/{case}_vs_{control}.frip.txt", case=IPS, control=INPUTS) - # "results/macs2/H3K4me3_pooled_macs2_peaks.narrowPeak", - # "results/macs2/H3K27ac_pooled_macs2_peaks.narrowPeak", - # "results/macs2/H3K4me3_overlap.narrowPeak", - # "results/macs2/H3K27ac_overlap.narrowPeak", + expand("logs/{sample}_filt_15Mreads.SE.spp.log", sample=all_samples), + # expand("results/qc/{sample}_readlength.txt", sample=all_samples), + expand("results/qc/{sample}_filt_15Mreads.SE.cc.qc", sample=all_samples), + expand("results/qc/{sample}_filt_15Mreads.SE.cc.plot.pdf", sample=all_samples), + expand("results/macs2/{case}_vs_{control}_macs2_peaks.narrowPeak", zip, case=IPS, control=INPUTS), + expand("results/macs2/{case}_vs_{control}_macs2_peaks.xls", zip, case=IPS, control=INPUTS), + expand("results/macs2/{case}_vs_{control}_macs2_summits.bed", zip, case=IPS, control=INPUTS), + expand("results/qc/{case}-vs-{control}-narrowpeak-count_mqc.json", zip, case=IPS, control=INPUTS), + expand("results/bowtie2/{case}.bedpe", case=IPS), + expand("logs/{case}.bamToBed", case=IPS), + expand("results/qc/{case}_vs_{control}.frip.txt", case=IPS, control=INPUTS), + "results/macs2/H3K4me3_pooled_macs2_peaks.narrowPeak", + "results/macs2/H3K27ac_pooled_macs2_peaks.narrowPeak", + "results/macs2/H3K4me3_overlap.narrowPeak", + "results/macs2/H3K27ac_overlap.narrowPeak", # directory("results/multiqc/multiqc_report_data/"), # "results/multiqc/multiqc_report.html" # =============================================================================================== @@ -129,19 +134,18 @@ rule align: "bowtie2 --threads 8 -q -X 2000 --very-sensitive -x {params.index} -1 {input.R1} -2 {input.R2} \ | samtools view -u -h - | samtools sort -o {output} - 2> {log}" - # =============================================================================================== # 3. FILTERING # > remove unmapped, mate unmapped # > remove low MAPQ reads (-q 30 (ENCODE)) # > keep proper pairs (-f) +# > mark duplicates (picard) # > -F 1804 # -Read unmapped # -Mate unmapped # -Not primary alignment # -Read fails platform/vendor quality checks # -Read is PCR or optical duplicate -# > mark duplicates & remove duplicates (picard) # > index final position sorted BAM # =============================================================================================== @@ -153,17 +157,7 @@ rule filter: log: "logs/{sample}.dedup" shell: - "samtools view -b -q 30 -F 1804 -f 2 {input} | samtools sort -o {output} - 2> {log}" - -rule indexUnfiltBam: - input: - "results/bowtie2/{sample}.sorted.bam" - output: - "results/bowtie2/{sample}.sorted.bai" - log: - "logs/{sample}.indexUnfiltBam" - shell: - "samtools index -c {input} {output} 2> {log}" + "samtools view -b -F 1804 -q 30 -f 2 {input} | samtools sort -o {output} - 2> {log}" rule markDups: input: @@ -175,7 +169,7 @@ rule markDups: "logs/{sample}.dupmark" shell: "picard MarkDuplicates I={input} O={output.bam} \ - METRICS_FILE={output.dupQC} REMOVE_DUPLICATES=false ASSUME_SORTED=true 2> {log}" + METRICS_FILE={output.dupQC} REMOVE_DUPLICATES=FALSE ASSUME_SORTED=true 2> {log}" rule dedup: input: @@ -202,6 +196,7 @@ rule indexBam: # > SAMtools flagstat statistics # > Compute Library Complexity (PreSeq) # > picard library complexity +# > ENCODE library complexity # =============================================================================================== rule mappingStats: @@ -221,31 +216,52 @@ rule mappingStats: shell("samtools flagstat {input.c} > {output.c}") shell("samtools flagstat {input.d} > {output.d}") -rule preseq: +rule downsample_bam: input: - "results/bowtie2/{sample}.sorted.bam" + "results/bowtie2/{sample}_PPq30.sorted.dupmark.bam" output: - "results/qc/{sample}.ccurve.txt" + "results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.bam" log: - "logs/{sample}.preseq" + "logs/{sample}.downsample" shell: - "preseq lc_extrap -v -output {output} -pe -bam {input} 2> {log}" + "picard DownsampleSam I={input} O={output} P=0.35 \ + 2> {log}" + +# A-1_dupmark = 87165286 +# B-1_dupmark = 72303316 +# A-2_dupmark = 62678360 +# A-3_dupmark = 86503078 +# B-2_dupmark = 67048994 +# B-3_dupmark = 66615704 + +rule preseq: + input: + "results/bowtie2/{sample}_PPq30.sorted.dupmark.bam" + output: + ccurve = "results/qc/{sample}.ccurve.txt", + extrap = "results/qc/{sample}.extrap.txt" + log: + ccurve = "logs/{sample}.ccurve.preseq", + extrap = "logs/{sample}.extrap.preseq" + run: + shell("preseq lc_extrap -v -output {output.extrap} -pe -bam {input} 2> {log.extrap}") + shell("preseq c_curve -v -output {output.ccurve} -pe -bam {input} 2> {log.ccurve}") rule get_picard_complexity_metrics: input: - "results/bowtie2/{sample}.sorted.bam" + "results/bowtie2/{sample}_PPq30.sorted.dupmark.bam" output: "results/qc/{sample}_est_lib_complex_metrics.txt" log: - "logs/{sample}.picardLibComplexity" + "logs/{sample}.downSampled.picardLibComplexity" shell: "picard -Xmx6G EstimateLibraryComplexity INPUT={input} OUTPUT={output} USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE VERBOSITY=ERROR" rule sort_name: input: - "results/bowtie2/{sample}_PPq30.sorted.dupmark.bam" + "results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.bam" output: - tmp = "results/bowtie2/{sample}_PPq30.sorted.dupmark.tmp.bam" + tmp = "results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.tmp.bam" log: "logs/{sample}.pbc.sort" run: @@ -253,7 +269,7 @@ rule sort_name: rule estimate_lib_complexity: input: - "results/bowtie2/{sample}_PPq30.sorted.dupmark.tmp.bam" + "results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.tmp.bam" output: qc = "results/qc/{sample}.pbc.qc", log: @@ -262,7 +278,7 @@ rule estimate_lib_complexity: """ bedtools bamtobed -bedpe -i {input} \ | awk 'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}' \ - | grep -v 'chrM' | sort | uniq -c \ + | sort | uniq -c \ | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \ {{m0=m0+1}} {{mt=mt+$1}} END{{printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n" ,mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \ > {output.qc} @@ -271,7 +287,7 @@ rule estimate_lib_complexity: ## convert to bedPE ## print read 1 scaffold, read 1 start coordinate, read 2 scaffold, read 2 end coordinate, strand read 1, strand read 2 ## remove mitochondrial genome -## sort by position and obtain uniq counts +## sort by position and obtain uniq reads ## Format of file ## TotalReadPairs [tab] DistinctReadPairs [tab] OneReadPair [tab] TwoReadPairs [tab] NRF=Distinct/Total [tab] PBC1=OnePair/Distinct [tab] PBC2=OnePair/TwoPair @@ -323,8 +339,8 @@ rule deeptools_correlation: rule deeptools_coverage: input: - bam ="results/bowtie2/{sample}.sorted.bam", - bai ="results/bowtie2/{sample}.sorted.bai" + bam ="results/bowtie2/{sample}_PPq30.sorted.dedup.bam", + bai ="results/bowtie2/{sample}_PPq30.sorted.dedup.bai" output: "results/qc/{sample}.SeqDepthNorm.bw" log: @@ -411,12 +427,10 @@ rule filter_sort_trimmed_alignment: input: "results/bowtie2/{sample}_R1_trimmed.bam" output: - flagstat = "results/qc/{sample}_R1_trimmed.flagstat.qc", bam = "results/bowtie2/{sample}_R1_trimmed_q30.bam" log: "logs/{sample}_align_trimmed_read1_filter.log" run: - shell("samtools sort -n {input} -O SAM | SAMstats --sorted_sam_file - --outf {output.flagstat}") shell("samtools view -F 1804 -q 30 -b {input} -o {output.bam}") rule bamtobed_crossC: @@ -442,9 +456,8 @@ rule subsample_aligned_reads: params: nreads= 15000000 run: - shell("""zcat {input} | grep -v 'chrM' | shuf -n {params.nreads} --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {input} | wc -c) -nosalt </dev/zero 2>/dev/null) | gzip -nc > {output.subsample}""") + shell("""zcat {input} | shuf -n {params.nreads} --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {input} | wc -c) -nosalt </dev/zero 2>/dev/null) | gzip -nc > {output.subsample}""") shell("zcat {input} > {output.tmp}") - shell("""READ_LEN=\$(head -n 100 {output.tmp} | awk 'function abs(v) {{return v < 0 ? -v : v}} BEGIN{{sum=0}} {{sum+=abs($3-$2)}} END{{print int(sum/NR)}}')""") # Determine exclusion range for fragment length estimation. # Use a fixed lowerbound at -500. @@ -457,20 +470,24 @@ rule cross_correlation_SSP: input: "results/bed/{sample}.filt.sample.15Mreads.SE.tagAlign.gz" output: - CC_SCORES_FILE="{sample}_filt_15Mreads.SE.cc.qc", - CC_PLOT_FILE="{sample}_filt_15Mreads.SE.cc.plot.pdf" + CC_SCORES_FILE="results/qc/{sample}_filt_15Mreads.SE.cc.qc", + CC_PLOT_FILE="results/qc/{sample}_filt_15Mreads.SE.cc.plot.pdf" + log: + "logs/{sample}_filt_15Mreads.SE.spp.log" params: EXCLUSION_RANGE_MIN=-500, - EXCLUSION_RANGE_MAX=110 - run: - shell("Rscript scripts/run_spp.R -c={input} -filtchr=chrM -savp={output.CC_PLOT_FILE} -out={output.CC_SCORES_FILE} -x={params.EXCLUSION_RANGE_MIN}:{params.EXCLUSION_RANGE_MAX} 2> {log}") + EXCLUSION_RANGE_MAX=230 + shell: + "Rscript scripts/run_spp.R -c={input} -savp={output.CC_PLOT_FILE} -out={output.CC_SCORES_FILE} -x={params.EXCLUSION_RANGE_MIN}:{params.EXCLUSION_RANGE_MAX} 2> {log}" -# # CC_SCORE FILE format -# # Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab> PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag # -# # =============================================================================================== -# # 7. Call peaks (MACS2) -# # =============================================================================================== +# CC_SCORE FILE format +# Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab> PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag + + +# =============================================================================================== +# 7. Call peaks (MACS2) +# =============================================================================================== # peak calling for pair-end peaks # effective genome size for dunnart genome calculated with khmer program (see README.md) @@ -494,11 +511,11 @@ rule call_peaks_macs2: -n {params.name} \ -g 2740338543 -B 2> {log} " -# # # =============================================================================================== -# # # 8. Peak QC -# # # > narrow peak counts -# # # > fraction of reads in peaks (convert BAM to bed first then do intersect with peaks) -# # # =============================================================================================== +# =============================================================================================== +# 8. Peak QC +# > narrow peak counts +# > fraction of reads in peaks (convert BAM to bed first then do intersect with peaks) +# =============================================================================================== # peak counts in a format that multiqc can handle rule get_narrow_peak_counts_for_multiqc: @@ -537,9 +554,9 @@ rule frip: "python2.7 scripts/encode_frip.py {input.bed} {input.peak} > {output}" -# # =============================================================================================== -# # 9. Create consensus peaksets for replicates -# # =============================================================================================== +# =============================================================================================== +# 9. Create consensus peaksets for replicates +# =============================================================================================== # Based on ENCODE `overlap_peaks.py` - recommended for histone marks. # Need to pool peaks first for each replicate @@ -577,48 +594,23 @@ rule overlap_peaks_H3K27ac: shell: "python2.7 scripts/overlap_peaks.py {input.peak1} {input.peak2} {input.pooled} {output}" -#rule consensus_peaks: - - -#rule enhancer_promoter_peaks: -# enhancer = peaks with only H3K27ac -# Promoter = peaks with both H3K27ac & H3K4me3, or just H3K4me3 -# Make a bargraph with the amounts for each - - -# # =============================================================================================== -# # 10. QC on replicate peaks -# # =============================================================================================== -# -# # FRiP for overlapped peaks -# rule frip: -# input: -# bed = "results/bowtie2/{case}.bedpe", -# peak = "results/macs2/{}_macs2_peaks.narrowPeak" -# output: -# "results/qc/{case}.frip.txt" -# shell: -# "python2.7 scripts/encode_frip.py {input.bed} {input.peak} > {output}" -# -# -# # =============================================================================================== -# # 11. Combine all QC into multiqc output -# # =============================================================================================== +# =============================================================================================== +# 10. Combine all QC into multiqc output +# =============================================================================================== # # rule multiqc: # input: # # fastqc -# expand("results/fastqc/{sample}_R1_fastqc.html", sample=all_samples), -# expand("results/fastqc/{sample}_R2_fastqc.html", sample=all_samples), -# expand("results/fastqc/{sample}_R1_fastqc.zip", sample=all_samples), -# expand("results/fastqc/{sample}_R2_fastqc.zip", sample=all_samples), +# expand("results/qc/{sample}_R1_fastqc.html", sample=all_samples), +# expand("results/qc/{sample}_R2_fastqc.html", sample=all_samples), +# expand("results/qc/{sample}_R1_fastqc.zip", sample=all_samples), +# expand("results/qc/{sample}_R2_fastqc.zip", sample=all_samples), # # bowtie2 # expand("logs/{sample}.align", sample=all_samples), -# expand("logs/{sample}.flagstat.qc", sample=all_samples), +# expand("results/qc/{sample}.flagstat.qc", sample=all_samples), # # preseq -# expand("results/preseq/{sample}.ccurve.txt", sample=all_samples), -# # picard -# expand("results/picard/{sample}_est_lib_complex_metrics.txt", sample=all_samples), +# expand("results/qc/{sample}.ccurve.txt", sample=all_samples), +# expand("results/qc/{sample}.extrap.txt", sample=all_samples), # # deepTools # "results/deeptools/multibamsum.npz", # "results/deeptools/multibamsum.tab", -- GitLab