diff --git a/mouse/Snakefile b/mouse/Snakefile index bd9ffae1f457beab449ef210f490435af0c15526..e52ecf1b306e9db988b71e7f6a867d5394231190 100644 --- a/mouse/Snakefile +++ b/mouse/Snakefile @@ -146,235 +146,290 @@ rule all: # > re-filter, sort and index final BAM # =============================================================================================== -# rule filter: -# input: -# "results/bwa/{sample}_{stage}_{mark}.bam" -# output: -# "results/bwa/{sample}_{stage}_{mark}_q30.sorted.bam" -# log: -# "logs/{sample}_{stage}_{mark}.filter" -# shell: -# "samtools view -b -F 1804 -q 30 {input} | samtools sort -o {output} - 2> {log}" -# -# rule markDups: -# input: -# "results/bwa/{sample}_{stage}_{mark}_q30.sorted.bam" -# output: -# bam="results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam", -# dupQC="results/bwa/{sample}_{stage}_{mark}.dupmark.qc" -# log: -# "logs/{sample}_{stage}_{mark}.dedup" -# shell: -# "picard MarkDuplicates I={input} O={output.bam} \ -# METRICS_FILE={output.dupQC} REMOVE_DUPLICATES=false ASSUME_SORTED=true 2> {log}" -# -# rule dedup: -# input: -# "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam" -# output: -# "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam" -# log: -# "logs/{sample}_{stage}_{mark}.dedup" -# shell: -# "samtools view -F 1804 -b {input} | samtools sort -o {output} - 2> {log}" -# -# rule indexBam: -# input: -# "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam" -# output: -# "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai" -# log: -# "logs/{sample}_{stage}_{mark}.indexBam" -# shell: -# "samtools index {input} {output} 2> {log}" -# -# # =============================================================================================== -# # 4. ALIGNMENT QC -# # > SAMtools flagstat statistics -# # > CollectMultipleMetrics (picard) -# # > Compute Library Complexity (PreSeq) -# # =============================================================================================== -# -# rule mappingStats: -# input: -# a="results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam", -# b="results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam", -# c="results/bwa/{sample}_{stage}_{mark}_q30.sorted.bam", -# d="results/bwa/{sample}_{stage}_{mark}.bam" -# output: -# a="results/qc/{sample}_{stage}_{mark}.dedup.flagstat.qc", -# b="results/qc/{sample}_{stage}_{mark}.dupmark.flagstat.qc", -# c="results/qc/{sample}_{stage}_{mark}.q30.flagstat.qc", -# d="results/qc/{sample}_{stage}_{mark}.unfiltered.flagstat.qc", -# run: -# shell("samtools flagstat {input.a} > {output.a}") -# shell("samtools flagstat {input.b} > {output.b}") -# shell("samtools flagstat {input.c} > {output.c}") -# shell("samtools flagstat {input.d} > {output.d}") -# -# rule estimate_lib_complexity: -# input: -# "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam" -# output: -# qc = "results/qc/{sample}_{stage}_{mark}.pbc.qc", -# log: -# "logs/{sample}_{stage}_{mark}.pbc" -# shell: -# """ -# bedtools bamtobed -bedpe -i {input} \ -# | awk 'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}' \ -# | sort | uniq -c \ -# | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \ -# {{m0=m0+1}} {{mt=mt+$1}} END{{printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n" ,mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \ -# > {output.qc} -# """ -# -# # =============================================================================================== -# # 5. deepTools -# # > multiBAMsummary -# # > plotCorrelation -# # > plotFingerprint -# # > bamCoverage (read depth normalised bigWig files) -# # =============================================================================================== -# -# rule deeptools_summary: -# input: -# expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam"], zip, sample=all_samples, stage=STAGE, mark = MARK) -# output: -# sum="results/deeptools/multibamsum.npz", -# counts="results/deeptools/multibamsum.tab" -# threads: 32 -# log: -# "logs/multisummary.deepTools" -# shell: -# " multiBamSummary bins \ -# -p {threads} \ -# -b {input} \ -# --centerReads \ -# -out {output.sum} \ -# --outRawCounts {output.counts} 2> {log}" -# -# rule deeptools_correlation: -# input: "results/deeptools/multibamsum.npz" -# output: -# fig="results/deeptools/pearsoncor_multibamsum.png", -# matrix="results/deeptools/pearsoncor_multibamsum_matrix.txt" -# log: -# "logs/correlation.deepTools" -# shell: -# "plotCorrelation \ -# --corData {input} \ -# --plotFile {output.fig} \ -# --outFileCorMatrix {output.matrix} \ -# --corMethod pearson \ -# --whatToPlot heatmap \ -# --skipZeros \ -# --plotNumbers \ -# --colorMap RdYlBu 2> {log}" -# -# rule deeptools_coverage: -# input: -# bam = "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam", -# bai = "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai" -# output: -# "results/deeptools/{sample}_{stage}_{mark}.SeqDepthNorm.bw" -# log: -# "logs/{sample}_{stage}_{mark}_coverage.deepTools" -# shell: -# "bamCoverage --bam {input.bam} --binSize 10 --normalizeUsing RPGC --effectiveGenomeSize 2308125349 --numberOfProcessors 4 -o {output} 2> {log}" -# -# rule deeptools_fingerprint: -# input: -# bam = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam"], zip, sample=all_samples, stage=STAGE, mark = MARK), -# bai = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai"], zip, sample=all_samples, stage=STAGE, mark = MARK) -# output: -# fig="results/deeptools/multiBAM_fingerprint.png", -# metrics="results/deeptools/multiBAM_fingerprint_metrics.txt", -# rawcounts="results/deeptools/multiBAM_fingerprint_rawcounts.txt" -# threads: 32 -# log: -# "logs/fingerprint.deepTools" -# shell: -# "plotFingerprint -p {threads} \ -# -b {input.bam} \ -# --plotFile {output.fig} \ -# --outQualityMetrics {output.metrics} \ -# --outRawCounts {output.rawcounts} \ -# --minMappingQuality 20 \ -# --skipZeros \ -# --centerReads 2> {log}" -# -# rule deeptools_plotCoverage: -# input: -# bam = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam"], zip, sample=all_samples, stage=STAGE, mark=MARK), -# bai = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai"], zip, sample=all_samples, stage=STAGE, mark=MARK) -# output: -# fig="results/deeptools/plot_coverage.png", -# rawcounts="results/deeptools/plot_coverage_rawcounts.tab" -# log: -# "logs/plotCoverage.deepTools" -# shell: -# "plotCoverage --bamfiles {input.bam} --plotFile {output.fig} \ -# -n 1000000 \ -# --outRawCounts {output.rawcounts} \ -# --ignoreDuplicates 2> {log}" -# -# -# # =============================================================================================== -# # 6. Cross-correlation scores - following ENCODE code -# # =============================================================================================== -# -# rule bamtobed_crossC: -# input: -# "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam" -# output: -# tagAlign = "results/bed/{sample}_{stage}_{mark}_q30_SE.tagAlign.gz" -# shell: -# """ -# bedtools bamtobed -i {input} | \ -# awk 'BEGIN{{OFS="\\t"}}{{$4='N';$5='1000';print $0}}' |\ -# gzip -nc > {output.tagAlign} -# """ -# -# ## Subsample 15 million reads for cross-correlation analysis -# ## Estimate read length from first 100 reads -# rule subsample_aligned_reads: -# input: -# "results/bed/{sample}_{stage}_{mark}_q30_SE.tagAlign.gz" -# output: -# subsample = "results/bed/{sample}_{stage}_{mark}.filt.sample.15Mreads.SE.tagAlign.gz", -# tmp = "results/bed/{sample}_{stage}_{mark}_R1_trimmed_q30_SE.tagAlign.tmp" -# params: -# nreads= 15000000 -# run: -# shell("""zcat {input} | shuf -n {params.nreads} --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {input} | wc -c) -nosalt </dev/zero 2>/dev/null) | gzip -nc > {output.subsample}""") -# shell("zcat {input} > {output.tmp}") -# -# # Determine exclusion range for fragment length estimation. -# ## From bamPEFragmentSize (deepTools) average fragment length is ~220bp -# ## See bamPEFragmentSize.histogram.png -# # Use a fixed lowerbound at -500. -# # Upperbound E -# #EXCLUSION_RANGE_MAX is -# # Histone ChIP-seq: max(read_len + 10, 100) -# # lowerbound is fixed at 500 for both -# -# -# rule cross_correlation_SSP: -# input: -# "results/bed/{sample}_{stage}_{mark}.filt.sample.15Mreads.SE.tagAlign.gz" -# output: -# CC_SCORES_FILE="results/qc/{sample}_{stage}_{mark}_filt_15Mreads.SE.cc.qc", -# CC_PLOT_FILE="results/qc/{sample}_{stage}_{mark}_filt_15Mreads.SE.cc.plot.pdf" -# log: -# "logs/{sample}_{stage}_{mark}_filt_15Mreads.SE.spp.log" -# params: -# EXCLUSION_RANGE_MIN=-500, -# EXCLUSION_RANGE_MAX=85 -# shell: -# "Rscript scripts/run_spp.R -c={input} -savp={output.CC_PLOT_FILE} -out={output.CC_SCORES_FILE} -x={params.EXCLUSION_RANGE_MIN}:{params.EXCLUSION_RANGE_MAX} 2> {log}" +rule filter: + input: + "results/bwa/{sample}_{stage}_{mark}.bam" + output: + "results/bwa/{sample}_{stage}_{mark}_q30.sorted.bam" + log: + "logs/{sample}_{stage}_{mark}.filter" + shell: + "samtools view -b -F 1804 -q 30 {input} | samtools sort -o {output} - 2> {log}" + +rule markDups: + input: + "results/bwa/{sample}_{stage}_{mark}_q30.sorted.bam" + output: + bam="results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam", + dupQC="results/bwa/{sample}_{stage}_{mark}.dupmark.qc" + log: + "logs/{sample}_{stage}_{mark}.dedup" + shell: + "picard MarkDuplicates I={input} O={output.bam} \ + METRICS_FILE={output.dupQC} REMOVE_DUPLICATES=false ASSUME_SORTED=true 2> {log}" + +rule dedup: + input: + "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam" + output: + "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam" + log: + "logs/{sample}_{stage}_{mark}.dedup" + shell: + "samtools view -F 1804 -b {input} | samtools sort -o {output} - 2> {log}" + +rule indexBam: + input: + "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam" + output: + "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai" + log: + "logs/{sample}_{stage}_{mark}.indexBam" + shell: + "samtools index {input} {output} 2> {log}" + +rule mergeBAMreplicates: + input: + E10 = ["results/bwa/ENCFF213EBC_E10.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF548BRR_E10.5_H3K27ac_q30.sorted.dedup.bam"], + E11 = ["results/bwa/ENCFF512SFE_E11.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF515PKL_E11.5_H3K27ac_q30.sorted.dedup.bam"], + E12 = ["results/bwa/ENCFF394TZN_E12.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF011NFM_E12.5_H3K27ac_q30.sorted.dedup.bam"], + E13 = ["results/bwa/ENCFF194ORC_E13.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF290ZNF_E13.5_H3K27ac_q30.sorted.dedup.bam"], + E14 = ["results/bwa/ENCFF327VAO_E14.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF902HAR_E14.5_H3K27ac_q30.sorted.dedup.bam"], + E15 = ["results/bwa/ENCFF584JFB_E15.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF707WKL_E15.5_H3K27ac_q30.sorted.dedup.bam"], + E10C = ["results/bwa/ENCFF157KEH_E10.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF825AVI_E10.5_H3K27ac_q30.sorted.dedup.bam"], + E11C = ["results/bwa/ENCFF184CUE_E11.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF376FGM_E11.5_H3K27ac_q30.sorted.dedup.bam"], + E12C = ["results/bwa/ENCFF203JQV_E12.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF058AUT_E12.5_H3K27ac_q30.sorted.dedup.bam"], + E13C = ["results/bwa/ENCFF117QRC_E13.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF248PGK_E13.5_H3K27ac_q30.sorted.dedup.bam"], + E14C = ["results/bwa/ENCFF784ORI_E14.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF002HZV_E14.5_H3K27ac_q30.sorted.dedup.bam"], + E15C = ["results/bwa/ENCFF727QTS_E15.5_H3K27ac_q30.sorted.dedup.bam", "results/bwa/ENCFF182XFG_E15.5_H3K27ac_q30.sorted.dedup.bam"], + output: + E10 = "results/bwa/E10.5_H3K27ac_q30.sorted.pooled.dedup.bam", + E11 = "results/bwa/E11.5_H3K27ac_q30.sorted.pooled.dedup.bam", + E12 = "results/bwa/E12.5_H3K27ac_q30.sorted.pooled.dedup.bam", + E13 = "results/bwa/E13.5_H3K27ac_q30.sorted.pooled.dedup.bam", + E14 = "results/bwa/E14.5_H3K27ac_q30.sorted.pooled.dedup.bam", + E15 = "results/bwa/E15.5_H3K27ac_q30.sorted.pooled.dedup.bam", + E10C = "results/bwa/input_E10.5_H3K27ac_q30.sorted.dedup.bam", + E11C = "results/bwa/input_E11.5_H3K27ac_q30.sorted.dedup.bam", + E12C = "results/bwa/input_E12.5_H3K27ac_q30.sorted.dedup.bam", + E13C = "results/bwa/input_E13.5_H3K27ac_q30.sorted.dedup.bam", + E14C = "results/bwa/input_E14.5_H3K27ac_q30.sorted.dedup.bam", + E15C = "results/bwa/input_E15.5_H3K27ac_q30.sorted.dedup.bam" + log: + E10 = "logs/E10.5_H3K27ac.mergeBAM", + E11 = "logs/E11.5_H3K27ac.mergeBAM", + E12 = "logs/E12.5_H3K27ac.mergeBAM", + E13 = "logs/E13.5_H3K27ac.mergeBAM", + E14 = "logs/E14.5_H3K27ac.mergeBAM", + E15 = "logs/E15.5_H3K27ac.mergeBAM", + E10C = "logs/input_E10.5_H3K27ac.mergeBAM", + E11C = "logs/input_E11.5_H3K27ac.mergeBAM", + E12C = "logs/input_E12.5_H3K27ac.mergeBAM", + E13C = "logs/input_E13.5_H3K27ac.mergeBAM", + E14C = "logs/input_E14.5_H3K27ac.mergeBAM", + E15C = "logs/input_E15.5_H3K27ac.mergeBAM" + run: + shell("samtools merge {output.E10} {input.E10} 2> {log.E10}") + shell("samtools merge {output.E11} {input.E11} 2> {log.E11}") + shell("samtools merge {output.E12} {input.E12} 2> {log.E12}") + shell("samtools merge {output.E13} {input.E13} 2> {log.E13}") + shell("samtools merge {output.E14} {input.E14} 2> {log.E14}") + shell("samtools merge {output.E15} {input.E15} 2> {log.E15}") + shell("samtools merge {output.E10C} {input.E10C} 2> {log.E10C}") + shell("samtools merge {output.E11C} {input.E11C} 2> {log.E11C}") + shell("samtools merge {output.E12C} {input.E12C} 2> {log.E12C}") + shell("samtools merge {output.E13C} {input.E13C} 2> {log.E13C}") + shell("samtools merge {output.E14C} {input.E14C} 2> {log.E14C}") + shell("samtools merge {output.E15C} {input.E15C} 2> {log.E15C}") + + +# =============================================================================================== +# 4. ALIGNMENT QC +# > SAMtools flagstat statistics +# > CollectMultipleMetrics (picard) +# > Compute Library Complexity (PreSeq) +# =============================================================================================== + +rule mappingStats: + input: + a="results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam", + b="results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam", + c="results/bwa/{sample}_{stage}_{mark}_q30.sorted.bam", + d="results/bwa/{sample}_{stage}_{mark}.bam" + output: + a="results/qc/{sample}_{stage}_{mark}.dedup.flagstat.qc", + b="results/qc/{sample}_{stage}_{mark}.dupmark.flagstat.qc", + c="results/qc/{sample}_{stage}_{mark}.q30.flagstat.qc", + d="results/qc/{sample}_{stage}_{mark}.unfiltered.flagstat.qc", + run: + shell("samtools flagstat {input.a} > {output.a}") + shell("samtools flagstat {input.b} > {output.b}") + shell("samtools flagstat {input.c} > {output.c}") + shell("samtools flagstat {input.d} > {output.d}") + +rule estimate_lib_complexity: + input: + "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam" + output: + qc = "results/qc/{sample}_{stage}_{mark}.pbc.qc", + log: + "logs/{sample}_{stage}_{mark}.pbc" + shell: + """ + bedtools bamtobed -bedpe -i {input} \ + | awk 'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}' \ + | sort | uniq -c \ + | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \ + {{m0=m0+1}} {{mt=mt+$1}} END{{printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n" ,mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \ + > {output.qc} + """ + +# =============================================================================================== +# 5. deepTools +# > multiBAMsummary +# > plotCorrelation +# > plotFingerprint +# > bamCoverage (read depth normalised bigWig files) +# =============================================================================================== + +rule deeptools_summary: + input: + expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam"], zip, sample=all_samples, stage=STAGE, mark = MARK) + output: + sum="results/qc/multibamsum.npz", + counts="results/qc/multibamsum.tab" + threads: 32 + log: + "logs/multisummary.deepTools" + shell: + " multiBamSummary bins \ + -p {threads} \ + -b {input} \ + --centerReads \ + -out {output.sum} \ + --outRawCounts {output.counts} 2> {log}" + +rule deeptools_correlation: + input: "results/qc/multibamsum.npz" + output: + fig="results/qc/pearsoncor_multibamsum.png", + matrix="results/qc/pearsoncor_multibamsum_matrix.txt" + log: + "logs/correlation.deepTools" + shell: + "plotCorrelation \ + --corData {input} \ + --plotFile {output.fig} \ + --outFileCorMatrix {output.matrix} \ + --corMethod pearson \ + --whatToPlot heatmap \ + --skipZeros \ + --plotNumbers \ + --colorMap RdYlBu 2> {log}" + +rule deeptools_coverage: + input: + bam = "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam", + bai = "results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai" + output: + "results/qc/{sample}_{stage}_{mark}.SeqDepthNorm.bw" + log: + "logs/{sample}_{stage}_{mark}_coverage.deepTools" + shell: + "bamCoverage --bam {input.bam} --binSize 10 --normalizeUsing RPGC --effectiveGenomeSize 2308125349 --numberOfProcessors 4 -o {output} 2> {log}" + +rule deeptools_fingerprint: + input: + bam = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam"], zip, sample=all_samples, stage=STAGE, mark = MARK), + bai = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai"], zip, sample=all_samples, stage=STAGE, mark = MARK) + output: + fig="results/qc/multiBAM_fingerprint.png", + metrics="results/qc/multiBAM_fingerprint_metrics.txt", + rawcounts="results/qc/multiBAM_fingerprint_rawcounts.txt" + threads: 32 + log: + "logs/fingerprint.deepTools" + shell: + "plotFingerprint -p {threads} \ + -b {input.bam} \ + --plotFile {output.fig} \ + --outQualityMetrics {output.metrics} \ + --outRawCounts {output.rawcounts} \ + --minMappingQuality 20 \ + --skipZeros \ + --centerReads 2> {log}" + +rule deeptools_plotCoverage: + input: + bam = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bam"], zip, sample=all_samples, stage=STAGE, mark=MARK), + bai = expand(["results/bwa/{sample}_{stage}_{mark}_q30.sorted.dedup.bai"], zip, sample=all_samples, stage=STAGE, mark=MARK) + output: + fig="results/qc/plot_coverage.png", + rawcounts="results/qc/plot_coverage_rawcounts.tab" + log: + "logs/plotCoverage.deepTools" + shell: + "plotCoverage --bamfiles {input.bam} --plotFile {output.fig} \ + -n 1000000 \ + --outRawCounts {output.rawcounts} \ + --ignoreDuplicates 2> {log}" + + +# =============================================================================================== +# 6. Cross-correlation scores - following ENCODE code +# =============================================================================================== + +rule bamtobed_crossC: + input: + "results/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam" + output: + tagAlign = "results/bed/{sample}_{stage}_{mark}_q30_SE.tagAlign.gz" + shell: + """ + bedtools bamtobed -i {input} | \ + awk 'BEGIN{{OFS="\\t"}}{{$4='N';$5='1000';print $0}}' |\ + gzip -nc > {output.tagAlign} + """ + +## Subsample 15 million reads for cross-correlation analysis +## Estimate read length from first 100 reads +rule subsample_aligned_reads: + input: + "results/bed/{sample}_{stage}_{mark}_q30_SE.tagAlign.gz" + output: + subsample = "results/bed/{sample}_{stage}_{mark}.filt.sample.15Mreads.SE.tagAlign.gz", + tmp = "results/bed/{sample}_{stage}_{mark}_R1_trimmed_q30_SE.tagAlign.tmp" + params: + nreads= 15000000 + run: + shell("""zcat {input} | shuf -n {params.nreads} --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {input} | wc -c) -nosalt </dev/zero 2>/dev/null) | gzip -nc > {output.subsample}""") + shell("zcat {input} > {output.tmp}") + +# Determine exclusion range for fragment length estimation. +## From bamPEFragmentSize (deepTools) average fragment length is ~220bp +## See bamPEFragmentSize.histogram.png +# Use a fixed lowerbound at -500. +# Upperbound E +#EXCLUSION_RANGE_MAX is +# Histone ChIP-seq: max(read_len + 10, 100) +# lowerbound is fixed at 500 for both + + +rule cross_correlation_SSP: + input: + "results/bed/{sample}_{stage}_{mark}.filt.sample.15Mreads.SE.tagAlign.gz" + output: + CC_SCORES_FILE="results/qc/{sample}_{stage}_{mark}_filt_15Mreads.SE.cc.qc", + CC_PLOT_FILE="results/qc/{sample}_{stage}_{mark}_filt_15Mreads.SE.cc.plot.pdf" + log: + "logs/{sample}_{stage}_{mark}_filt_15Mreads.SE.spp.log" + params: + EXCLUSION_RANGE_MIN=-500, + EXCLUSION_RANGE_MAX=85 + shell: + "Rscript scripts/run_spp.R -c={input} -savp={output.CC_PLOT_FILE} -out={output.CC_SCORES_FILE} -x={params.EXCLUSION_RANGE_MIN}:{params.EXCLUSION_RANGE_MAX} 2> {log}" + -# # CC_SCORE FILE format # Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab> PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag