Commit 47ee15b9 authored by Laura Cook

added max and min fragment length for spp cross correlation

parent c5960ccf
@@ -57,15 +57,19 @@ rule all:
expand("results/bowtie2/{sample}_PPq30.sorted.dupmark.bam", sample=all_samples),
expand("results/bowtie2/{sample}_PPq30.sorted.dedup.bam", sample=all_samples),
expand("results/bowtie2/{sample}_PPq30.sorted.dedup.bai", sample=all_samples),
expand("results/qc/{sample}.sorted.flagstat.qc", sample=all_samples),
expand("results/qc/{sample}.unfiltered.flagstat.qc", sample=all_samples),
expand("results/qc/{sample}.dedup.flagstat.qc", sample=all_samples),
expand("results/qc/{sample}.dupmark.flagstat.qc", sample=all_samples),
expand("results/qc/{sample}.PPq30.flagstat.qc", sample=all_samples),
expand("results/qc/{sample}.ccurve.txt", sample=all_samples),
expand("results/qc/{sample}.extrap.txt", sample=all_samples),
expand("logs/{sample}.ccurve.preseq", sample=all_samples),
expand("logs/{sample}.extrap.preseq", sample=all_samples),
expand("results/qc/{sample}_est_lib_complex_metrics.txt", sample=all_samples),
expand("logs/{sample}.picardLibComplexity", sample=all_samples),
expand("results/qc/{sample}.pbc.qc", sample=all_samples),
expand("results/bowtie2/{sample}_PPq30.sorted.tmp.bam",sample=all_samples),
expand("results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.bam",sample=all_samples),
"results/qc/multibamsum.npz",
"results/qc/multibamsum.tab",
"results/qc/pearsoncor_multibamsum.png",
@@ -76,21 +80,22 @@ rule all:
"results/qc/multiBAM_fingerprint_rawcounts.txt",
"results/qc/bamPEFragmentSize_hist.png",
"results/qc/bamPEFragmentSize_rawcounts.tab",
expand("results/qc/{sample}_R1_trimmed.flagstat.qc", sample=all_samples),
expand("results/bowtie2/{sample}_R1_trimmed_q30.bam", sample=all_samples),
expand("{sample}_filt_15Mreads.SE.cc.qc", sample=all_samples),
expand("{sample}_filt_15Mreads.SE.cc.plot.pdf", sample=all_samples)
# expand("results/macs2/{case}_vs_{control}_macs2_peaks.narrowPeak", zip, case=IPS, control=INPUTS),
# expand("results/macs2/{case}_vs_{control}_macs2_peaks.xls", zip, case=IPS, control=INPUTS),
# expand("results/macs2/{case}_vs_{control}_macs2_summits.bed", zip, case=IPS, control=INPUTS),
# expand("results/qxc{case}-vs-{control}-narrowpeak-count_mqc.json", zip, case=IPS, control=INPUTS),
# expand("results/bowtie2/{case}.bedpe", case=IPS),
# expand("logs/{case}.bamToBed", case=IPS),
# expand("results/qc/{case}_vs_{control}.frip.txt", case=IPS, control=INPUTS)
# "results/macs2/H3K4me3_pooled_macs2_peaks.narrowPeak",
# "results/macs2/H3K27ac_pooled_macs2_peaks.narrowPeak",
# "results/macs2/H3K4me3_overlap.narrowPeak",
# "results/macs2/H3K27ac_overlap.narrowPeak",
expand("logs/{sample}_filt_15Mreads.SE.spp.log", sample=all_samples),
# expand("results/qc/{sample}_readlength.txt", sample=all_samples),
expand("results/qc/{sample}_filt_15Mreads.SE.cc.qc", sample=all_samples),
expand("results/qc/{sample}_filt_15Mreads.SE.cc.plot.pdf", sample=all_samples),
expand("results/macs2/{case}_vs_{control}_macs2_peaks.narrowPeak", zip, case=IPS, control=INPUTS),
expand("results/macs2/{case}_vs_{control}_macs2_peaks.xls", zip, case=IPS, control=INPUTS),
expand("results/macs2/{case}_vs_{control}_macs2_summits.bed", zip, case=IPS, control=INPUTS),
expand("results/qc/{case}-vs-{control}-narrowpeak-count_mqc.json", zip, case=IPS, control=INPUTS),
expand("results/bowtie2/{case}.bedpe", case=IPS),
expand("logs/{case}.bamToBed", case=IPS),
expand("results/qc/{case}_vs_{control}.frip.txt", case=IPS, control=INPUTS),
"results/macs2/H3K4me3_pooled_macs2_peaks.narrowPeak",
"results/macs2/H3K27ac_pooled_macs2_peaks.narrowPeak",
"results/macs2/H3K4me3_overlap.narrowPeak",
"results/macs2/H3K27ac_overlap.narrowPeak",
# directory("results/multiqc/multiqc_report_data/"),
# "results/multiqc/multiqc_report.html"
# ===============================================================================================
@@ -129,19 +134,18 @@ rule align:
"bowtie2 --threads 8 -q -X 2000 --very-sensitive -x {params.index} -1 {input.R1} -2 {input.R2} \
| samtools view -u -h - | samtools sort -o {output} - 2> {log}"
# ===============================================================================================
# 3. FILTERING
# > remove unmapped, mate unmapped
# > remove low MAPQ reads (-q 30 (ENCODE))
# > keep proper pairs (-f 2)
# > mark duplicates (picard)
# > -F 1804
# -Read unmapped
# -Mate unmapped
# -Not primary alignment
# -Read fails platform/vendor quality checks
# -Read is PCR or optical duplicate
# > mark duplicates & remove duplicates (picard)
# > index final position sorted BAM
# ===============================================================================================
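# For reference, -F 1804 excludes any read with at least one of the five flag
# bits above; 1804 = 4 + 8 + 256 + 512 + 1024. A minimal standalone Python
# sketch (illustration only, not used by the workflow) decoding the mask:

SAM_FLAG_BITS = {
    4: "read unmapped",
    8: "mate unmapped",
    256: "not primary alignment",
    512: "read fails platform/vendor quality checks",
    1024: "read is PCR or optical duplicate",
}

def decode_sam_flag(mask):
    """Return the names of the SAM flag bits set in `mask`."""
    return [name for bit, name in SAM_FLAG_BITS.items() if mask & bit]

assert sum(SAM_FLAG_BITS) == 1804  # the five bits sum to the -F value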
@@ -153,17 +157,7 @@ rule filter:
log:
"logs/{sample}.dedup"
shell:
"samtools view -b -q 30 -F 1804 -f 2 {input} | samtools sort -o {output} - 2> {log}"
rule indexUnfiltBam:
input:
"results/bowtie2/{sample}.sorted.bam"
output:
"results/bowtie2/{sample}.sorted.bai"
log:
"logs/{sample}.indexUnfiltBam"
shell:
"samtools index -c {input} {output} 2> {log}"
"samtools view -b -F 1804 -q 30 -f 2 {input} | samtools sort -o {output} - 2> {log}"
rule markDups:
input:
@@ -175,7 +169,7 @@ rule markDups:
"logs/{sample}.dupmark"
shell:
"picard MarkDuplicates I={input} O={output.bam} \
METRICS_FILE={output.dupQC} REMOVE_DUPLICATES=false ASSUME_SORTED=true 2> {log}"
METRICS_FILE={output.dupQC} REMOVE_DUPLICATES=FALSE ASSUME_SORTED=true 2> {log}"
rule dedup:
input:
@@ -202,6 +196,7 @@ rule indexBam:
# > SAMtools flagstat statistics
# > Compute Library Complexity (PreSeq)
# > picard library complexity
# > ENCODE library complexity
# ===============================================================================================
rule mappingStats:
@@ -221,31 +216,52 @@ rule mappingStats:
shell("samtools flagstat {input.c} > {output.c}")
shell("samtools flagstat {input.d} > {output.d}")
rule preseq:
rule downsample_bam:
input:
"results/bowtie2/{sample}.sorted.bam"
"results/bowtie2/{sample}_PPq30.sorted.dupmark.bam"
output:
"results/qc/{sample}.ccurve.txt"
"results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.bam"
log:
"logs/{sample}.preseq"
"logs/{sample}.downsample"
shell:
"preseq lc_extrap -v -output {output} -pe -bam {input} 2> {log}"
"picard DownsampleSam I={input} O={output} P=0.35 \
2> {log}"
# A-1_dupmark = 87165286
# B-1_dupmark = 72303316
# A-2_dupmark = 62678360
# A-3_dupmark = 86503078
# B-2_dupmark = 67048994
# B-3_dupmark = 66615704
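# P=0.35 is a single fixed fraction applied to every sample; if the goal is to
# level all libraries to a common depth, a per-sample probability could instead
# be derived from the dupmark counts above (hypothetical helper, not what the
# rule currently does):

dupmark_read_counts = {
    "A-1": 87165286, "B-1": 72303316, "A-2": 62678360,
    "A-3": 86503078, "B-2": 67048994, "B-3": 66615704,
}
target_depth = min(dupmark_read_counts.values())  # level down to the smallest library
downsample_p = {s: round(target_depth / n, 3) for s, n in dupmark_read_counts.items()}
# e.g. downsample_p["A-1"] == 0.719 and downsample_p["A-2"] == 1.0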
rule preseq:
input:
"results/bowtie2/{sample}_PPq30.sorted.dupmark.bam"
output:
ccurve = "results/qc/{sample}.ccurve.txt",
extrap = "results/qc/{sample}.extrap.txt"
log:
ccurve = "logs/{sample}.ccurve.preseq",
extrap = "logs/{sample}.extrap.preseq"
run:
shell("preseq lc_extrap -v -output {output.extrap} -pe -bam {input} 2> {log.extrap}")
shell("preseq c_curve -v -output {output.ccurve} -pe -bam {input} 2> {log.ccurve}")
rule get_picard_complexity_metrics:
input:
"results/bowtie2/{sample}.sorted.bam"
"results/bowtie2/{sample}_PPq30.sorted.dupmark.bam"
output:
"results/qc/{sample}_est_lib_complex_metrics.txt"
log:
"logs/{sample}.picardLibComplexity"
"logs/{sample}.downSampled.picardLibComplexity"
shell:
"picard -Xmx6G EstimateLibraryComplexity INPUT={input} OUTPUT={output} USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE VERBOSITY=ERROR"
rule sort_name:
input:
"results/bowtie2/{sample}_PPq30.sorted.dupmark.bam"
"results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.bam"
output:
tmp = "results/bowtie2/{sample}_PPq30.sorted.dupmark.tmp.bam"
tmp = "results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.tmp.bam"
log:
"logs/{sample}.pbc.sort"
run:
@@ -253,7 +269,7 @@ rule sort_name:
rule estimate_lib_complexity:
input:
"results/bowtie2/{sample}_PPq30.sorted.dupmark.tmp.bam"
"results/bowtie2/{sample}_PPq30.sorted.dupmark_downSampled.tmp.bam"
output:
qc = "results/qc/{sample}.pbc.qc",
log:
@@ -262,7 +278,7 @@ rule estimate_lib_complexity:
"""
bedtools bamtobed -bedpe -i {input} \
| awk 'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}' \
| grep -v 'chrM' | sort | uniq -c \
| sort | uniq -c \
| awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \
{{m0=m0+1}} {{mt=mt+$1}} END{{printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n" ,mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \
> {output.qc}
@@ -271,7 +287,7 @@ rule estimate_lib_complexity:
## convert to bedPE
## print read 1 scaffold, read 1 start coordinate, read 2 scaffold, read 2 end coordinate, strand read 1, strand read 2
## remove mitochondrial genome
## sort by position and obtain uniq counts
## sort by position and obtain uniq reads
## Format of file
## TotalReadPairs [tab] DistinctReadPairs [tab] OneReadPair [tab] TwoReadPairs [tab] NRF=Distinct/Total [tab] PBC1=OnePair/Distinct [tab] PBC2=OnePair/TwoPair
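# The same arithmetic as the awk one-liner, in plain Python (sketch; the counts
# in the usage comment are invented):

def encode_complexity_metrics(total_pairs, distinct_pairs, one_pair, two_pairs):
    """NRF / PBC1 / PBC2 exactly as defined in the format line above."""
    nrf = distinct_pairs / total_pairs   # non-redundant fraction
    pbc1 = one_pair / distinct_pairs     # PCR bottlenecking coefficient 1
    pbc2 = one_pair / two_pairs          # PCR bottlenecking coefficient 2
    return nrf, pbc1, pbc2

# encode_complexity_metrics(1_000_000, 900_000, 850_000, 40_000)
# -> (0.9, 0.944..., 21.25); values of NRF or PBC1 well below ~0.9 suggest
# a bottlenecked library.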
@@ -323,8 +339,8 @@ rule deeptools_correlation:
rule deeptools_coverage:
input:
bam ="results/bowtie2/{sample}.sorted.bam",
bai ="results/bowtie2/{sample}.sorted.bai"
bam ="results/bowtie2/{sample}_PPq30.sorted.dedup.bam",
bai ="results/bowtie2/{sample}_PPq30.sorted.dedup.bai"
output:
"results/qc/{sample}.SeqDepthNorm.bw"
log:
@@ -411,12 +427,10 @@ rule filter_sort_trimmed_alignment:
input:
"results/bowtie2/{sample}_R1_trimmed.bam"
output:
flagstat = "results/qc/{sample}_R1_trimmed.flagstat.qc",
bam = "results/bowtie2/{sample}_R1_trimmed_q30.bam"
log:
"logs/{sample}_align_trimmed_read1_filter.log"
run:
shell("samtools sort -n {input} -O SAM | SAMstats --sorted_sam_file - --outf {output.flagstat}")
shell("samtools view -F 1804 -q 30 -b {input} -o {output.bam}")
rule bamtobed_crossC:
@@ -442,9 +456,8 @@ rule subsample_aligned_reads:
params:
nreads= 15000000
run:
shell("""zcat {input} | grep -v 'chrM' | shuf -n {params.nreads} --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {input} | wc -c) -nosalt </dev/zero 2>/dev/null) | gzip -nc > {output.subsample}""")
shell("""zcat {input} | shuf -n {params.nreads} --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {input} | wc -c) -nosalt </dev/zero 2>/dev/null) | gzip -nc > {output.subsample}""")
shell("zcat {input} > {output.tmp}")
shell("""READ_LEN=\$(head -n 100 {output.tmp} | awk 'function abs(v) {{return v < 0 ? -v : v}} BEGIN{{sum=0}} {{sum+=abs($3-$2)}} END{{print int(sum/NR)}}')""")
# Determine exclusion range for fragment length estimation.
# Use a fixed lowerbound at -500.
@@ -457,20 +470,24 @@ rule cross_correlation_SSP:
input:
"results/bed/{sample}.filt.sample.15Mreads.SE.tagAlign.gz"
output:
CC_SCORES_FILE="{sample}_filt_15Mreads.SE.cc.qc",
CC_PLOT_FILE="{sample}_filt_15Mreads.SE.cc.plot.pdf"
CC_SCORES_FILE="results/qc/{sample}_filt_15Mreads.SE.cc.qc",
CC_PLOT_FILE="results/qc/{sample}_filt_15Mreads.SE.cc.plot.pdf"
log:
"logs/{sample}_filt_15Mreads.SE.spp.log"
params:
EXCLUSION_RANGE_MIN=-500,
EXCLUSION_RANGE_MAX=110
run:
shell("Rscript scripts/run_spp.R -c={input} -filtchr=chrM -savp={output.CC_PLOT_FILE} -out={output.CC_SCORES_FILE} -x={params.EXCLUSION_RANGE_MIN}:{params.EXCLUSION_RANGE_MAX} 2> {log}")
EXCLUSION_RANGE_MAX=230
shell:
"Rscript scripts/run_spp.R -c={input} -savp={output.CC_PLOT_FILE} -out={output.CC_SCORES_FILE} -x={params.EXCLUSION_RANGE_MIN}:{params.EXCLUSION_RANGE_MAX} 2> {log}"
# # CC_SCORE FILE format
# # Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab> PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag
#
# # ===============================================================================================
# # 7. Call peaks (MACS2)
# # ===============================================================================================
# CC_SCORE FILE format
# Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab> PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag
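# A minimal parser for that tab-separated scores file (sketch; NSC and RSC are
# the phantomPeakCoef and relPhantomPeakCoef columns):

import csv

def read_spp_scores(path):
    with open(path) as fh:
        row = next(csv.reader(fh, delimiter="\t"))
    return {
        "filename": row[0],
        "num_reads": int(row[1]),
        "est_frag_len": row[2],  # may be a comma-separated list of candidates
        "nsc": float(row[8]),
        "rsc": float(row[9]),
        "quality_tag": int(row[10]),
    }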
# ===============================================================================================
# 7. Call peaks (MACS2)
# ===============================================================================================
# peak calling for paired-end peaks
# effective genome size for the dunnart genome, calculated with the khmer program (see README.md)
@@ -494,11 +511,11 @@ rule call_peaks_macs2:
-n {params.name} \
-g 2740338543 -B 2> {log} "
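# The khmer estimate behind -g can be reproduced with something along the lines of
#   unique-kmers.py -k 32 <genome.fa>
# (script name and k-mer size are assumptions here; README.md records the exact call).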
# # # ===============================================================================================
# # # 8. Peak QC
# # # > narrow peak counts
# # # > fraction of reads in peaks (convert BAM to bed first then do intersect with peaks)
# # # ===============================================================================================
# ===============================================================================================
# 8. Peak QC
# > narrow peak counts
# > fraction of reads in peaks (convert BAM to bed first then do intersect with peaks)
# ===============================================================================================
# peak counts in a format that multiqc can handle
rule get_narrow_peak_counts_for_multiqc:
@@ -537,9 +554,9 @@ rule frip:
"python2.7 scripts/encode_frip.py {input.bed} {input.peak} > {output}"
# # ===============================================================================================
# # 9. Create consensus peaksets for replicates
# # ===============================================================================================
# ===============================================================================================
# 9. Create consensus peaksets for replicates
# ===============================================================================================
# Based on ENCODE `overlap_peaks.py` - recommended for histone marks.
# Need to pool peaks first for each replicate
@@ -577,48 +594,23 @@ rule overlap_peaks_H3K27ac:
shell:
"python2.7 scripts/overlap_peaks.py {input.peak1} {input.peak2} {input.pooled} {output}"
#rule consensus_peaks:
#rule enhancer_promoter_peaks:
# Enhancer = peaks with only H3K27ac
# Promoter = peaks with both H3K27ac & H3K4me3, or just H3K4me3
# Make a bar graph of the counts for each category (see the sketch below)
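# One possible realisation with bedtools (hypothetical commands; the input
# names follow the overlap rules above):
#   bedtools intersect -a results/macs2/H3K27ac_overlap.narrowPeak \
#       -b results/macs2/H3K4me3_overlap.narrowPeak -v > enhancers.bed   # H3K27ac only
#   cp results/macs2/H3K4me3_overlap.narrowPeak promoters.bed            # any H3K4me3 peak
#   wc -l enhancers.bed promoters.bed                                    # counts for the bar graph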
# # ===============================================================================================
# # 10. QC on replicate peaks
# # ===============================================================================================
#
# # FRiP for overlapped peaks
# rule frip:
# input:
# bed = "results/bowtie2/{case}.bedpe",
# peak = "results/macs2/{}_macs2_peaks.narrowPeak"
# output:
# "results/qc/{case}.frip.txt"
# shell:
# "python2.7 scripts/encode_frip.py {input.bed} {input.peak} > {output}"
#
#
# # ===============================================================================================
# # 11. Combine all QC into multiqc output
# # ===============================================================================================
# ===============================================================================================
# 10. Combine all QC into multiqc output
# ===============================================================================================
#
# rule multiqc:
# input:
# # fastqc
# expand("results/fastqc/{sample}_R1_fastqc.html", sample=all_samples),
# expand("results/fastqc/{sample}_R2_fastqc.html", sample=all_samples),
# expand("results/fastqc/{sample}_R1_fastqc.zip", sample=all_samples),
# expand("results/fastqc/{sample}_R2_fastqc.zip", sample=all_samples),
# expand("results/qc/{sample}_R1_fastqc.html", sample=all_samples),
# expand("results/qc/{sample}_R2_fastqc.html", sample=all_samples),
# expand("results/qc/{sample}_R1_fastqc.zip", sample=all_samples),
# expand("results/qc/{sample}_R2_fastqc.zip", sample=all_samples),
# # bowtie2
# expand("logs/{sample}.align", sample=all_samples),
# expand("logs/{sample}.flagstat.qc", sample=all_samples),
# expand("results/qc/{sample}.flagstat.qc", sample=all_samples),
# # preseq
# expand("results/preseq/{sample}.ccurve.txt", sample=all_samples),
# # picard
# expand("results/picard/{sample}_est_lib_complex_metrics.txt", sample=all_samples),
# expand("results/qc/{sample}.ccurve.txt", sample=all_samples),
# expand("results/qc/{sample}.extrap.txt", sample=all_samples),
# # deepTools
# "results/deeptools/multibamsum.npz",
# "results/deeptools/multibamsum.tab",
...