annotated

163ebec9 · lecook · d1ba38f5 · 163ebec9 · d1ba38f5 · 163ebec9
Commit 163ebec9 authored 3 years ago by lecook
--- a/mouse-chipseq/README.md
+++ b/mouse-chipseq/README.md
@@ -54,52 +54,12 @@ snakemake -j 6 --snakefile Snakefile_H3K4me3 --cluster-config configs/cluster.js

 ```

-
-
 Cluster configuration file: configs/cluster.json\
 Sample configuration file: configs/config.yaml\
 Sample text file: configs/SSR.text\
 multiQC configuration file: configs/.multiqc_config.yaml\

-
-#  3. FILTERING
-
-### rule filter:
-
-### rule markDups:
-
-### rule properPairs:
-
-### rule indexBam:
-
-#  4. GENERAL ALIGNMENT QC
-
-### rule mappingStats:
-
-### rule preseq:
-
-### rule get_picard_complexity_metrics:
-
-
-Library Complexity
-
-ChIP-seq Standards:
-
-| PBC1 | PBC2 | Bottlenecking level | NRF | Complexity | Flag colors |
-|:-:|:-:|:-:|:-:|:-:|:-:|
-| < 0.5 | < 1 | Severe | < 0.5 | Concerning | Orange |
-| 0.5 ≤ PBC1 < 0.8 | 1 ≤ PBC2 < 3 | Moderate | 0.5 ≤ NRF < 0.8 | Acceptable | Yellow |
-| 0.8 ≤ PBC1 < 0.9 | 3 ≤ PBC2 < 10 | Mild | 0.8 ≤ NRF < 0.9 | Compliant | None |
-| ≥ 0.9 | ≥ 10 | None | > 0.9 | Ideal | None |
-
-#  5. deepTools
-
-### rule deeptools_summary:
-
-
-### rule deeptools_correlation:
-
-
+#  deepTools
 ### rule deeptools_coverage:
 Coverage is normalised to reads per genomic content (i.e. to 1x coverage).
 Produces a coverage file
@@ -111,17 +71,7 @@ The bigWig format is an indexed binary format useful for dense, continuous data
 - `smoothLength`: defines a window, larger than the binSize, to average the number of reads over. This helps produce a more continuous plot.
 - `centerReads`: reads are centered with respect to the fragment length as specified by extendReads. This option is useful to get a sharper signal around enriched regions.

-
-
-### rule deeptools_fingerprint:
-
-
-### rule deeptools_plotCoverage:
-
-### rule deeptools_bamPEFragmentSize:
-
-
-#  6. phantomPeakQuals
+#  phantomPeakQuals

 Information from: https://docs.google.com/document/d/1lG_Rd7fnYgRpSIqrIfuVlAz2dW1VaSQThzk836Db99c/edit

@@ -154,10 +104,8 @@ RSC; RSC>0.8 (0 = no signal; <1 low quality ChIP; >1 high enrichment

 Quality tag based on thresholded RSC (codes: -2:veryLow,-1:Low,0:Medium,1:High; 2:veryHigh)

-### rule phantomPeakQuals:
-

-#  7. Call peaks (MACS2)
+#  Call peaks (MACS2)

 __Input file options__

@@ -196,34 +144,7 @@ I've left all the shifting model and peak calling arguments as default
 - `peaks.xls`: a tabular file which contains information about called peaks. Additional information includes pileup and fold enrichment
 - `summits.bed`: peak summit locations for every peak. This file is recommended for finding motifs at the binding sites

-
-## Compare peaks to ENCODE peaks
-With p < 0.01 I only call ~85000 peaks but ENCODE call ~125000
-Look at pvalues, qvalues and peak length between the two lists.
-
-Average Peak Length:
-
-
-Plot pvalues of ENCODE on the x and my calls on the y for each sample
-
-
-
-#  8. Peak QC
-
-
-### rule get_narrow_peak_counts_for_multiqc:
-
-
-
-### rule bamToBed:
-
-Convert BAM to tagAlign file for calculating FRiP QC metric (Fraction of reads in peaks)
-
-
-### rule frip:
-
-
-#  9. Create consensus peaksets for replicates
+#  Create consensus peaksets for replicates

 Edited version of ENCODE `overlap_peaks.py` - recommended for histone marks.


--- a/mouse-chipseq/Snakefile1_H3K4me3
+++ b/mouse-chipseq/Snakefile1_H3K4me3
--- a/mouse-chipseq/Snakefile_H3K4me3
+++ b/mouse-chipseq/Snakefile_H3K4me3
@@ -9,7 +9,7 @@
    # 7. cross correlation (SPP)
    # 8. Call narrow peaks (MACS2)
    # 9. Create consensus peaksets
-    # 10. Present QC for raw read, alignment, peak-calling in MultiQC
+    # 10. Present QC for raw reads, alignment and peak-calling in MultiQC (work in progress)


 configfile: "configs/config_H3K4me3.yaml"
@@ -274,33 +274,33 @@ rule mappingStats:
        #shell("samtools flagstat {input.d} > {output.d}")


-# rule sort_name:
-#     input:
-#         "results_10M/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam"
-#     output:
-#         tmp = "results_10M/bwa/{sample}_{stage}_{mark}_q30.dupmark.tmp.bam"
-#     log:
-#         "results_10M/logs/{sample}_{stage}_{mark}.pbc.sort"
-#     run:
-#         shell("samtools sort -n {input} -o {output.tmp} 2> {log}")
-#
-# rule estimate_lib_complexity:
-#     input:
-#         "results_10M/bwa/{sample}_{stage}_{mark}_q30.dupmark.tmp.bam"
-#     output:
-#         qc = "results_10M/qc/{sample}_{stage}_{mark}.pbc.qc",
-#     log:
-#         "results_10M/logs/{sample}_{stage}_{mark}.pbc"
-#     shell:
-#         """
-#         bedtools bamtobed -i {input} \
-#         | awk 'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6}}' \
-#         | sort | uniq -c \
-#         | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \
-#         {{m0=m0+1}} {{mt=mt+$1}} END{{printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n" ,mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}' \
-#         > {output.qc}
-#         """
-#
+# Re-sort the q30 dupmark BAM by read name (`samtools sort -n`) and write the
+# result to a separate .tmp.bam; samtools' stderr goes to the rule's log file.
+rule sort_name:
+     input:
+         "results_10M/bwa/{sample}_{stage}_{mark}_q30.dupmark.bam"
+     output:
+         tmp = "results_10M/bwa/{sample}_{stage}_{mark}_q30.dupmark.tmp.bam"
+     log:
+         "results_10M/logs/{sample}_{stage}_{mark}.pbc.sort"
+     shell:
+         "samtools sort -n {input} -o {output.tmp} 2> {log}"
+
+# Estimate library complexity from the name-sorted .tmp.bam.
+# Writes a single tab-separated line to {output.qc}:
+#   mt     sum of the `uniq -c` counts (total records)
+#   m0     number of distinct records
+#   m1     distinct records seen exactly once
+#   m2     distinct records seen exactly twice
+#   m0/mt, m1/m0, m1/m2
+# A ratio whose denominator is zero is written as -1 instead of letting awk
+# abort with a division-by-zero error (m2 can be 0; mt/m0 are 0 on empty input).
+# stderr of the whole pipeline is captured in {log}, which was previously
+# declared but never written.
+# NOTE(review): without -bedpe, `bedtools bamtobed` emits BED6, where $4 is the
+# read name; keying on it would make nearly every record distinct. Confirm
+# whether `-bedpe` with $1,$2,$4,$6,$9,$10 was intended.
+rule estimate_lib_complexity:
+     input:
+         "results_10M/bwa/{sample}_{stage}_{mark}_q30.dupmark.tmp.bam"
+     output:
+         qc = "results_10M/qc/{sample}_{stage}_{mark}.pbc.qc",
+     log:
+         "results_10M/logs/{sample}_{stage}_{mark}.pbc"
+     shell:
+         """
+         (bedtools bamtobed -i {input} \
+         | awk 'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6}}' \
+         | sort | uniq -c \
+         | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} \
+         {{m0=m0+1}} {{mt=mt+$1}} \
+         END{{r0=(mt>0)?m0/mt:-1; r1=(m0>0)?m1/m0:-1; r2=(m2>0)?m1/m2:-1; \
+         printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n" ,mt,m0,m1,m2,r0,r1,r2}}' \
+         > {output.qc}) 2> {log}
+         """
+
 ## convert to bedPE
 ## print read 1 scaffold, read 1 start coordinate, read 2 scaffold, read 2 end coordinate, strand read 1, strand read 2
 ## remove mitochondrial genome