From b3e92f87319e8f0af1c08c2c6e1b249843346451 Mon Sep 17 00:00:00 2001
From: Laura Cook <l.cook2@student.unimelb.edu.au>
Date: Thu, 27 Aug 2020 12:35:22 +1000
Subject: [PATCH] use bedtools merge for pooling peaks instead of just
 concatenating two files

---
 dunnart/README.md | 31 ++++++++++++++++++++++++++++++-
 dunnart/Snakefile |  6 +++---
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/dunnart/README.md b/dunnart/README.md
index 29c4fd9..76634b0 100644
--- a/dunnart/README.md
+++ b/dunnart/README.md
@@ -539,7 +539,7 @@ therefore if amount that overlaps between each replicate divided by the length o
 
 ### rule pool_peaks:
 
-Overlap peaks scripts takes a file of pooled peaks (repA plus repB peaks).
+Overlap peaks scripts takes a file of pooled peaks (repA merged with repB peaks).
 
 ### rule overlap_peaks_H3K4me3:
 
@@ -604,3 +604,32 @@ intersectBed -a H3K4me3_consensus.narrowPeak -b dunnart_TWARs.bed > twar_H3K4me3
 ```
 intersectBed -a H3K27ac_consensus.narrowPeak -b dunnart_TWARs.bed > twar_H3K27ac_overlap.bed
 ```
+
+
+# 12. HOMER annotate peaks
+
+### How Basic Annotation Works
+The process of annotating peaks/regions is divided into two primary parts.  The first determines the distance to the nearest TSS and assigns the peak to that gene.  The second determines the genomic annotation of the region occupied by the center of the peak/region.
+
+### Distance to the nearest TSS
+
+By default, `annotatePeaks.pl` loads a file in the "/path-to-homer/data/genomes/<genome>/<genome>.tss" that contains the positions of RefSeq transcription start sites.  It uses these positions to determine the closest TSS, reporting the distance (negative values mean upstream of the TSS, positive values mean downstream), and various annotation information linked to locus including alternative identifiers (unigene, entrez gene, ensembl, gene symbol etc.).  This information is also used to link gene-specific information (see below) to a peak/region, such as gene expression.
+
+### Genomic Annotation
+
+To annotate the location of a given peak in terms of important genomic features, `annotatePeaks.pl` calls a separate program (assignGenomeAnnotation) to efficiently assign peaks to one of millions of possible annotations genome wide.  Two types of output are provided.  The first is "Basic Annotation" that includes whether a peak is in the TSS (transcription start site), TTS (transcription termination site), Exon (Coding), 5' UTR Exon, 3' UTR Exon, Intronic, or Intergenic, which are common annotations that many researchers are interested in.  A second round of "Detailed Annotation" also includes more detailed annotation, also considering repeat elements and CpG islands.  Since some annotation overlap, a priority is assign based on the following (in case of ties it's random [i.e. if there are two overlapping repeat element annotations]):
+
+* TSS (by default defined from -1kb to +100bp)
+* TTS (by default defined from -100 bp to +1kb)
+* CDS Exons
+* 5' UTR Exons
+* 3' UTR Exons
+* CpG Islands
+* Repeats
+* Introns
+* Intergenic
+
+Although HOMER doesn't allow you to explicitly change the definition of the region that is the TSS (-1kb to +100bp), you can "do it yourself" by sorting the annotation output in EXCEL by the "Distance to nearest TSS" column, and selecting those within the range you are interested in.
+
+
+# 13. ChIPseeker annotate peaks
diff --git a/dunnart/Snakefile b/dunnart/Snakefile
index 265aa4e..ddc4a10 100644
--- a/dunnart/Snakefile
+++ b/dunnart/Snakefile
@@ -507,7 +507,7 @@ rule call_peaks_macs2:
     shell:
         " macs2 callpeak -f BAMPE -t {input.case} \
         -c {input.control} --keep-dup all \
-        --outdir results/macs2/ \
+        --outdir results/macs2/ -p 0.01 \
         -n {params.name} \
         -g 2740338543 2> {log} "
 
@@ -569,8 +569,8 @@ rule pool_peaks:
         mark1 = "results/macs2/H3K4me3_pooled_macs2_peaks.narrowPeak",
         mark2 = "results/macs2/H3K27ac_pooled_macs2_peaks.narrowPeak"
     run:
-        shell("cat {input.mark1} > {output.mark1}")
-        shell("cat {input.mark2} > {output.mark2}")
+        shell("cat {input.mark1} | sort -k1,1 -k2,2n - | bedtools merge -i - > {output.mark1}")
+        shell("cat {input.mark2}  | sort -k1,1 -k2,2n - | bedtools merge -i - > {output.mark2}")
 
 
 rule overlap_peaks_H3K4me3:
-- 
GitLab