first commit

0bae9dff · Laura Cook · d85d9ee5 · 0bae9dff · 0bae9dff · 0bae9dff
Commit 0bae9dff authored 4 years ago by Laura Cook
--- a/mouse/scripts/count_peaks.py
+++ b/mouse/scripts/count_peaks.py
+#! /usr/bin/env python
+
+# Author: Adrian Foucal https://gitlab.univ-nantes.fr/foucal-a/full-chipseq/-/blob/master/scripts/count_peaks.py
+
+import json
+import argparse
+import subprocess
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--peak_type", help="Required. Peak type for section title. eg broadPeak, narrowpeak...")
+parser.add_argument("--sample_name",nargs='+', help="Required. name of your samples")
+parser.add_argument("--peaks", nargs = '+', help = "Called peaks, no header")
+
+args = parser.parse_args()
+
+
+def count_lines(fn):
+    return(int(subprocess.check_output("wc -l " + fn, shell=True).split()[0]))
+
+# Go through all sample_name, identify corresponding peak files and get the count in a dict
+sample_count = {}
+for sample_name in args.sample_name:
+    sample_count[sample_name] = {"Peak number": [count_lines(fn) for index, fn in enumerate(args.peaks) if sample_name in fn][0]}
+
+
+# Get the MultiQC output
+
+multiqc_output = {
+    "id": "MACS2_" + args.peak_type + "_count" ,
+    "section_anchor": "MACS2 peak",
+    "section_name": "MACS2 " + args.peak_type + " counts",
+    "description": "",
+    "plot_type": "bargraph",
+    "pconfig": {
+        "id": "MACS2 " + args.peak_type,
+        "ylab": "# Peaks",
+        "cpswitch": False,                       # Show the 'Counts / Percentages' switch?
+        "cpswitch_c_active": True
+    },
+    "data": sample_count
+}
+
+print(json.dumps(multiqc_output, indent = 4))
--- a/mouse/scripts/encode_frip.py
+++ b/mouse/scripts/encode_frip.py
+#! /usr/bin/env python
+
+## Author: Laura E Cook, University of Melbourne
+## Purpose:
+#   1. Intersect aligned reads with called peaks
+#   2. Count number of lines in intersected file
+#   3. Count number of lines in aligned reads (BED) file
+#   4. Calculate fraction of peaks that are in the aligned reads
+#   5. ENCODE guidelines state that FRiP > 0.3 is optimal, FRiP > 0.2 is acceptable
+
+from pybedtools import BedTool
+
+import sys
+import os
+
+# bed file (converted from aligned BAM file to bed)
+bed = BedTool(sys.argv[1])
+
+# called narrowPeak file from MACS2
+peak = BedTool(sys.argv[2])
+
+# overlap bed and peak files
+overlap = bed.intersect(peak, nonamecheck=True, wa=True, u=True)
+
+num_lines_overlap = 0
+
+# determine how many lines in overlap file
+for line in overlap:
+    num_lines_overlap += 1
+print 'Number of reads that intersect with peaks = ', num_lines_overlap
+
+num_lines_bed = 0
+
+# determine how many lines in original bed file
+for line in bed:
+    num_lines_bed +=1
+print 'Number of total reads = ', num_lines_bed
+
+# print the fraction of peaks in aligned reads
+print 'Fraction of Reads in Peak = ', str(float(num_lines_overlap)/float(num_lines_bed))
--- a/mouse/scripts/overlap_peaks.py
+++ b/mouse/scripts/overlap_peaks.py
+#! /usr/bin/env python
+
+## Author: Laura E Cook, University of Melbourne
+## Purpose:
+#   1. Import replicate 1, replicate 2, and output file name from snakemake rule
+#   2. Concatonate peak 1 adn peak 2 to get a pooled_peaks file
+#   3. Intersect pooled peaks with peak 1
+#   4. Write the original A and B entries plus the number of base pairs of overlap between the two features. Only A features with overlap are reported.
+#   5. Calculate
+#   5.
+
+import sys
+import os
+import subprocess
+
+# replicate 1
+peak1 = sys.argv[1]
+
+# replicate 2
+peak2 = sys.argv[2]
+
+# pooled peaks
+pooled = sys.argv[3]
+
+output = sys.argv[4]
+
+awk_param = '{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'
+cut_param = '1-10'
+
+cmd1 = 'intersectBed -wo '
+cmd1 += '-a {} -b {} | '
+cmd1 += 'awk \'BEGIN{{FS="\\t";OFS="\\t"}} {}\' | '
+cmd1 += 'cut -f {} | sort | uniq | '
+cmd1 += 'intersectBed -wo '
+cmd1 += '-a stdin -b {} | '
+cmd1 += 'awk \'BEGIN{{FS="\\t";OFS="\\t"}} {}\' | '
+cmd1 += 'cut -f {} | sort | uniq > {}'
+cmd2 = cmd1.format(
+    pooled,  # peak_pooled
+    peak1,  # peak1
+    awk_param,
+    cut_param,
+    peak2,  # peak2
+    awk_param,
+    cut_param,
+    output)
+
+os.system(cmd2)