Skip to content
Snippets Groups Projects
Commit 37fcbe2a authored by lecook's avatar lecook
Browse files

change folders

parent c99e9c3b
No related branches found
No related tags found
No related merge requests found
Showing
with 4502 additions and 0 deletions
No preview for this file type
File added
This diff is collapsed.
name: wga
channels:
- bioconda
- biocore
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=1_gnu
- bzip2=1.0.8=h516909a_3
- c-ares=1.11.0=h470a237_1
- ca-certificates=2020.6.20=hecda079_0
- cairo=1.16.0=h3fc0475_1005
- cd-hit=4.8.1=h8b12597_3
- certifi=2020.6.20=py38h32f6830_0
- curl=7.71.1=he644dc0_6
- entrez-direct=13.9=pl526h375a9b1_0
- expat=2.2.9=he1b5a44_2
- fontconfig=2.13.1=h1056068_1002
- freetype=2.10.2=he06d7ca_0
- fribidi=1.0.10=h516909a_0
- genometools-genometools=1.6.1=py38h23571c4_2
- gettext=0.19.8.1=hc5be6a0_1002
- glib=2.66.0=h0dae87d_0
- gmp=6.1.2=hf484d3e_1000
- gnutls=3.5.19=h2a4e5f8_1
- graphite2=1.3.13=he1b5a44_1001
- harfbuzz=2.7.2=hee91db6_0
- hmmer=3.3.1=he1b5a44_0
- icu=67.1=he1b5a44_0
- krb5=1.17.1=hfafb76e_3
- ld_impl_linux-64=2.35=h769bd43_9
- libcurl=7.71.1=hcdd3856_6
- libedit=3.1.20191231=he28a2e2_2
- libev=4.33=h516909a_1
- libffi=3.2.1=he1b5a44_1007
- libgcc-ng=9.3.0=h24d8f2e_16
- libgomp=9.3.0=h24d8f2e_16
- libiconv=1.16=h516909a_0
- libnghttp2=1.41.0=hab1572f_1
- libpng=1.6.37=hed695b0_2
- libssh2=1.9.0=hab1572f_5
- libstdcxx-ng=9.3.0=hdf63c60_16
- libuuid=2.32.1=h14c3975_1000
- libxcb=1.13=h14c3975_1002
- libxml2=2.9.10=h68273f3_2
- llvm-openmp=8.0.1=hc9558a2_0
- ltr_retriever=2.9.0=0
- mafft=7.471=h516909a_0
- ncurses=6.2=he1b5a44_1
- nettle=3.3=0
- nseg=1.0.1=h516909a_0
- openmp=8.0.1=0
- openssl=1.1.1h=h516909a_0
- pango=1.42.4=h7062337_4
- pcre=8.44=he1b5a44_0
- perl=5.26.2=h516909a_1006
- perl-app-cpanminus=1.7044=pl526_1
- perl-archive-tar=2.32=pl526_0
- perl-base=2.23=pl526_1
- perl-business-isbn=3.004=pl526_0
- perl-business-isbn-data=20140910.003=pl526_0
- perl-carp=1.38=pl526_3
- perl-common-sense=3.74=pl526_2
- perl-compress-raw-bzip2=2.087=pl526he1b5a44_0
- perl-compress-raw-zlib=2.087=pl526hc9558a2_0
- perl-constant=1.33=pl526_1
- perl-data-dumper=2.173=pl526_0
- perl-digest-hmac=1.03=pl526_3
- perl-digest-md5=2.55=pl526_0
- perl-encode=2.88=pl526_1
- perl-encode-locale=1.05=pl526_6
- perl-exporter=5.72=pl526_1
- perl-exporter-tiny=1.002001=pl526_0
- perl-extutils-makemaker=7.36=pl526_1
- perl-file-listing=6.04=pl526_1
- perl-file-path=2.16=pl526_0
- perl-file-temp=0.2304=pl526_2
- perl-file-which=1.23=pl526_0
- perl-html-parser=3.72=pl526h6bb024c_5
- perl-html-tagset=3.20=pl526_3
- perl-html-tree=5.07=pl526_1
- perl-http-cookies=6.04=pl526_0
- perl-http-daemon=6.01=pl526_1
- perl-http-date=6.02=pl526_3
- perl-http-message=6.18=pl526_0
- perl-http-negotiate=6.01=pl526_3
- perl-io-compress=2.087=pl526he1b5a44_0
- perl-io-html=1.001=pl526_2
- perl-io-socket-ssl=2.066=pl526_0
- perl-io-zlib=1.10=pl526_2
- perl-json=4.02=pl526_0
- perl-json-xs=2.34=pl526h6bb024c_3
- perl-libwww-perl=6.39=pl526_0
- perl-list-moreutils=0.428=pl526_1
- perl-list-moreutils-xs=0.428=pl526_0
- perl-lwp-mediatypes=6.04=pl526_0
- perl-lwp-protocol-https=6.07=pl526_4
- perl-mime-base64=3.15=pl526_1
- perl-mozilla-ca=20180117=pl526_1
- perl-net-http=6.19=pl526_0
- perl-net-ssleay=1.88=pl526h90d6eec_0
- perl-ntlm=1.09=pl526_4
- perl-parent=0.236=pl526_1
- perl-pathtools=3.75=pl526h14c3975_1
- perl-scalar-list-utils=1.52=pl526h516909a_0
- perl-socket=2.027=pl526_1
- perl-storable=3.15=pl526h14c3975_0
- perl-test-requiresinternet=0.05=pl526_0
- perl-text-soundex=3.05=pl526_1000
- perl-time-local=1.28=pl526_1
- perl-try-tiny=0.30=pl526_1
- perl-types-serialiser=1.0=pl526_2
- perl-uri=1.76=pl526_0
- perl-www-robotrules=6.02=pl526_3
- perl-xml-namespacesupport=1.12=pl526_0
- perl-xml-parser=2.44=pl526h4e0c4b3_7
- perl-xml-sax=1.02=pl526_0
- perl-xml-sax-base=1.09=pl526_0
- perl-xml-sax-expat=0.51=pl526_3
- perl-xml-simple=2.25=pl526_1
- perl-xsloader=0.24=pl526_0
- pip=20.2.3=py_0
- pixman=0.38.0=h516909a_1003
- pthread-stubs=0.4=h14c3975_1001
- python=3.8.5=h1103e12_8_cpython
- python_abi=3.8=1_cp38
- readline=8.0=he28a2e2_2
- recon=1.08=h516909a_2
- repeatmasker=4.1.0=pl526_0
- repeatmodeler=2.0.1=pl526_0
- repeatscout=1.0.6=h516909a_1
- rmblast=2.9.0=h2d02072_0
- setuptools=49.6.0=py38h32f6830_1
- sqlite=3.33.0=h4cf870e_0
- tk=8.6.10=hed695b0_0
- trf=4.09.1=h516909a_0
- wheel=0.35.1=pyh9f0ad1d_0
- xorg-kbproto=1.0.7=h14c3975_1002
- xorg-libice=1.0.10=h516909a_0
- xorg-libsm=1.2.3=h84519dc_1000
- xorg-libx11=1.6.12=h516909a_0
- xorg-libxau=1.0.9=h14c3975_0
- xorg-libxdmcp=1.1.3=h516909a_0
- xorg-libxext=1.3.4=h516909a_0
- xorg-libxrender=0.9.10=h516909a_1002
- xorg-renderproto=0.11.1=h14c3975_1002
- xorg-xextproto=7.3.0=h14c3975_1002
- xorg-xproto=7.0.31=h14c3975_1007
- xz=5.2.5=h516909a_1
- zlib=1.2.11=h516909a_1009
prefix: /home/lecook/.conda/envs/wga
library(plyr)
library(dplyr)
library(data.table)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("biostrings")
library(seqinr)
library(Biostrings)
setwd("/Users/lauracook/OneDrive - The University of Melbourne/PhD (2018-2021)/8-WholeGenomeAlignment/2-UCSC_hoxD55_alignment/1-UCSC_alignment_QC/2-MafFilter/")
# Summarise statistics from MafFilter
mm10 <- read.table("mm10.statistics.csv", header=TRUE, sep="\t") # read in table
smiCra1 <- read.table("smiCra1.statistics.csv", header=TRUE, sep="\t") # read in table
head(mm10) #check file has been imported correctly
mm10ChrCount <- mm10 %>% count(Chr, sort=TRUE)#how many blocks per chromosome
smiCra1ChrCount <- smiCra1 %>% count(Chr, sort=TRUE) #how many blocks per chromosome
#total size (incl gaps) just use mouse as the statistics are the same
size1to10bp <- mm10[which(mm10$BlockLength>=1 & mm10$BlockLength<=10),]
size10to100bp <- mm10[which(mm10$BlockLength>=10 & mm10$BlockLength<=100),]
size100to1kb <- mm10[which(mm10$BlockLength>=100 & mm10$BlockLength<=1000),]
size1kbto10kb <- mm10[which(mm10$BlockLength>=1000 & mm10$BlockLength<=10000),]
size10kbto100kb <- mm10[which(mm10$BlockLength>=10000 & mm10$BlockLength<=100000),]
size100kbto1Mb <- mm10[which(mm10$BlockLength>=100000 & mm10$BlockLength<=1000000),]
count(size1to10bp)
count(size10to100bp)
count(size100to1kb)
count(size1kbto10kb)
count(size10kbto100kb)
count(size100kbto1Mb)
sum(size1to10bp$BlockLength)
sum(size10to100bp$BlockLength)
sum(size100to1kb$BlockLength)
sum(size1kbto10kb$BlockLength)
sum(size10kbto100kb$BlockLength)
sum(size100kbto1Mb$BlockLength)
setwd("/Users/lauracook/OneDrive - The University of Melbourne/PhD (2018-2021)/8-WholeGenomeAlignment/2-UCSC_hoxD55_alignment/1-UCSC_alignment_QC/")
# MOUSE exon coverage
#mm10 refseq ncbi curated exons downloaded from ucsc
system("bedtools sort -i mm10exons.bed > mm10exonsSorted.bed", intern=TRUE) ## sort by coordinates
system("bedtools merge -i mm10exonsSorted.bed > mm10exonsMerged.bed", intern=TRUE) ## bedtools merge where transcripts are the same coordinates or within each other
system("bedtools merge -i mm10_mafAlignment.bed > mm10_mafAlignmentMerged.bed", intern=TRUE) ## also merge the maf alignments from the liftOver
system("bedtools intersect -a mm10exonsMerged.bed -b mm10_mafAlignmentBlocks.bed -wo > mm10exonOverlaps.bed", intern=TRUE) ## intersect exon coordinates with maf alignment file
## calculate coverage
mm10exon <- read.table("mm10exonsMerged.bed") ## exon file
mm10exon$length <- (mm10exon$V3 - mm10exon$V2) ## exon length in ncbi refseq
mm10overlaps <- read.table("mm10exonOverlaps.bed") ## overlap file
dim(mm10overlaps) ## check file imported correctly
sum(mm10overlaps$V10) ## 31776094bps of exons found in maf alignment
sum(mm10exon$length) ## 34221920bps of exons in refseq mm10
mm10exonCoverage <- sum(mm10overlaps$V10)/sum(mm10exon$length) ## converge alignment exon bps/total exon bps
## DUNNART exon coverage
system("bedtools sort -i dunnart_exons.bed > dunnart_exonsSorted.bed", intern = TRUE) ## sort by coordinates
system("bedtools merge -i dunnart_exonsSorted.bed > smiCra1exonsMerged.bed", intern = TRUE) ## bedtools merge where transcripts are the same coordinates or within each other
system("bedtools merge -i smiCra1_mafAlignmentBlocksSorted.bed > smiCra1_mafAlignmentMerged.bed", intern = TRUE) ## also merge the maf alignments from the liftOver
system("bedtools intersect -a smiCra1exonsMerged.bed -b smiCra1_mafAlignmentMerged.bed -wo > smiCra1_exonOverlaps.bed", intern=TRUE) ## intersect exon coordinates with maf alignment file
smiCra1exon <- read.table("smiCra1exonsMerged.bed", sep="\t", header=FALSE) ## exon file
dim(smiCra1exon) ## check file imported correctly
smiCra1exon$length <- (smiCra1exon$V3 - smiCra1exon$V2) ## exon length
smiCra1overlaps <- read.table("smiCra1_exonOverlaps.bed", sep="\t", header=FALSE) ## exon overlaps with maf alignment file
dim(smiCra1overlaps) ## check imported correctly
sum(smiCra1exon$length) # 63502228
sum(smiCra1overlaps$V7) # 43939135
smiCra1exonCoverage <- sum(smiCra1overlaps$V7)/sum(smiCra1exon$length)
## average sequence percentage mismatches for dunnart and mouse
mm10 <- read.table("mm10.divergence.statistics.csv", header=TRUE, sep="\t")
smiCra1 <- read.table("smiCra1.divergence.statistics.csv", header=TRUE, sep="\t")
mean(mm10$Div.mm10.smiCra1)
mean(smiCra1$Div.smiCra1.mm10)
## Genome coverage
readDNAStringSet("mm10_mafAlignment_nogaps.fasta") #number bp in alignment 743350114
readDNAStringSet("smiCra1_mafAlignment_nogaps.fasta") #number bp in alignment 747935780
mm10genomeSize <- 2652783500
smiCra1genomeSize <-2838290115
mm10coverage <- 743350114/mm10genomeSize
smiCra1coveage <- 747935780/smiCra1genomeSize
#!/bin/bash
### Author: Laura E Cook, University of Melbourne, 22/01/2020
### Last update: 07/02/2020
### Purpose: basic array wrapper script
# Array set up:
#SBATCH --array=618-718
# Partition for the job:
# The project ID which this job should run under:
#SBATCH -A punim0586
# Maximum number of tasks/CPU cores used by the job:
#SBATCH --mem=50000
#SBATCH --partition mig
# Send yourself an email when the job:
# aborts abnormally (fails)
#SBATCH --mail-type=FAIL
# ends successfully
#SBATCH --mail-type=END
#SBATCH --mail-user=lecook@student.unimelb.edu.au
# The maximum running time of the job in days-hours:mins:sec
#SBATCH --time=50:00:00
# The name of the job:
#SBATCH --job-name=array
# Output control:
#SBATCH --error="/data/projects/punim0586/lecook/chipseq-pipeline/cross_species/logs/lastz_%a.stderr"
#SBATCH --output="/data/projects/punim0586/lecook/chipseq-pipeline/cross_species/logs/lastz_%a.stdout"
# Load modules:
module load foss
module load lastz
conda activate wga
# Run the simulations:
command=`head -n $SLURM_ARRAY_TASK_ID /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/scripts/lastz_commands.sh | tail -n 1`
echo "SLURM_ARRAY_TASK_ID: " $SLURM_ARRAY_TASK_ID
echo $command
eval $command
\ No newline at end of file
Source diff could not be displayed: it is too large. Options to address this: view the blob.
This diff is collapsed.
##!/usr/bin/env bash
## Change file names in a loop
TRA=($(for file in *.fa.masked; do echo $file |cut -d "." -f 1;done))
echo ${TRA[@]}
for tr in ${TRA[@]};
do
mv ${tr}.fa.masked ${tr}.fa
done
\ No newline at end of file
#!usr/bin/env python3
import string
import random
import sys
import os
from Bio import SeqIO
file = sys.argv[1]
# Loop through all the files in the variable
with open(file, 'r') as fasta:
# for each record (TWAR header) in the file parse it as a FASTA
for record in SeqIO.parse(fasta, "fasta"):
# set each fasta to a seq
seq = record.seq
print("Count of A: ", seq.count("A"))
print("Count of C: ", seq.count("C"))
print("Count of G: ", seq.count("G"))
print("Count of T: ", seq.count("T"))
fasta.close()
#GCATGC
#!/usr/bin/env bash
## This script loops through all scaffolds in the dunnart genome and
## generates a separate command for each scaffold
## These commands are then used in a slurm array script to run jobs in parallel
TRA=($(for file in *.chain; do echo $file |cut -d "." -f 1-2;done))
echo ${TRA[@]}
for tr in ${TRA[@]};
do
# generate lastz commands
#echo 'lastz_32 /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/mm10.fa[multiple] /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/smiCra1_RM/'${tr}.fa 'H=2000 K=2400 L=3000 Y=9400 --format=maf > /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/maf/'${tr}_mm10.smiCra1.maf
# generate convert maf to axt format commands
#echo 'maf-convert axt '${tr}.maf' > '${tr}.axt
## build co-linear alignment chains
#echo 'axtChain -linearGap=loose /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/axt/'${tr}'.axt /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/mm10.2bit /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/smiCra1.2bit '${tr}'.chain'
echo 'chainSort '${tr}'.chain > '${tr}'.sorted.chain'
done
Source diff could not be displayed: it is too large. Options to address this: view the blob.
This diff is collapsed.
#!usr/bin/env python3
import string
import random
import sys
import os
from Bio import SeqIO
## usage python3 parse_FASTA.py [input.fasta] > [output.fasta]
file = sys.argv[1]
# Loop through all the files in the variable
with open(file, 'r') as all_TWARs:
# for each record (TWAR header) in the file parse it as a FASTA
for record in SeqIO.parse(all_TWARs, "fasta"):
# set the record ID to a variable
id = record.id
# set the TWAR sequence to a variable
seq = record.seq
print(">" + str(id) + "\n" + str(seq) + "*")
all_TWARs.close()
##!/usr/bin/env bash
## This script loops through all scaffolds in the dunnart genome and
## generates a separate RepeatMasker command for each scaffold
## These commands are then used in a slurm array script to run jobs in parallel
TRA=($(for file in *.fa; do echo $file |cut -d "." -f 1;done))
echo ${TRA[@]}
for tr in ${TRA[@]};
do
echo 'RepeatMasker -q -xsmall smiCra1/'${tr}'.fa -default_search_engine hmmer -trf_prgm /home/lecook/.conda/envs/wga/bin/trf -hmmer_dir /home/lecook/.conda/envs/wga/bin/'
done
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment