Skip to content
Snippets Groups Projects
Commit 65bae9dc authored by lecook's avatar lecook
Browse files

updated

parent c1ea4c9f
No related branches found
No related tags found
No related merge requests found
Showing
with 1604 additions and 4122 deletions
# GC content and CpG % plots
library(data.table)
library(tidyverse)
library(ggridges)
library(ggpubr)
library(reshape2)
library(RColorBrewer)
library(ggplot2)
library(VennDiagram)
library(viridis)
library(hrbrthemes)
library(gghalves)
library(dplyr)
promoter <- fread("dunnart_promoters_homerAnnot.txt")
enhancer <- fread("dunnart_enhancers_homerAnnot.txt")
peaks = list(promoter, enhancer)
peaks = Map(mutate,peaks,cre=c("promoters", "enhancers"))
peaks = lapply(peaks, function(x) x %>% dplyr::select("Chr", "Start", "End", "Peak Score", "Distance to TSS", "CpG%", "GC%", "cre") %>% as.data.table())
peaks = rbindlist(peaks)
colnames(peaks)[5:7] <- c("distanceToTSS","CpG", "GC")
peaks[is.na(peaks)] <- 0
peaks = peaks[,log10_abs_dist:=log10(abs(distanceToTSS)+1)]
promoter = subset(peaks, cre == "promoters")
enhancer = subset(peaks, cre == "enhancers")
promoter.subset=subset(promoter, log10_abs_dist >= 3.5)
promoter.subset2 = subset(promoter, log10_abs_dist <3.5)
promoter.subset$cre = "subset>3.5"
promoter.subset2$cre = "subset<3.5"
data = rbind(enhancer, promoter, promoter.subset2, promoter.subset)
p <- ggplot(data, aes(factor(cre), y = GC)) +
#geom_violin(aes(fill=factor(cre), color=factor(cre)),
# position = "dodge"
# )+
geom_boxplot(aes(color=factor(cre)),
#outlier.shape = NA,
width = .2,
notch = TRUE,
fill=c("#FCF8EC","#E4F3F1", "#FCF8EC","#E4F3F1"),
position = position_dodge(width=.1)
) + theme_bw() + xlab("") + ylab("CpG %")
pdf("GC.pdf", width=9, height = 8)
p + scale_color_manual(values = c("#E9C46A","#2A9D8F","#E9C46A","#2A9D8F")) +
scale_fill_manual(values = c("#F1DAA2","#7AC2B9","#E9C46A","#2A9D8F"))
dev.off()
pairwise.wilcox.test(data$GC, data$cre, p.adjust.method="fdr")
p <- ggplot(peaks, aes(factor(cre), y = GC.width)) +
geom_violin(aes(fill=factor(cre), color=factor(cre)),
position = "dodge"
)+
geom_boxplot(aes(color=factor(cre)),
outlier.shape = NA,
width = .15,
notch = TRUE,
fill=c("#FCF8EC","#E4F3F1"),
position = position_dodge(width=.1)
) + theme_bw() + xlab("") + ylab("GC content %")
pdf("GC.width_plot.pdf", width=9, height = 8)
p + scale_color_manual(values = c("#E9C46A","#2A9D8F")) +
scale_fill_manual(values = c("#F1DAA2","#7AC2B9")) + stat_compare_means(method = "wilcox.test" )
dev.off()
\ No newline at end of file
## GO analysis on orthologous peaks in mouse and dunnart
## Input is liftOver peaks in mouse coordinates from comparePeaks.R
## Load packages
library(clusterProfiler)
library(data.table)
library(enrichplot)
library(GOSemSim)
library(ChIPseeker)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(org.Mm.eg.db)
library(viridis)
library(ComplexHeatmap)
library(tidyverse)
library(ggplot2)
library(plyr)
library(UpSetR)
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
mmGO = godata('org.Mm.eg.db', ont="BP")
###################################################################################################
############################### GENE ONTOLOGY & PATHWAY ANALYSIS ##################################
###################################################################################################
compareOrthoPeaks <- function(bedList, dotplot, names, upsetPlot, mergedNames){
bedFiles = list.files(bedList, pattern= "liftOver.bed", full.names=T) # create list of files in directory
bedFiles = as.list(bedFiles)
## Annotate peaks
peakAnnoList <- lapply(bedFiles, annotatePeak, TxDb=txdb,
tssRegion=c(-3000, 3000), verbose=FALSE, annoDb="org.Mm.eg.db")
## Make dataframe from upset plot
df2 = lapply(peakAnnoList, function(x) as.data.frame(x@anno))
subset = lapply(df2, function(x) x %>% dplyr::select(ENSEMBL) %>% as.data.table() %>% unique())
names(subset) = names
subset = Map(mutate, subset, stage = names(subset))
merged <- subset %>% purrr::reduce(full_join, by = "ENSEMBL") # join all dataframes by ensembl geneID
merged[is.na(merged)] <- 0
merged = as.data.frame(merged)
geneId = merged$geneId
colnames(merged) = mergedNames
upset.df = data.frame(lapply(merged[,2:8], function(x) as.numeric(x!="0")))
rownames(upset.df) = geneId
## Upset plot
pdf(upsetPlot)
print(upset(upset.df,
nsets=7, order.by="freq",
number.angles = 45))
dev.off()
## Make a background universe for GO enrichment testing
## Take the union of ALL genes with peaks for mouse and dunnart
bedFiles = list.files(backgroundList, pattern= ".txt", full.names=T) # create list of files in directory
bedFiles = as.list(bedFiles)
data = lapply(bedFiles, function(x) fread(x, header=TRUE, sep="\t", quote = "", na.strings=c("", "NA")))
data = lapply(data, function(x) x=setnames(x, old="geneId", new="mouseensembl", skip_absent=TRUE) %>% as.data.table())
genes = lapply(data, function(i) as.data.frame(i)$mouseensembl)
genes = lapply(genes, function(x) bitr(x, fromType="ENSEMBL", toType="ENTREZID", OrgDb="org.Mm.eg.db"))
bkg_genes = lapply(genes, function(i) as.data.frame(i)$ENTREZID)
bkg_genes_union = Reduce(union, genes)
## GO terms
genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
names(genes) = names
go_cluster <- pairwise_termsim(
setReadable(
compareCluster(
geneCluster = genes,
fun = enrichGO,
ont="BP",
keyType="ENTREZID",
pAdjustMethod = "fdr",
pvalueCutoff = 0.001,
OrgDb = org.Mm.eg.db),
OrgDb = org.Mm.eg.db,
keyType="ENTREZID"))
## Dotplot
pdf(dotplot, width = 20, height = 10)
print(dotplot(go_cluster) +
scale_color_viridis() +
theme(axis.text.x = element_text(angle = 45, hjust=1)))
dev.off()
return(genes)
}
names = c("E10", "E11", "E12", "E13", "E14", "E15", "dunnart")
all_promoters = compareOrthoPeaks(bedList = "GO_analysis/promoters", backgroundList = "../consensus/promoters/clustered", names = c("E10_mouse", "E10_combined", "E10_dunnart", "E11_mouse", "E11_combined", "E11_dunnart",
"E12_mouse", "E12_combined", "E12_dunnart", "E13_mouse", "E13_combined", "E13_dunnart",
"E14_mouse", "E14_combined", "E14_dunnart", "E15_mouse", "E15_combined", "E15_dunnart"), mergedNames = c("geneId", "E10_mouse", "E10_combined", "E10_dunnart", "E11_mouse", "E11_combined", "E11_dunnart",
"E12_mouse", "E12_combined", "E12_dunnart", "E13_mouse", "E13_combined", "E13_dunnart",
"E14_mouse", "E14_combined", "E14_dunnart", "E15_mouse", "E15_combined", "E15_dunnart"), dotplot = "promoter_dotplot_GOenrich.pdf", upsetPlot = "promoter_upsetPlot_GOenrich.pdf")
all.enhancers = compareOrthoPeaks(bedList = "GO_analysis/enhancers", names = c("E10_dunnart", "E10_mouse", "E10_combined", "E11_dunnart","E11_mouse", "E11_combined",
"E12_dunnart","E12_mouse", "E12_combined", "E13_dunnart","E13_mouse", "E13_combined", "E14_dunnart",
"E14_mouse", "E14_combined", "E15_dunnart", "E15_mouse","E15_combined"), mergedNames = c("geneId", "E10_dunnart", "E10_mouse", "E10_combined", "E11_dunnart","E11_mouse", "E11_combined",
"E12_dunnart","E12_mouse", "E12_combined", "E13_dunnart","E13_mouse", "E13_combined", "E14_dunnart",
"E14_mouse", "E14_combined", "E15_dunnart", "E15_mouse","E15_combined"), dotplot = "enhancer_dotplot_GOenrich.pdf", upsetPlot = "enhancer_upsetPlot_GOenrich.pdf")
enhancerSim = mclusterSim(all.enhancers, semData=mmGO, measure="Wang", combine="BMA")
pdf("enhancerGOSemSim_heatmap.pdf")
Heatmap(enhancerSim)
dev.off()
promoterSim = mclusterSim(all.promoters, semData=mmGO, measure="Wang", combine="BMA")
pdf("promoterGOSemSim_heatmap.pdf")
Heatmap(promoterSim)
dev.off()
## Now look at merged mouse peaks across all stages and compare to the dunnart
##
bedFiles = list("GO_analysis/merged/dunnart_specific_mouse_merged_enhancerOverlapmm10_50bpsummits_backTOsmiCra_smiCraTOmm10.bed", "GO_analysis/merged/mouse_merged_enhancerOverlapmm10_50bpsummits_backTOsmiCra_smiCraTOmm10.bed", "GO_analysis/merged/mouse_specific_enhancer_overlap_50bpsummits.sorted.merged_mm10TOsmiCra_backTOmm10.bed")
bedFiles = list("GO_analysis/merged/dunnart_specific_mouse_merged_promoterOverlapmm10_50bpsummits_backTOsmiCra_smiCraTOmm10.bed", "GO_analysis/merged/mouse_merged_promoterOverlapmm10_50bpsummits_backTOsmiCra_smiCraTOmm10.bed", "GO_analysis/merged/mouse_specific_promoter_overlap_50bpsummits.sorted.merged_mm10TOsmiCra_backTOmm10.bed")
peakAnnoList <- lapply(bedFiles, annotatePeak, TxDb=txdb,
tssRegion=c(-3000, 3000), verbose=FALSE, annoDb="org.Mm.eg.db")
genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
names(genes) = c("dunnart_biased", "common", "mouse_biased")
go_cluster <- simplify(
setReadable(
compareCluster(
geneCluster = genes,
fun = enrichGO,
ont="BP",
keyType="ENTREZID",
pvalueCutoff = 0.01,
OrgDb = org.Mm.eg.db),
OrgDb = org.Mm.eg.db,
keyType="ENTREZID"),
cutoff=0.9,
by="p.adjust",
select_fun=min)
pdf("summary_enhancer_cnetplot_GOenrich.pdf")
print(cnetplot(go_cluster))
dev.off()
pdf("summary_enhancer_dotplot_GOenrich.pdf", width = 10, height = 8)
print(dotplot(go_cluster, showCategory = 15) + theme(axis.text.x = element_text(angle = 45, hjust=1)))
dev.off()
## Script that takes as input file with GO enriched terms and returns you a barplot
library(dplyr); library(data.table)
library(magrittr);
library(ggthemes);library(ggplot2);library(ggpubr)
## change here the working directory(ies)
setwd('/Users/lauracook/OneDrive - The University of Melbourne/PhD (2018-2021)/6-ChIPseq/3-DataAnalysis/4-dunnartPeaks/1-FullDataset/')
go_files_dir='/data/projects/punim0586/lecook/chipseq-pipeline/cross_species/peakAnnotation/overlaps/promoters/GO_noOverlap/'
plot_dir='/data/projects/punim0586/lecook/chipseq-pipeline/cross_species/peakAnnotation/overlaps/promoters/GO_noOverlap'
## read files into the directory
## it creates a list of dataframes (one per file)
## then discards the geneID field
read_files=function(dir){
x=as.character(list.files(dir,recursive =F,full.names = T)) %>%
lapply(function(y)
fread(y,sep = '\t',header = T)[
,geneID:=NULL
] %>% setorderv('p.adjust',1)) ## sorted by most significant ones
## this returns you the filename without file extension.
## I will use it as plot title below (you can remove it if u want)
file_names=as.character(list.files(dir,recursive =F,full.names = F))
file_names=gsub("\\..*","",file_names)
names(x)=file_names
## I will make a new field with the filename that I will use as categorical variable to assign different colors
## i.e. each plot will have a different color
x=Map(mutate,x,filename=names(x))
x=lapply(x,function(y)y=as.data.table(y))
return(x)
}
go_terms=read_files(go_files_dir)
## I will make another column with the color for the given file.
## all row entries will be identical but I just need this for plotting
## I am going to do in this way so that u can change the colors as u like for each file
colors=c('#6699CC','#6699CC','#6699CC','#6699CC','#6699CC','#6699CC','#6699CC','#6699CC',
'#6699CC','#6699CC','#6699CC','#6699CC','#6699CC','#6699CC') ## remember to add a color per input file
go_colors=Map(cbind, go_terms, color=colors)
go_colors=lapply(go_colors, function(x)x=as.data.table(x))
## this keeps only the significant GO terms
significant_go_terms=lapply(go_colors,function(x)x=x[p.adjust<=0.05])
plot_results=function(file){
top_n_GOs=copy(file)
top_n_GOs=top_n_GOs[1:20]
ggplot(top_n_GOs,
aes(x=reorder(Description,-log10(p.adjust)), y=-log10(p.adjust),fill=filename)) + ## this reorders the x-axis and converts the p-adj to log10 values
geom_bar(stat = 'identity',position = 'dodge')+
xlab(" ") +
ylab("\n -Log10 (P) \n ") +
scale_y_continuous(breaks = round(seq(0, max(-log10(top_n_GOs$p.adjust)), by = 2), 1)) +
scale_fill_manual(name= " ",values=top_n_GOs$color)+
theme( ## these are just parameters to change the theme of the plot. you can add more or remove them
legend.position='none',
legend.background=element_rect(),
axis.text.x=element_text(angle=0, hjust=1.10),
axis.text.y=element_text(angle=0, vjust=0.8),
axis.title=element_text(),
axis.line = element_line(color = "black",size = 0, linetype = "solid"),
panel.background =element_rect(fill = 'white', size = 0.5,colour = 'black'),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
title=element_text()) +
coord_flip()
}
go_plot=lapply(significant_go_terms,function(x)x=plot_results(x))
# Save each plot to a different pdf in the plot_dir/
lapply(names(go_plot),
function(x)ggsave(filename=paste(plot_dir,x,'_plot',".pdf",sep=""),
plot=go_plot[[x]]))
## input: vector of GO terms
## output: heatmap semantic similarity scores
library(tools)
library(GOSemSim)
library(ComplexHeatmap)
library(circlize)
library('BiocParallel')
library(data.table)
library(tidyverse)
## GO data from mouse org database for biological processes
## Can do for mouse phenotype and molecular processes
mmGO = godata('org.Mm.eg.db', ont="BP")
plot_dir = ("consensus/promoters/")
## Wang is actually a guy but here it refers to a semantic similarity metric (see https://yulab-smu.top/biomedical-knowledge-mining-book/semantic-similarity-overview.html)
## combine = 'BMA' returns a single value estimating how similar the 2 sets are, you can change it with min,max,avg ... (see link above)
goPCA <- function(fileList, file_to_compare_to){
files =list.files(fileList, pattern= "", full.names=T) # create list of files in directory
files = as.list(files)
list = lapply(files, function(x) read.table(x, header=TRUE, sep="\t", quote = "") %>% as.data.table()) # read in all files
## this keeps only the significant GO terms
significant_go_terms=lapply(list,function(x) x=x[p.adjust<=0.01])
listVector = lapply(significant_go_terms, function(x) dplyr::pull(x, ID))
file_input = read.table(file_to_compare_to, sep="\t", header=TRUE, quote="") %>% as.data.frame()
file_sig_go = dplyr::pull(file_input[file_input$p.adjust <= 0.01,], ID)
## This extracts the file names for all the files in the list (all embryonic stages)
file_names=as.character(list.files(fileList,recursive =F,full.names = F)) ## directory with all embryonic stages
file_names=gsub("\\..*","",file_names) ## Removes everything except 'E*.5'
file_names=gsub("[^0-9]","", file_names) ## Removes the 'E' so that it's a number
names(listVector)=file_names ## Add new names to list
## Check if input file name is less than or equal to any in listVector
## This will avoid running comparisons that have already been run
## Eg. E10.5 vs E11.5 is the same as E11.5 vs E10.5
file_to_compare_to_name = basename(file_path_sans_ext(file_to_compare_to)) ## Name of file to compare to
if(grepl('^E', file_to_compare_to_name) == TRUE){ ## If file begins with E (this is so that it doesn't stall with the dunnart files)
file_to_compare_to_name=gsub("\\..*","",file_to_compare_to_name) ## Removes everything except 'E*.5'
file_to_compare_to_name=gsub("[^0-9]","",file_to_compare_to_name) ## Removes the 'E' so that it's a number
listVectorSubset = listVector[names(listVector) > file_to_compare_to_name] ## Keep only vectors in the list that are greater than the file being compared to
## Eg. this means that when comparing E12 to the list then it will only compare to 13,14 & 15
} else
listVectorSubset = listVector
## Compare the similarity between GO lists
## This will parallelise the lapply function
comparisons = bplapply(listVectorSubset,function(x) mgoSim(file_sig_go,x, semData=mmGO,measure="Wang", combine='NULL'))
return(comparisons)
}
## This command checks the time it takes to run
## One comparison is about 95 seconds
system.time(mgoSim(listVector[[1]], listVector[[2]],semData=mmGO, measure="Wang", combine='BMA'))
register(SerialParam())
## Promoters
plot_dir = ("consensus/promoters/")
dunnartvsmouse = GOsimilarity(fileList = "consensus/promoters/mouse_GO",
file_to_compare_to = "consensus/promoters/dunnart_GO/promoter_mm10GOenrich") #dunnart versus all mouse stages
e10vs = GOsimilarity(fileList = "consensus/promoters/mouse_GO",
file_to_compare_to = "consensus/promoters/mouse_GO/E10.5_promoter_mm10GOenrich")
e11vs = GOsimilarity(fileList = "consensus/promoters/mouse_GO",
file_to_compare_to = "consensus/promoters/mouse_GO/E11.5_promoter_mm10GOenrich")
e12vs = GOsimilarity(fileList = "consensus/promoters/mouse_GO",
file_to_compare_to = "consensus/promoters/mouse_GO/E12.5_promoter_mm10GOenrich")
e13vs = GOsimilarity(fileList = "consensus/promoters/mouse_GO",
file_to_compare_to = "consensus/promoters/mouse_GO/E13.5_promoter_mm10GOenrich")
e14vs = GOsimilarity(fileList = "consensus/promoters/mouse_GO",
file_to_compare_to = "consensus/promoters/mouse_GO/E14.5_promoter_mm10GOenrich")
## Enhancers
plot_dir = ("consensus/enhancers/")
dunnartvsmouse = GOsimilarity(fileList = "consensus/enhancers/mouse_GO",
file_to_compare_to = "consensus/enhancers/dunnart_GO/enhancer_mm10GOenrich_subsample.txt")
e10vs = GOsimilarity(fileList = "consensus/enhancers/mouse_GO",
file_to_compare_to = "consensus/enhancers/mouse_GO/E10.5_enhancer_mm10GOenrich") #dunnart versus all mouse stages
e11vs = GOsimilarity(fileList = "consensus/enhancers/mouse_GO",
file_to_compare_to = "consensus/enhancers/mouse_GO/E11.5_enhancer_mm10GOenrich") #dunnart versus all mouse stages
e12vs = GOsimilarity(fileList = "consensus/enhancers/mouse_GO",
file_to_compare_to = "consensus/enhancers/mouse_GO/E12.5_enhancer_mm10GOenrich") #dunnart versus all mouse stages
e13vs = GOsimilarity(fileList = "consensus/enhancers/mouse_GO",
file_to_compare_to = "consensus/enhancers/mouse_GO/E13.5_enhancer_mm10GOenrich") #dunnart versus all mouse stages
e14vs = GOsimilarity(fileList = "consensus/enhancers/mouse_GO",
file_to_compare_to = "consensus/enhancers/mouse_GO/E14.5_enhancer_mm10GOenrich") #dunnart versus all mouse stages
README.md 100644 → 100755
File mode changed from 100644 to 100755
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
## Load packages
library(data.table)
library(tidyverse)
bedList("")
bedFiles = list.files(pattern= "dunnart_mouse", full.names=T) # create list of files in directory
files = as.list(bedFiles)
data = lapply(files, function(x) fread(x, header=FALSE, sep="\t", quote = "", na.strings=c("", "NA"))) # read in all files
data2 = rbindlist(data,)
data2$score <- ifelse(data2$V27 == data2$V46, "1", "0")
split = data2 %>% split(by="V49") ## split by stage
counts = lapply(split, function(x) length(which(x$score !=0)))
\ No newline at end of file
This diff is collapsed.
name: wga
channels:
- bioconda
- biocore
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=1_gnu
- bzip2=1.0.8=h516909a_3
- c-ares=1.11.0=h470a237_1
- ca-certificates=2020.6.20=hecda079_0
- cairo=1.16.0=h3fc0475_1005
- cd-hit=4.8.1=h8b12597_3
- certifi=2020.6.20=py38h32f6830_0
- curl=7.71.1=he644dc0_6
- entrez-direct=13.9=pl526h375a9b1_0
- expat=2.2.9=he1b5a44_2
- fontconfig=2.13.1=h1056068_1002
- freetype=2.10.2=he06d7ca_0
- fribidi=1.0.10=h516909a_0
- genometools-genometools=1.6.1=py38h23571c4_2
- gettext=0.19.8.1=hc5be6a0_1002
- glib=2.66.0=h0dae87d_0
- gmp=6.1.2=hf484d3e_1000
- gnutls=3.5.19=h2a4e5f8_1
- graphite2=1.3.13=he1b5a44_1001
- harfbuzz=2.7.2=hee91db6_0
- hmmer=3.3.1=he1b5a44_0
- icu=67.1=he1b5a44_0
- krb5=1.17.1=hfafb76e_3
- ld_impl_linux-64=2.35=h769bd43_9
- libcurl=7.71.1=hcdd3856_6
- libedit=3.1.20191231=he28a2e2_2
- libev=4.33=h516909a_1
- libffi=3.2.1=he1b5a44_1007
- libgcc-ng=9.3.0=h24d8f2e_16
- libgomp=9.3.0=h24d8f2e_16
- libiconv=1.16=h516909a_0
- libnghttp2=1.41.0=hab1572f_1
- libpng=1.6.37=hed695b0_2
- libssh2=1.9.0=hab1572f_5
- libstdcxx-ng=9.3.0=hdf63c60_16
- libuuid=2.32.1=h14c3975_1000
- libxcb=1.13=h14c3975_1002
- libxml2=2.9.10=h68273f3_2
- llvm-openmp=8.0.1=hc9558a2_0
- ltr_retriever=2.9.0=0
- mafft=7.471=h516909a_0
- ncurses=6.2=he1b5a44_1
- nettle=3.3=0
- nseg=1.0.1=h516909a_0
- openmp=8.0.1=0
- openssl=1.1.1h=h516909a_0
- pango=1.42.4=h7062337_4
- pcre=8.44=he1b5a44_0
- perl=5.26.2=h516909a_1006
- perl-app-cpanminus=1.7044=pl526_1
- perl-archive-tar=2.32=pl526_0
- perl-base=2.23=pl526_1
- perl-business-isbn=3.004=pl526_0
- perl-business-isbn-data=20140910.003=pl526_0
- perl-carp=1.38=pl526_3
- perl-common-sense=3.74=pl526_2
- perl-compress-raw-bzip2=2.087=pl526he1b5a44_0
- perl-compress-raw-zlib=2.087=pl526hc9558a2_0
- perl-constant=1.33=pl526_1
- perl-data-dumper=2.173=pl526_0
- perl-digest-hmac=1.03=pl526_3
- perl-digest-md5=2.55=pl526_0
- perl-encode=2.88=pl526_1
- perl-encode-locale=1.05=pl526_6
- perl-exporter=5.72=pl526_1
- perl-exporter-tiny=1.002001=pl526_0
- perl-extutils-makemaker=7.36=pl526_1
- perl-file-listing=6.04=pl526_1
- perl-file-path=2.16=pl526_0
- perl-file-temp=0.2304=pl526_2
- perl-file-which=1.23=pl526_0
- perl-html-parser=3.72=pl526h6bb024c_5
- perl-html-tagset=3.20=pl526_3
- perl-html-tree=5.07=pl526_1
- perl-http-cookies=6.04=pl526_0
- perl-http-daemon=6.01=pl526_1
- perl-http-date=6.02=pl526_3
- perl-http-message=6.18=pl526_0
- perl-http-negotiate=6.01=pl526_3
- perl-io-compress=2.087=pl526he1b5a44_0
- perl-io-html=1.001=pl526_2
- perl-io-socket-ssl=2.066=pl526_0
- perl-io-zlib=1.10=pl526_2
- perl-json=4.02=pl526_0
- perl-json-xs=2.34=pl526h6bb024c_3
- perl-libwww-perl=6.39=pl526_0
- perl-list-moreutils=0.428=pl526_1
- perl-list-moreutils-xs=0.428=pl526_0
- perl-lwp-mediatypes=6.04=pl526_0
- perl-lwp-protocol-https=6.07=pl526_4
- perl-mime-base64=3.15=pl526_1
- perl-mozilla-ca=20180117=pl526_1
- perl-net-http=6.19=pl526_0
- perl-net-ssleay=1.88=pl526h90d6eec_0
- perl-ntlm=1.09=pl526_4
- perl-parent=0.236=pl526_1
- perl-pathtools=3.75=pl526h14c3975_1
- perl-scalar-list-utils=1.52=pl526h516909a_0
- perl-socket=2.027=pl526_1
- perl-storable=3.15=pl526h14c3975_0
- perl-test-requiresinternet=0.05=pl526_0
- perl-text-soundex=3.05=pl526_1000
- perl-time-local=1.28=pl526_1
- perl-try-tiny=0.30=pl526_1
- perl-types-serialiser=1.0=pl526_2
- perl-uri=1.76=pl526_0
- perl-www-robotrules=6.02=pl526_3
- perl-xml-namespacesupport=1.12=pl526_0
- perl-xml-parser=2.44=pl526h4e0c4b3_7
- perl-xml-sax=1.02=pl526_0
- perl-xml-sax-base=1.09=pl526_0
- perl-xml-sax-expat=0.51=pl526_3
- perl-xml-simple=2.25=pl526_1
- perl-xsloader=0.24=pl526_0
- pip=20.2.3=py_0
- pixman=0.38.0=h516909a_1003
- pthread-stubs=0.4=h14c3975_1001
- python=3.8.5=h1103e12_8_cpython
- python_abi=3.8=1_cp38
- readline=8.0=he28a2e2_2
- recon=1.08=h516909a_2
- repeatmasker=4.1.0=pl526_0
- repeatmodeler=2.0.1=pl526_0
- repeatscout=1.0.6=h516909a_1
- rmblast=2.9.0=h2d02072_0
- setuptools=49.6.0=py38h32f6830_1
- sqlite=3.33.0=h4cf870e_0
- tk=8.6.10=hed695b0_0
- trf=4.09.1=h516909a_0
- wheel=0.35.1=pyh9f0ad1d_0
- xorg-kbproto=1.0.7=h14c3975_1002
- xorg-libice=1.0.10=h516909a_0
- xorg-libsm=1.2.3=h84519dc_1000
- xorg-libx11=1.6.12=h516909a_0
- xorg-libxau=1.0.9=h14c3975_0
- xorg-libxdmcp=1.1.3=h516909a_0
- xorg-libxext=1.3.4=h516909a_0
- xorg-libxrender=0.9.10=h516909a_1002
- xorg-renderproto=0.11.1=h14c3975_1002
- xorg-xextproto=7.3.0=h14c3975_1002
- xorg-xproto=7.0.31=h14c3975_1007
- xz=5.2.5=h516909a_1
- zlib=1.2.11=h516909a_1009
prefix: /home/lecook/.conda/envs/wga
Source diff could not be displayed: it is too large. Options to address this: view the blob.
This diff is collapsed.
##!/usr/bin/env bash
## Change file names in a loop
TRA=($(for file in *.fa.masked; do echo $file |cut -d "." -f 1;done))
echo ${TRA[@]}
for tr in ${TRA[@]};
do
mv ${tr}.fa.masked ${tr}.fa
done
\ No newline at end of file
#!/usr/bin/env bash
## This script loops through all scaffolds in the dunnart genome and
## generates a separate command for each scaffold
## These commands are then used in a slurm array script to run jobs in parallel
TRA=($(for file in *.chain; do echo $file |cut -d "." -f 1-2;done))
echo ${TRA[@]}
for tr in ${TRA[@]};
do
# generate lastz commands
#echo 'lastz_32 /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/mm10.fa[multiple] /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/smiCra1_RM/'${tr}.fa 'H=2000 K=2400 L=3000 Y=9400 --format=maf > /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/maf/'${tr}_mm10.smiCra1.maf
# generate convert maf to axt format commands
#echo 'maf-convert axt '${tr}.maf' > '${tr}.axt
## build co-linear alignment chains
#echo 'axtChain -linearGap=loose /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/axt/'${tr}'.axt /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/mm10.2bit /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/smiCra1.2bit '${tr}'.chain'
echo 'chainSort '${tr}'.chain > '${tr}'.sorted.chain'
done
This diff is collapsed.
This diff is collapsed.
#!usr/bin/env python3
import string
import random
import sys
import os
from Bio import SeqIO
## usage python3 parse_FASTA.py [input.fasta] > [output.fasta]
file = sys.argv[1]
# Loop through all the files in the variable
with open(file, 'r') as all_TWARs:
# for each record (TWAR header) in the file parse it as a FASTA
for record in SeqIO.parse(all_TWARs, "fasta"):
# set the record ID to a variable
id = record.id
# set the TWAR sequence to a variable
seq = record.seq
print(">" + str(id) + "\n" + str(seq) + "*")
all_TWARs.close()
##!/usr/bin/env bash
## This script loops through all scaffolds in the dunnart genome and
## generates a separate RepeatMasker command for each scaffold
## These commands are then used in a slurm array script to run jobs in parallel
TRA=($(for file in *.fa; do echo $file |cut -d "." -f 1;done))
echo ${TRA[@]}
for tr in ${TRA[@]};
do
echo 'RepeatMasker -q -xsmall smiCra1/'${tr}'.fa -default_search_engine hmmer -trf_prgm /home/lecook/.conda/envs/wga/bin/trf -hmmer_dir /home/lecook/.conda/envs/wga/bin/'
done
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment