updated

65bae9dc · lecook · c1ea4c9f · 65bae9dc · 65bae9dc · 65bae9dc
Commit 65bae9dc authored 3 years ago by lecook
--- a/GC_content.R
+++ b/GC_content.R
+# GC content and CpG % plots 
+
+library(data.table) 
+library(tidyverse)
+library(ggridges)
+library(ggpubr)
+library(reshape2)
+library(RColorBrewer)
+library(ggplot2)
+library(VennDiagram)
+library(viridis)
+library(hrbrthemes)
+library(gghalves)
+library(dplyr)
+
+
+promoter <- fread("dunnart_promoters_homerAnnot.txt")
+enhancer <- fread("dunnart_enhancers_homerAnnot.txt")
+peaks = list(promoter, enhancer)
+peaks = Map(mutate,peaks,cre=c("promoters", "enhancers"))
+peaks = lapply(peaks, function(x) x %>% dplyr::select("Chr", "Start", "End", "Peak Score", "Distance to TSS", "CpG%", "GC%", "cre") %>% as.data.table())
+peaks = rbindlist(peaks)
+colnames(peaks)[5:7] <- c("distanceToTSS","CpG", "GC")
+peaks[is.na(peaks)] <- 0
+peaks = peaks[,log10_abs_dist:=log10(abs(distanceToTSS)+1)]
+promoter = subset(peaks, cre == "promoters")
+enhancer = subset(peaks, cre == "enhancers")
+promoter.subset=subset(promoter, log10_abs_dist >= 3.5)
+promoter.subset2 = subset(promoter, log10_abs_dist <3.5)
+promoter.subset$cre = "subset>3.5"
+promoter.subset2$cre = "subset<3.5"
+data = rbind(enhancer, promoter, promoter.subset2, promoter.subset)
+
+p <- ggplot(data, aes(factor(cre), y = GC)) + 
+  #geom_violin(aes(fill=factor(cre), color=factor(cre)),
+  #  position = "dodge"
+  #  )+
+  geom_boxplot(aes(color=factor(cre)),
+    #outlier.shape = NA,
+    width = .2, 
+    notch = TRUE,
+    fill=c("#FCF8EC","#E4F3F1", "#FCF8EC","#E4F3F1"),
+    position = position_dodge(width=.1)
+  ) + theme_bw() + xlab("") + ylab("CpG %") 
+pdf("GC.pdf", width=9, height = 8)
+p +  scale_color_manual(values = c("#E9C46A","#2A9D8F","#E9C46A","#2A9D8F")) +
+scale_fill_manual(values = c("#F1DAA2","#7AC2B9","#E9C46A","#2A9D8F"))
+dev.off()
+
+pairwise.wilcox.test(data$GC, data$cre, p.adjust.method="fdr")
+
+p <- ggplot(peaks, aes(factor(cre), y = GC.width)) + 
+  geom_violin(aes(fill=factor(cre), color=factor(cre)),
+    position = "dodge"
+    )+
+  geom_boxplot(aes(color=factor(cre)),
+    outlier.shape = NA,
+    width = .15, 
+    notch = TRUE,
+    fill=c("#FCF8EC","#E4F3F1"),
+    position = position_dodge(width=.1)
+  ) + theme_bw() + xlab("") + ylab("GC content %") 
+pdf("GC.width_plot.pdf", width=9, height = 8)
+p +  scale_color_manual(values = c("#E9C46A","#2A9D8F")) +
+scale_fill_manual(values = c("#F1DAA2","#7AC2B9")) + stat_compare_means(method = "wilcox.test" )
+dev.off()
\ No newline at end of file
--- a/GO_orthologous_peaks.R
+++ b/GO_orthologous_peaks.R
+## GO analysis on orthologous peaks in mouse and dunnart
+## Input is liftOver peaks in mouse coordinates from comparePeaks.R
+
+## Load packages
+library(clusterProfiler)
+library(data.table)
+library(enrichplot)
+library(GOSemSim)
+library(ChIPseeker)
+library(TxDb.Mmusculus.UCSC.mm10.knownGene)
+library(org.Mm.eg.db)
+library(viridis)
+library(ComplexHeatmap)
+library(tidyverse)
+library(ggplot2)
+library(plyr)
+library(UpSetR)
+txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
+mmGO = godata('org.Mm.eg.db', ont="BP") 
+
+###################################################################################################
+############################### GENE ONTOLOGY & PATHWAY ANALYSIS ##################################
+###################################################################################################
+
+compareOrthoPeaks <- function(bedList, dotplot, names, upsetPlot, mergedNames){
+    bedFiles = list.files(bedList, pattern= "liftOver.bed", full.names=T) # create list of files in directory
+    bedFiles = as.list(bedFiles)
+
+    ## Annotate peaks
+    peakAnnoList <- lapply(bedFiles, annotatePeak, TxDb=txdb,
+                       tssRegion=c(-3000, 3000), verbose=FALSE, annoDb="org.Mm.eg.db")
+    
+    ## Make dataframe from upset plot
+    df2 = lapply(peakAnnoList, function(x) as.data.frame(x@anno))
+    subset = lapply(df2, function(x) x %>% dplyr::select(ENSEMBL) %>% as.data.table() %>% unique())
+    names(subset) = names
+    subset = Map(mutate, subset, stage = names(subset)) 
+    merged <- subset %>% purrr::reduce(full_join, by = "ENSEMBL")  # join all dataframes by ensembl geneID
+    merged[is.na(merged)] <- 0
+    merged = as.data.frame(merged)
+    geneId = merged$geneId
+    colnames(merged) = mergedNames 
+    upset.df = data.frame(lapply(merged[,2:8], function(x) as.numeric(x!="0")))
+    rownames(upset.df) = geneId
+
+    ## Upset plot
+    pdf(upsetPlot)
+    print(upset(upset.df, 
+    nsets=7, order.by="freq", 
+    number.angles = 45))
+    dev.off()
+
+    ## Make a background universe for GO enrichment testing
+    ## Take the union of ALL genes with peaks for mouse and dunnart
+    bedFiles = list.files(backgroundList, pattern= ".txt", full.names=T) # create list of files in directory
+    bedFiles = as.list(bedFiles)
+    data = lapply(bedFiles, function(x) fread(x, header=TRUE, sep="\t", quote = "", na.strings=c("", "NA")))
+    data = lapply(data, function(x) x=setnames(x, old="geneId", new="mouseensembl", skip_absent=TRUE) %>% as.data.table())
+    genes = lapply(data, function(i) as.data.frame(i)$mouseensembl)
+    genes = lapply(genes, function(x) bitr(x, fromType="ENSEMBL", toType="ENTREZID", OrgDb="org.Mm.eg.db"))
+    bkg_genes = lapply(genes, function(i) as.data.frame(i)$ENTREZID)
+    bkg_genes_union = Reduce(union, genes) 
+
+    ## GO terms 
+    genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
+    names(genes) = names
+    go_cluster <- pairwise_termsim(
+      setReadable(
+      compareCluster(
+        geneCluster = genes,
+        fun = enrichGO, 
+        ont="BP",
+        keyType="ENTREZID", 
+        pAdjustMethod = "fdr",
+        pvalueCutoff = 0.001, 
+        OrgDb = org.Mm.eg.db),
+      OrgDb = org.Mm.eg.db, 
+      keyType="ENTREZID"))
+
+    ## Dotplot
+    pdf(dotplot, width = 20, height = 10)
+    print(dotplot(go_cluster) + 
+    scale_color_viridis() + 
+    theme(axis.text.x = element_text(angle = 45, hjust=1)))
+    dev.off()
+
+    return(genes)
+}
+
+names = c("E10", "E11", "E12", "E13", "E14", "E15", "dunnart")
+
+all_promoters = compareOrthoPeaks(bedList = "GO_analysis/promoters", backgroundList = "../consensus/promoters/clustered", names = c("E10_mouse", "E10_combined", "E10_dunnart", "E11_mouse", "E11_combined", "E11_dunnart",
+    "E12_mouse", "E12_combined", "E12_dunnart", "E13_mouse", "E13_combined", "E13_dunnart", 
+    "E14_mouse", "E14_combined", "E14_dunnart", "E15_mouse", "E15_combined", "E15_dunnart"), mergedNames = c("geneId", "E10_mouse", "E10_combined", "E10_dunnart", "E11_mouse", "E11_combined", "E11_dunnart",
+    "E12_mouse", "E12_combined", "E12_dunnart", "E13_mouse", "E13_combined", "E13_dunnart", 
+    "E14_mouse", "E14_combined", "E14_dunnart", "E15_mouse", "E15_combined", "E15_dunnart"), dotplot = "promoter_dotplot_GOenrich.pdf", upsetPlot = "promoter_upsetPlot_GOenrich.pdf")
+
+all.enhancers = compareOrthoPeaks(bedList = "GO_analysis/enhancers", names = c("E10_dunnart", "E10_mouse", "E10_combined", "E11_dunnart","E11_mouse", "E11_combined", 
+     "E12_dunnart","E12_mouse", "E12_combined",  "E13_dunnart","E13_mouse", "E13_combined", "E14_dunnart", 
+    "E14_mouse", "E14_combined",  "E15_dunnart", "E15_mouse","E15_combined"), mergedNames = c("geneId", "E10_dunnart", "E10_mouse", "E10_combined", "E11_dunnart","E11_mouse", "E11_combined", 
+     "E12_dunnart","E12_mouse", "E12_combined",  "E13_dunnart","E13_mouse", "E13_combined", "E14_dunnart", 
+    "E14_mouse", "E14_combined",  "E15_dunnart", "E15_mouse","E15_combined"), dotplot = "enhancer_dotplot_GOenrich.pdf", upsetPlot = "enhancer_upsetPlot_GOenrich.pdf")
+
+enhancerSim = mclusterSim(all.enhancers, semData=mmGO, measure="Wang", combine="BMA")
+pdf("enhancerGOSemSim_heatmap.pdf")
+Heatmap(enhancerSim)
+dev.off()
+
+promoterSim = mclusterSim(all.promoters, semData=mmGO, measure="Wang", combine="BMA")
+pdf("promoterGOSemSim_heatmap.pdf")
+Heatmap(promoterSim)
+dev.off()
+
+## Now look at merged mouse peaks across all stages and compare to the dunnart
+
+## 
+bedFiles = list("GO_analysis/merged/dunnart_specific_mouse_merged_enhancerOverlapmm10_50bpsummits_backTOsmiCra_smiCraTOmm10.bed", "GO_analysis/merged/mouse_merged_enhancerOverlapmm10_50bpsummits_backTOsmiCra_smiCraTOmm10.bed", "GO_analysis/merged/mouse_specific_enhancer_overlap_50bpsummits.sorted.merged_mm10TOsmiCra_backTOmm10.bed")
+bedFiles = list("GO_analysis/merged/dunnart_specific_mouse_merged_promoterOverlapmm10_50bpsummits_backTOsmiCra_smiCraTOmm10.bed", "GO_analysis/merged/mouse_merged_promoterOverlapmm10_50bpsummits_backTOsmiCra_smiCraTOmm10.bed", "GO_analysis/merged/mouse_specific_promoter_overlap_50bpsummits.sorted.merged_mm10TOsmiCra_backTOmm10.bed")
+
+peakAnnoList <- lapply(bedFiles, annotatePeak, TxDb=txdb,
+                       tssRegion=c(-3000, 3000), verbose=FALSE, annoDb="org.Mm.eg.db")
+
+genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
+names(genes) = c("dunnart_biased", "common", "mouse_biased")
+go_cluster <- simplify(
+      setReadable(
+      compareCluster(
+        geneCluster = genes, 
+        fun = enrichGO, 
+        ont="BP", 
+        keyType="ENTREZID", 
+        pvalueCutoff = 0.01, 
+        OrgDb = org.Mm.eg.db),
+      OrgDb = org.Mm.eg.db, 
+      keyType="ENTREZID"),
+      cutoff=0.9, 
+      by="p.adjust", 
+      select_fun=min)
+
+pdf("summary_enhancer_cnetplot_GOenrich.pdf")
+print(cnetplot(go_cluster))
+dev.off()
+
+pdf("summary_enhancer_dotplot_GOenrich.pdf", width = 10, height = 8)
+print(dotplot(go_cluster, showCategory = 15) + theme(axis.text.x = element_text(angle = 45, hjust=1)))
+dev.off()
+
+
+
+
+
--- a/GO_plots.R
+++ b/GO_plots.R
+## Script that takes as input file with GO enriched terms and returns you a barplot
+library(dplyr); library(data.table)
+library(magrittr); 
+library(ggthemes);library(ggplot2);library(ggpubr)
+
+## change here the working directory(ies)
+setwd('/Users/lauracook/OneDrive - The University of Melbourne/PhD (2018-2021)/6-ChIPseq/3-DataAnalysis/4-dunnartPeaks/1-FullDataset/')
+
+go_files_dir='/data/projects/punim0586/lecook/chipseq-pipeline/cross_species/peakAnnotation/overlaps/promoters/GO_noOverlap/'
+plot_dir='/data/projects/punim0586/lecook/chipseq-pipeline/cross_species/peakAnnotation/overlaps/promoters/GO_noOverlap'
+
+## read files into the directory
+## it creates a list of dataframes (one per file)
+## then discards the geneID field 
+
+read_files=function(dir){
+  x=as.character(list.files(dir,recursive =F,full.names = T)) %>% 
+    lapply(function(y)
+      fread(y,sep = '\t',header = T)[
+        ,geneID:=NULL
+      ] %>% setorderv('p.adjust',1)) ## sorted by most significant ones 
+  ## this returns you the filename without file extension. 
+  ## I will use it as plot title below (you can remove it if u want)
+  file_names=as.character(list.files(dir,recursive =F,full.names = F))
+  file_names=gsub("\\..*","",file_names) 
+  names(x)=file_names
+  ## I will make a new field with the filename that I will use as categorical variable to assign different colors
+  ## i.e. each plot will have a different color
+  x=Map(mutate,x,filename=names(x))
+  x=lapply(x,function(y)y=as.data.table(y))
+  return(x)
+}
+
+go_terms=read_files(go_files_dir)
+
+## I will make another column with the color for the given file. 
+## all row entries will be identical but I just need this for plotting
+## I am going to do in this way so that u can change the colors as u like for each file
+
+colors=c('#6699CC','#6699CC','#6699CC','#6699CC','#6699CC','#6699CC','#6699CC','#6699CC',
+'#6699CC','#6699CC','#6699CC','#6699CC','#6699CC','#6699CC') ## remember to add a color per input file
+
+go_colors=Map(cbind, go_terms, color=colors)
+go_colors=lapply(go_colors, function(x)x=as.data.table(x))
+
+## this keeps only the significant GO terms
+significant_go_terms=lapply(go_colors,function(x)x=x[p.adjust<=0.05])
+
+plot_results=function(file){
+  
+  top_n_GOs=copy(file)
+  top_n_GOs=top_n_GOs[1:20]
+  
+  ggplot(top_n_GOs, 
+         aes(x=reorder(Description,-log10(p.adjust)), y=-log10(p.adjust),fill=filename)) + ## this reorders the x-axis and converts the p-adj to log10 values
+    geom_bar(stat = 'identity',position = 'dodge')+
+    xlab(" ") +
+    ylab("\n -Log10 (P) \n ") +
+    scale_y_continuous(breaks = round(seq(0, max(-log10(top_n_GOs$p.adjust)), by = 2), 1)) +
+    scale_fill_manual(name= " ",values=top_n_GOs$color)+
+    theme( ## these are just parameters to change the theme of the plot. you can add more or remove them
+      legend.position='none',
+      legend.background=element_rect(),
+      axis.text.x=element_text(angle=0, hjust=1.10),
+      axis.text.y=element_text(angle=0, vjust=0.8),
+      axis.title=element_text(),
+      axis.line = element_line(color = "black",size = 0, linetype = "solid"),
+      panel.background =element_rect(fill = 'white', size = 0.5,colour = 'black'),
+      panel.grid.minor = element_blank(),
+      panel.grid.major = element_blank(),
+      title=element_text()) +
+    coord_flip()
+  
+}
+
+go_plot=lapply(significant_go_terms,function(x)x=plot_results(x))
+
+
+# Save each plot to a different pdf in the plot_dir/
+lapply(names(go_plot), 
+       function(x)ggsave(filename=paste(plot_dir,x,'_plot',".pdf",sep=""), 
+                         plot=go_plot[[x]]))
+
--- a/GO_similarity_PCA.R
+++ b/GO_similarity_PCA.R
+## input: vector of GO terms
+## output: heatmap semantic similarity scores
+library(tools)
+library(GOSemSim)
+library(ComplexHeatmap)
+library(circlize)
+library('BiocParallel')  
+library(data.table)
+library(tidyverse)
+
+## GO data from mouse org database for biological processes
+## Can do for mouse phenotype and molecular processes
+mmGO = godata('org.Mm.eg.db', ont="BP") 
+plot_dir = ("consensus/promoters/")
+## Wang is actually a guy but here it refers to a semantic similarity metric (see https://yulab-smu.top/biomedical-knowledge-mining-book/semantic-similarity-overview.html)
+## combine = 'BMA' returns a single value estimating how similar the 2 sets are, you can change it with min,max,avg ... (see link above)
+
+goPCA <- function(fileList, file_to_compare_to){
+    files =list.files(fileList, pattern= "", full.names=T) # create list of files in directory
+    files = as.list(files)
+    list = lapply(files, function(x) read.table(x, header=TRUE, sep="\t", quote = "") %>% as.data.table()) # read in all files
+    ## this keeps only the significant GO terms
+    significant_go_terms=lapply(list,function(x) x=x[p.adjust<=0.01])
+    listVector = lapply(significant_go_terms, function(x) dplyr::pull(x, ID))
+    file_input = read.table(file_to_compare_to, sep="\t", header=TRUE, quote="") %>% as.data.frame()
+    file_sig_go = dplyr::pull(file_input[file_input$p.adjust <= 0.01,], ID)
+
+    ## This extracts the file names for all the files in the list (all embryonic stages)
+    file_names=as.character(list.files(fileList,recursive =F,full.names = F)) ## directory with all embryonic stages
+    file_names=gsub("\\..*","",file_names) ## Removes everything except 'E*.5'
+    file_names=gsub("[^0-9]","", file_names) ## Removes the 'E' so that it's a number
+    names(listVector)=file_names ## Add new names to list
+
+    ## Check if input file name is less than or equal to any in listVector
+    ## This will avoid running comparisons that have already been run
+    ## Eg. E10.5 vs E11.5 is the same as E11.5 vs E10.5
+    file_to_compare_to_name = basename(file_path_sans_ext(file_to_compare_to)) ## Name of file to compare to
+    if(grepl('^E', file_to_compare_to_name) == TRUE){ ## If file begins with E (this is so that it doesn't stall with the dunnart files)
+        file_to_compare_to_name=gsub("\\..*","",file_to_compare_to_name) ## Removes everything except 'E*.5'
+        file_to_compare_to_name=gsub("[^0-9]","",file_to_compare_to_name) ## Removes the 'E' so that it's a number
+        listVectorSubset = listVector[names(listVector) > file_to_compare_to_name] ## Keep only vectors in the list that are greater than the file being compared to
+                                                                                ## Eg. this means that when comparing E12 to the list then it will only compare to 13,14 & 15
+    } else 
+    listVectorSubset = listVector
+
+    ## Compare the similarity between GO lists
+    ## This will parallelise the lapply function 
+    comparisons = bplapply(listVectorSubset,function(x) mgoSim(file_sig_go,x, semData=mmGO,measure="Wang", combine='NULL')) 
+    return(comparisons)
+}
+
+## This command checks the time it takes to run
+## One comparison is about 95 seconds
+system.time(mgoSim(listVector[[1]], listVector[[2]],semData=mmGO, measure="Wang", combine='BMA'))
+
+register(SerialParam())
+
+
+## Promoters
+plot_dir = ("consensus/promoters/")
+
+dunnartvsmouse = GOsimilarity(fileList = "consensus/promoters/mouse_GO", 
+file_to_compare_to = "consensus/promoters/dunnart_GO/promoter_mm10GOenrich") #dunnart versus all mouse stages
+
+e10vs = GOsimilarity(fileList = "consensus/promoters/mouse_GO", 
+file_to_compare_to = "consensus/promoters/mouse_GO/E10.5_promoter_mm10GOenrich") 
+
+e11vs = GOsimilarity(fileList = "consensus/promoters/mouse_GO", 
+file_to_compare_to = "consensus/promoters/mouse_GO/E11.5_promoter_mm10GOenrich")
+
+e12vs = GOsimilarity(fileList = "consensus/promoters/mouse_GO", 
+file_to_compare_to = "consensus/promoters/mouse_GO/E12.5_promoter_mm10GOenrich") 
+
+e13vs = GOsimilarity(fileList = "consensus/promoters/mouse_GO", 
+file_to_compare_to = "consensus/promoters/mouse_GO/E13.5_promoter_mm10GOenrich") 
+
+e14vs = GOsimilarity(fileList = "consensus/promoters/mouse_GO", 
+file_to_compare_to = "consensus/promoters/mouse_GO/E14.5_promoter_mm10GOenrich") 
+
+
+## Enhancers
+plot_dir = ("consensus/enhancers/")
+
+dunnartvsmouse = GOsimilarity(fileList = "consensus/enhancers/mouse_GO", 
+file_to_compare_to = "consensus/enhancers/dunnart_GO/enhancer_mm10GOenrich_subsample.txt")
+
+e10vs = GOsimilarity(fileList = "consensus/enhancers/mouse_GO", 
+file_to_compare_to = "consensus/enhancers/mouse_GO/E10.5_enhancer_mm10GOenrich") #dunnart versus all mouse stages
+
+e11vs = GOsimilarity(fileList = "consensus/enhancers/mouse_GO", 
+file_to_compare_to = "consensus/enhancers/mouse_GO/E11.5_enhancer_mm10GOenrich") #dunnart versus all mouse stages
+
+e12vs = GOsimilarity(fileList = "consensus/enhancers/mouse_GO", 
+file_to_compare_to = "consensus/enhancers/mouse_GO/E12.5_enhancer_mm10GOenrich") #dunnart versus all mouse stages
+
+e13vs = GOsimilarity(fileList = "consensus/enhancers/mouse_GO", 
+file_to_compare_to = "consensus/enhancers/mouse_GO/E13.5_enhancer_mm10GOenrich") #dunnart versus all mouse stages
+
+e14vs = GOsimilarity(fileList = "consensus/enhancers/mouse_GO", 
+file_to_compare_to = "consensus/enhancers/mouse_GO/E14.5_enhancer_mm10GOenrich") #dunnart versus all mouse stages
+
--- a/README.md
+++ b/README.md
--- a/annotation_10M.R
+++ b/annotation_10M.R
--- a/comparePeaks.R
+++ b/comparePeaks.R
--- a/comparePeaks.sh
+++ b/comparePeaks.sh
--- a/compare_liftover_genes.R
+++ b/compare_liftover_genes.R
+
+## Load packages
+library(data.table)
+library(tidyverse)
+
+bedList("")
+bedFiles = list.files(pattern= "dunnart_mouse", full.names=T) # create list of files in directory
+
+files = as.list(bedFiles)
+data = lapply(files, function(x) fread(x, header=FALSE, sep="\t", quote = "", na.strings=c("", "NA"))) # read in all files
+data2 = rbindlist(data,)
+data2$score <- ifelse(data2$V27 == data2$V46, "1", "0")
+split = data2 %>% split(by="V49") ## split by stage
+counts = lapply(split, function(x) length(which(x$score !=0)))
\ No newline at end of file
--- a/cross_species_comparison/README.md
+++ b/cross_species_comparison/README.md
--- a/cross_species_comparison/env/wga.yaml
+++ b/cross_species_comparison/env/wga.yaml
-name: wga
-channels:
-  - bioconda
-  - biocore
-  - conda-forge
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=conda_forge
-  - _openmp_mutex=4.5=1_gnu
-  - bzip2=1.0.8=h516909a_3
-  - c-ares=1.11.0=h470a237_1
-  - ca-certificates=2020.6.20=hecda079_0
-  - cairo=1.16.0=h3fc0475_1005
-  - cd-hit=4.8.1=h8b12597_3
-  - certifi=2020.6.20=py38h32f6830_0
-  - curl=7.71.1=he644dc0_6
-  - entrez-direct=13.9=pl526h375a9b1_0
-  - expat=2.2.9=he1b5a44_2
-  - fontconfig=2.13.1=h1056068_1002
-  - freetype=2.10.2=he06d7ca_0
-  - fribidi=1.0.10=h516909a_0
-  - genometools-genometools=1.6.1=py38h23571c4_2
-  - gettext=0.19.8.1=hc5be6a0_1002
-  - glib=2.66.0=h0dae87d_0
-  - gmp=6.1.2=hf484d3e_1000
-  - gnutls=3.5.19=h2a4e5f8_1
-  - graphite2=1.3.13=he1b5a44_1001
-  - harfbuzz=2.7.2=hee91db6_0
-  - hmmer=3.3.1=he1b5a44_0
-  - icu=67.1=he1b5a44_0
-  - krb5=1.17.1=hfafb76e_3
-  - ld_impl_linux-64=2.35=h769bd43_9
-  - libcurl=7.71.1=hcdd3856_6
-  - libedit=3.1.20191231=he28a2e2_2
-  - libev=4.33=h516909a_1
-  - libffi=3.2.1=he1b5a44_1007
-  - libgcc-ng=9.3.0=h24d8f2e_16
-  - libgomp=9.3.0=h24d8f2e_16
-  - libiconv=1.16=h516909a_0
-  - libnghttp2=1.41.0=hab1572f_1
-  - libpng=1.6.37=hed695b0_2
-  - libssh2=1.9.0=hab1572f_5
-  - libstdcxx-ng=9.3.0=hdf63c60_16
-  - libuuid=2.32.1=h14c3975_1000
-  - libxcb=1.13=h14c3975_1002
-  - libxml2=2.9.10=h68273f3_2
-  - llvm-openmp=8.0.1=hc9558a2_0
-  - ltr_retriever=2.9.0=0
-  - mafft=7.471=h516909a_0
-  - ncurses=6.2=he1b5a44_1
-  - nettle=3.3=0
-  - nseg=1.0.1=h516909a_0
-  - openmp=8.0.1=0
-  - openssl=1.1.1h=h516909a_0
-  - pango=1.42.4=h7062337_4
-  - pcre=8.44=he1b5a44_0
-  - perl=5.26.2=h516909a_1006
-  - perl-app-cpanminus=1.7044=pl526_1
-  - perl-archive-tar=2.32=pl526_0
-  - perl-base=2.23=pl526_1
-  - perl-business-isbn=3.004=pl526_0
-  - perl-business-isbn-data=20140910.003=pl526_0
-  - perl-carp=1.38=pl526_3
-  - perl-common-sense=3.74=pl526_2
-  - perl-compress-raw-bzip2=2.087=pl526he1b5a44_0
-  - perl-compress-raw-zlib=2.087=pl526hc9558a2_0
-  - perl-constant=1.33=pl526_1
-  - perl-data-dumper=2.173=pl526_0
-  - perl-digest-hmac=1.03=pl526_3
-  - perl-digest-md5=2.55=pl526_0
-  - perl-encode=2.88=pl526_1
-  - perl-encode-locale=1.05=pl526_6
-  - perl-exporter=5.72=pl526_1
-  - perl-exporter-tiny=1.002001=pl526_0
-  - perl-extutils-makemaker=7.36=pl526_1
-  - perl-file-listing=6.04=pl526_1
-  - perl-file-path=2.16=pl526_0
-  - perl-file-temp=0.2304=pl526_2
-  - perl-file-which=1.23=pl526_0
-  - perl-html-parser=3.72=pl526h6bb024c_5
-  - perl-html-tagset=3.20=pl526_3
-  - perl-html-tree=5.07=pl526_1
-  - perl-http-cookies=6.04=pl526_0
-  - perl-http-daemon=6.01=pl526_1
-  - perl-http-date=6.02=pl526_3
-  - perl-http-message=6.18=pl526_0
-  - perl-http-negotiate=6.01=pl526_3
-  - perl-io-compress=2.087=pl526he1b5a44_0
-  - perl-io-html=1.001=pl526_2
-  - perl-io-socket-ssl=2.066=pl526_0
-  - perl-io-zlib=1.10=pl526_2
-  - perl-json=4.02=pl526_0
-  - perl-json-xs=2.34=pl526h6bb024c_3
-  - perl-libwww-perl=6.39=pl526_0
-  - perl-list-moreutils=0.428=pl526_1
-  - perl-list-moreutils-xs=0.428=pl526_0
-  - perl-lwp-mediatypes=6.04=pl526_0
-  - perl-lwp-protocol-https=6.07=pl526_4
-  - perl-mime-base64=3.15=pl526_1
-  - perl-mozilla-ca=20180117=pl526_1
-  - perl-net-http=6.19=pl526_0
-  - perl-net-ssleay=1.88=pl526h90d6eec_0
-  - perl-ntlm=1.09=pl526_4
-  - perl-parent=0.236=pl526_1
-  - perl-pathtools=3.75=pl526h14c3975_1
-  - perl-scalar-list-utils=1.52=pl526h516909a_0
-  - perl-socket=2.027=pl526_1
-  - perl-storable=3.15=pl526h14c3975_0
-  - perl-test-requiresinternet=0.05=pl526_0
-  - perl-text-soundex=3.05=pl526_1000
-  - perl-time-local=1.28=pl526_1
-  - perl-try-tiny=0.30=pl526_1
-  - perl-types-serialiser=1.0=pl526_2
-  - perl-uri=1.76=pl526_0
-  - perl-www-robotrules=6.02=pl526_3
-  - perl-xml-namespacesupport=1.12=pl526_0
-  - perl-xml-parser=2.44=pl526h4e0c4b3_7
-  - perl-xml-sax=1.02=pl526_0
-  - perl-xml-sax-base=1.09=pl526_0
-  - perl-xml-sax-expat=0.51=pl526_3
-  - perl-xml-simple=2.25=pl526_1
-  - perl-xsloader=0.24=pl526_0
-  - pip=20.2.3=py_0
-  - pixman=0.38.0=h516909a_1003
-  - pthread-stubs=0.4=h14c3975_1001
-  - python=3.8.5=h1103e12_8_cpython
-  - python_abi=3.8=1_cp38
-  - readline=8.0=he28a2e2_2
-  - recon=1.08=h516909a_2
-  - repeatmasker=4.1.0=pl526_0
-  - repeatmodeler=2.0.1=pl526_0
-  - repeatscout=1.0.6=h516909a_1
-  - rmblast=2.9.0=h2d02072_0
-  - setuptools=49.6.0=py38h32f6830_1
-  - sqlite=3.33.0=h4cf870e_0
-  - tk=8.6.10=hed695b0_0
-  - trf=4.09.1=h516909a_0
-  - wheel=0.35.1=pyh9f0ad1d_0
-  - xorg-kbproto=1.0.7=h14c3975_1002
-  - xorg-libice=1.0.10=h516909a_0
-  - xorg-libsm=1.2.3=h84519dc_1000
-  - xorg-libx11=1.6.12=h516909a_0
-  - xorg-libxau=1.0.9=h14c3975_0
-  - xorg-libxdmcp=1.1.3=h516909a_0
-  - xorg-libxext=1.3.4=h516909a_0
-  - xorg-libxrender=0.9.10=h516909a_1002
-  - xorg-renderproto=0.11.1=h14c3975_1002
-  - xorg-xextproto=7.3.0=h14c3975_1002
-  - xorg-xproto=7.0.31=h14c3975_1007
-  - xz=5.2.5=h516909a_1
-  - zlib=1.2.11=h516909a_1009
-prefix: /home/lecook/.conda/envs/wga
-
--- a/cross_species_comparison/scripts/axtChain_commands.sh
+++ b/cross_species_comparison/scripts/axtChain_commands.sh
--- a/cross_species_comparison/scripts/chainSort_commands.sh
+++ b/cross_species_comparison/scripts/chainSort_commands.sh
--- a/cross_species_comparison/scripts/changeFilenameLoop.sh
+++ b/cross_species_comparison/scripts/changeFilenameLoop.sh
-##!/usr/bin/env bash
-
-## Change file names in a loop
-
-TRA=($(for file in *.fa.masked; do echo $file |cut -d "." -f 1;done))
-
-echo ${TRA[@]}
-
-for tr in ${TRA[@]};
-
-do
-
-mv ${tr}.fa.masked ${tr}.fa
-
-done
\ No newline at end of file
--- a/cross_species_comparison/scripts/lastz.sh
+++ b/cross_species_comparison/scripts/lastz.sh
-#!/usr/bin/env bash
-
-## This script loops through all scaffolds in the dunnart genome and
-## generates a separate command for each scaffold
-## These commands are then used in a slurm array script to run jobs in parallel
-
-TRA=($(for file in *.chain; do echo $file |cut -d "." -f 1-2;done))
-
-echo ${TRA[@]}
-
-for tr in ${TRA[@]};
-
-do
-
-# generate lastz commands
-#echo 'lastz_32 /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/mm10.fa[multiple] /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/smiCra1_RM/'${tr}.fa 'H=2000 K=2400 L=3000 Y=9400 --format=maf > /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/maf/'${tr}_mm10.smiCra1.maf
-
-# generate convert maf to axt format commands
-#echo 'maf-convert axt '${tr}.maf' > '${tr}.axt
-
-## build co-linear alignment chains
-#echo 'axtChain -linearGap=loose /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/axt/'${tr}'.axt /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/mm10.2bit /data/projects/punim0586/lecook/chipseq-pipeline/cross_species/data/genomes/smiCra1.2bit '${tr}'.chain'
-
-echo 'chainSort '${tr}'.chain > '${tr}'.sorted.chain'
-
-done
--- a/cross_species_comparison/scripts/lastz_commands.sh
+++ b/cross_species_comparison/scripts/lastz_commands.sh
--- a/cross_species_comparison/scripts/maf-convert_commands.sh
+++ b/cross_species_comparison/scripts/maf-convert_commands.sh
--- a/cross_species_comparison/scripts/parse_FASTA.py
+++ b/cross_species_comparison/scripts/parse_FASTA.py
-#!usr/bin/env python3
-
-import string
-import random
-import sys
-import os
-from Bio import SeqIO
-
-## usage python3 parse_FASTA.py [input.fasta] > [output.fasta]
-
-file = sys.argv[1]
-
-# Loop through all the files in the variable
-with open(file, 'r') as all_TWARs:
-
-    # for each record (TWAR header) in the file parse it as a FASTA
-    for record in SeqIO.parse(all_TWARs, "fasta"):
-        # set the record ID to a variable
-        id = record.id
-        # set the TWAR sequence to a variable
-        seq = record.seq
-
-        print(">" + str(id) + "\n" + str(seq) + "*")
-all_TWARs.close()
--- a/cross_species_comparison/scripts/repeatMasker.sh
+++ b/cross_species_comparison/scripts/repeatMasker.sh
-##!/usr/bin/env bash
-
-## This script loops through all scaffolds in the dunnart genome and
-## generates a separate RepeatMasker command for each scaffold
-## These commands are then used in a slurm array script to run jobs in parallel
-
-TRA=($(for file in *.fa; do echo $file |cut -d "." -f 1;done))
-
-echo ${TRA[@]}
-
-for tr in ${TRA[@]};
-
-do
-
-echo 'RepeatMasker -q -xsmall smiCra1/'${tr}'.fa -default_search_engine hmmer -trf_prgm /home/lecook/.conda/envs/wga/bin/trf -hmmer_dir /home/lecook/.conda/envs/wga/bin/'
-
-
-done
--- a/cross_species_comparison/scripts/repeatMaskerCommands.sh
+++ b/cross_species_comparison/scripts/repeatMaskerCommands.sh