Commit fe028630 authored by root's avatar root

Add KRAKEN, RBio and Roary samples

parent 2e6fbc31
#!/bin/bash
# SLURM batch script: taxonomic classification of WMS reads with KRAKEN.
# To give your job a name, replace "MyJob" with an appropriate name
#SBATCH --job-name=KRAKEN-test.slurm
# Run multicore
#SBATCH --ntasks-per-node=1
# Set your minimum acceptable walltime=days-hours:minutes:seconds
#SBATCH -t 0:15:00
# Specify your email address to be notified of progress.
# SBATCH --mail-type=ALL
# Load the environment variables
module purge
module load fastqc/0.11.9-java-11.0.2
module load kraken/1.1-perl-5.30.0
# Use FastQC to check the quality of our data
cd wms/data/sub_100000
fastqc *.fastq
# Assign the database
# NOTE(review): no KRAKEN_DB assignment is visible in this diff view; it is
# presumably exported by the kraken module or the assignment line was lost in
# extraction — confirm before running.
# Running the KRAKEN loop
# Please change the directory path to suit your location
for i in *_1.fastq
do
    # strip the _1.fastq suffix to get the sample name shared by both mates
    prefix=$(basename "$i" _1.fastq)
    # print which sample is being processed
    echo "$prefix"
    # classify the paired reads against the database
    kraken --db "$KRAKEN_DB" \
        "${prefix}_1.fastq" "${prefix}_2.fastq" > ~/KRAKEN/wms/results/"${prefix}".tab
    # turn the raw classification into a human-readable taxonomic tree report
    kraken-report --db "$KRAKEN_DB" \
        ~/KRAKEN/wms/results/"${prefix}".tab > ~/KRAKEN/wms/results/"${prefix}"_tax.txt
done
Derived from a tutorial from the Swedish University of Agricultural Sciences.
Kraken is a system for assigning taxonomic labels to short DNA sequences (i.e. reads). Kraken aims to achieve high sensitivity and
high speed by utilizing exact alignments of k-mers and a novel classification algorithm.
Kraken uses a new approach with exact k-mer matching to assign taxonomy to short reads. It is extremely fast compared to
traditional approaches (i.e. BLAST).
In this example samples are compared from the Pig Gut Microbiome to samples from the Human Gut Microbiome.
Whole Metagenome Sequencing, "shotgun metagenome sequencing", is used, a relatively new and powerful sequencing approach that
provides insight into community biodiversity and function. In contrast to metabarcoding, where only a specific region of the
bacterial community (the 16s rRNA) is sequenced, WMS aims at sequencing all the genomic material present in the environment.
The dataset (subset_wms.tar.gz) uses a sample of 3 pigs and 3 humans.
FastQC is used to check the quality of our data.
Kraken includes a script called kraken-report to transform this file into a "tree" view with the percentage of reads assigned to
each taxon. Once the job has run, take a look at the _tax.txt files in the results directory.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
sample organism
ERR1135369 pig
ERR1135370 pig
ERR1135375 pig
ERR321573 human
ERR321574 human
ERR321578 human
# SLURM batch script: runs the DADA2 metabarcoding analysis (Metabarcoding.R)
# under R Bioconductor.
# NOTE(review): this is a diff view — lines such as the shebang or additional
# #SBATCH directives may have been dropped; verify against the full script.
# To give your job a name, replace "MyJob" with an appropriate name
#SBATCH --job-name=RBio-test.slurm
# Run multicore
#SBATCH --ntasks-per-node=4
# Set your minimum acceptable walltime=days-hours:minutes:seconds
#SBATCH -t 5:00:00
# Specify your email address to be notified of progress.
# SBATCH --mail-type=ALL
# Load the environment variables
module purge
module load foss/2019b
module load r-bundle-bioconductor/3.10-r-3.6.2
module load web_proxy
# Running the R script
# --vanilla skips site/user profiles and workspace save/restore so the run is
# reproducible; plots land in Rplots.pdf by default
R --vanilla < Metabarcoding.R
# Metabarcoding.R — DADA2 amplicon pipeline on the MiSeq SOP dataset:
# filter/trim -> learn errors -> dereplicate -> denoise -> merge pairs ->
# sequence table -> chimera removal -> taxonomy -> tree -> phyloseq plots.
# NOTE(review): this is a diff view; several statements below are visibly
# truncated (closing parentheses, trailing `+` with the next ggplot layer
# missing). Each such spot is flagged — reconcile with the full source before
# running.
# Load packages
# NOTE(review): the library() calls (dada2, phangorn, phyloseq, ggplot2,
# DECIPHER) presumably belong here but are not visible in this view.
# Check the dataset
path <- 'MiSeq_SOP'
# Filter and Trim
# NOTE(review): the next two list.files() calls are truncated — the trailing
# `full.names=TRUE))` is missing from this view.
raw_forward <- sort(list.files(path, pattern="_R1_001.fastq",
raw_reverse <- sort(list.files(path, pattern="_R2_001.fastq",
# we also need the sample names
# NOTE(review): the closing argument of sapply (the index `1` and `)`) is not
# visible in this view.
sample_names <- sapply(strsplit(basename(raw_forward), "_"),
`[`, # `[` extracts the first element of each split filename
# Initial plot read quality. Forward reads are good quality while the reverse are way worse.
# Dada2 requires us to define the name of our output files
# place filtered files in filtered/ subdirectory
filtered_path <- file.path(path, "filtered")
filtered_forward <- file.path(filtered_path,
paste0(sample_names, "_R1_trimmed.fastq.gz"))
filtered_reverse <- file.path(filtered_path,
paste0(sample_names, "_R2_trimmed.fastq.gz"))
# Use standard filtering parameters: maxN=0 (DADA2 requires no Ns), truncQ=2, rm.phix=TRUE and maxEE=2. The maxEE parameter sets the maximum
# number of “expected errors” allowed in a read, which according to the USEARCH authors is a better filter than simply averaging quality
# scores.
# NOTE(review): this call is truncated — the final `multithread=TRUE)` (or
# equivalent closing arguments) is missing from this view.
out <- filterAndTrim(raw_forward, filtered_forward, raw_reverse,
filtered_reverse, truncLen=c(240,160), maxN=0,
maxEE=c(2,2), truncQ=2, rm.phix=TRUE, compress=TRUE,
# Learn and plot the Error Rates
# The DADA2 algorithm depends on a parametric error model and every amplicon dataset has a slightly different error rate. The learnErrors of
# Dada2 learns the error model from the data and will help DADA2 to fit its method to your data.
errors_forward <- learnErrors(filtered_forward, multithread=TRUE)
errors_reverse <- learnErrors(filtered_reverse, multithread=TRUE)
# NOTE(review): trailing `+` — the ggplot layer that followed (likely a
# scale/theme call) is missing from this view.
plotErrors(errors_forward, nominalQ=TRUE) +
# Dereplication
# From the Dada2 documentation: Dereplication combines all identical sequencing reads into “unique sequences” with a corresponding
# “abundance”: the number of reads with that unique sequence. Dereplication substantially reduces computation time by eliminating redundant
# comparisons.
derep_forward <- derepFastq(filtered_forward, verbose=TRUE)
derep_reverse <- derepFastq(filtered_reverse, verbose=TRUE)
# name the derep-class objects by the sample names
names(derep_forward) <- sample_names
names(derep_reverse) <- sample_names
# Sample inference
# We are now ready to apply the core sequence-variant inference algorithm to the dereplicated data.
dada_forward <- dada(derep_forward, err=errors_forward, multithread=TRUE)
dada_reverse <- dada(derep_reverse, err=errors_reverse, multithread=TRUE)
# inspect the dada-class object
# Merge Paired-end Reads
# Now that the reads are trimmed, dereplicated and error-corrected we can merge them together
merged_reads <- mergePairs(dada_forward, derep_forward, dada_reverse,
derep_reverse, verbose=TRUE)
# inspect the merger data.frame from the first sample
# Construct Sequence Table
# Construct a sequence table of our mouse samples, a higher-resolution version of the OTU table produced by traditional methods.
seq_table <- makeSequenceTable(merged_reads)
# inspect distribution of sequence lengths
# Remove Chimeras
# The dada method used earlier removes substitutions and indel errors but chimeras remain.
seq_table_nochim <- removeBimeraDenovo(seq_table, method='consensus',
multithread=TRUE, verbose=TRUE)
# which percentage of our reads did we keep?
sum(seq_table_nochim) / sum(seq_table)
# Check the number of reads that made it through each step in the pipeline
get_n <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dada_forward, get_n), sapply(merged_reads, get_n),
rowSums(seq_table), rowSums(seq_table_nochim))
# NOTE(review): this colnames() call is truncated — the final label (likely
# 'nonchim') and closing parenthesis are missing from this view.
colnames(track) <- c('input', 'filtered', 'denoised', 'merged', 'tabled',
rownames(track) <- sample_names
# Assign Taxonomy
# Assign taxonomy to the sequences using the SILVA database
# NOTE(review): truncated — the SILVA training-set path and closing arguments
# of assignTaxonomy() are missing from this view.
taxa <- assignTaxonomy(seq_table_nochim,
taxa <- addSpecies(taxa, 'MiSeq_SOP/silva_species_assignment_v128.fa.gz')
# for inspecting the classification
taxa_print <- taxa # removing sequence rownames for display only
rownames(taxa_print) <- NULL
# Phylogenetic Tree
# DADA2 is reference-free so we have to build the tree ourselves
# Align our sequences
sequences <- getSequences(seq_table)
names(sequences) <- sequences # this propagates to the tip labels of the tree
alignment <- AlignSeqs(DNAStringSet(sequences), anchor=NA)
# Build a neighbour-joining tree then fit a maximum likelihood tree using the neighbour-joining tree as a starting point.
phang_align <- phyDat(as(alignment, 'matrix'), type='DNA')
# NOTE(review): the right-hand side of this assignment is missing from this
# view — presumably a distance matrix, e.g. dist.ml(phang_align); confirm.
dm <-
treeNJ <- NJ(dm) # note, tip order != sequence order
fit = pml(treeNJ, data=phang_align)
## negative edges length changed to 0!
# refit with a 4-category gamma model and 20% invariant sites, then optimise
# under GTR with stochastic rearrangements
fitGTR <- update(fit, k=4, inv=0.2)
fitGTR <- optim.pml(fitGTR, model='GTR', optInv=TRUE, optGamma=TRUE,
rearrangement = 'stochastic',
control = pml.control(trace = 0))
detach('package:phangorn', unload=TRUE)
# Phyloseq. First load the metadata
# NOTE(review): read.table() is truncated — the file path argument is missing
# from this view.
sample_data <- read.table(
header=TRUE, row.names="sample_name")
# Construct a phyloseq object from our output and newly created metadata
# NOTE(review): truncated — the sample_data()/tax_table()/phy_tree() components
# and closing parenthesis are missing from this view.
physeq <- phyloseq(otu_table(seq_table_nochim, taxa_are_rows=FALSE),
# remove mock sample
physeq <- prune_samples(sample_names(physeq) != 'Mock', physeq)
# Look at the alpha diversity of our samples
# NOTE(review): trailing `+` — the ggplot layer that followed is missing.
plot_richness(physeq, x='day', measures=c('Shannon', 'Fisher'), color='when') +
# Ordination methods (beta diversity)
# Perform an MDS with euclidean distance (mathematically equivalent to a PCA)
ord <- ordinate(physeq, 'MDS', 'euclidean')
# NOTE(review): trailing `+` — a following ggplot layer is missing.
plot_ordination(physeq, ord, type='samples', color='when',
title='PCA of the samples from the MiSeq SOP') +
# With the Bray-Curtis distance
ord <- ordinate(physeq, 'NMDS', 'bray')
# NOTE(review): trailing `+` — a following ggplot layer is missing.
plot_ordination(physeq, ord, type='samples', color='when',
title='PCA of the samples from the MiSeq SOP') +
# Distribution of the most abundant families
top20 <- names(sort(taxa_sums(physeq), decreasing=TRUE))[1:20]
physeq_top20 <- transform_sample_counts(physeq, function(OTU) OTU/sum(OTU))
physeq_top20 <- prune_taxa(top20, physeq_top20)
# NOTE(review): trailing `+` after facet_wrap — a following layer is missing.
plot_bar(physeq_top20, x='day', fill='Family') +
facet_wrap(~when, scales='free_x') +
# Place them in a tree
bacteroidetes <- subset_taxa(physeq, Phylum %in% c('Bacteroidetes'))
# NOTE(review): plot_tree() call is truncated — remaining arguments and the
# closing parenthesis are missing from this view.
plot_tree(bacteroidetes, ladderize='left', size='abundance',
group dpw
F3D0 0
F3D1 1
F3D141 141
F3D142 142
F3D143 143
F3D144 144
F3D145 145
F3D146 146
F3D147 147
F3D148 148
F3D149 149
F3D150 150
F3D2 2
F3D3 3
F3D5 5
F3D6 6
F3D7 7
F3D8 8
F3D9 9
group time
F3D0 Early
F3D1 Early
F3D141 Late
F3D142 Late
F3D143 Late
F3D144 Late
F3D145 Late
F3D146 Late
F3D147 Late
F3D148 Late
F3D149 Late
F3D150 Late
F3D2 Early
F3D3 Early
F3D5 Early
F3D6 Early
F3D7 Early
F3D8 Early
F3D9 Early
#mothur batch file: standard MiSeq SOP pipeline (trim reference, make contigs,
#screen/align/filter, pre-cluster, remove chimeras and unwanted lineages,
#cluster into OTUs and classify at the 0.03 and phylum (label=1) levels)
pcr.seqs(fasta=silva.bacteria.fasta, start=11894, end=25319, keepdots=F)
system(mv silva.bacteria.pcr.fasta silva.v4.fasta)
#change the name of the file from stability.files to whatever suits your study
make.contigs(file=stability.files, processors=8)
screen.seqs(fasta=current, group=current, maxambig=0, maxlength=275)
count.seqs(name=current, group=current)
align.seqs(fasta=current, reference=silva.v4.fasta)
screen.seqs(fasta=current, count=current, start=1968, end=11550, maxhomop=8)
filter.seqs(fasta=current, vertical=T, trump=.)
unique.seqs(fasta=current, count=current)
pre.cluster(fasta=current, count=current, diffs=2)
chimera.uchime(fasta=current, count=current, dereplicate=t)
remove.seqs(fasta=current, accnos=current)
#the double comma in the original left classify.seqs without its taxonomy
#reference; restored per the mothur MiSeq SOP pairing of the pds fasta/tax files
classify.seqs(fasta=current, count=current, reference=trainset9_032012.pds.fasta, taxonomy=trainset9_032012.pds.tax, cutoff=80)
remove.lineage(fasta=current, count=current, taxonomy=current, taxon=Chloroplast-Mitochondria-unknown-Archaea-Eukaryota)
remove.groups(count=current, fasta=current, taxonomy=current, groups=Mock)
cluster.split(fasta=current, count=current, taxonomy=current, splitmethod=classify, taxlevel=4, cutoff=0.15)
make.shared(list=current, count=current, label=0.03)
classify.otu(list=current, count=current, taxonomy=current, label=0.03)
make.shared(list=current, count=current, label=1)
classify.otu(list=current, count=current, taxonomy=current, label=1)
F3D0 F3D0_S188_L001_R1_001.fastq F3D0_S188_L001_R2_001.fastq
F3D141 F3D141_S207_L001_R1_001.fastq F3D141_S207_L001_R2_001.fastq
F3D142 F3D142_S208_L001_R1_001.fastq F3D142_S208_L001_R2_001.fastq
F3D143 F3D143_S209_L001_R1_001.fastq F3D143_S209_L001_R2_001.fastq
F3D144 F3D144_S210_L001_R1_001.fastq F3D144_S210_L001_R2_001.fastq
F3D145 F3D145_S211_L001_R1_001.fastq F3D145_S211_L001_R2_001.fastq
F3D146 F3D146_S212_L001_R1_001.fastq F3D146_S212_L001_R2_001.fastq
F3D147 F3D147_S213_L001_R1_001.fastq F3D147_S213_L001_R2_001.fastq
F3D148 F3D148_S214_L001_R1_001.fastq F3D148_S214_L001_R2_001.fastq
F3D149 F3D149_S215_L001_R1_001.fastq F3D149_S215_L001_R2_001.fastq
F3D150 F3D150_S216_L001_R1_001.fastq F3D150_S216_L001_R2_001.fastq
F3D1 F3D1_S189_L001_R1_001.fastq F3D1_S189_L001_R2_001.fastq
F3D2 F3D2_S190_L001_R1_001.fastq F3D2_S190_L001_R2_001.fastq
F3D3 F3D3_S191_L001_R1_001.fastq F3D3_S191_L001_R2_001.fastq
F3D5 F3D5_S193_L001_R1_001.fastq F3D5_S193_L001_R2_001.fastq
F3D6 F3D6_S194_L001_R1_001.fastq F3D6_S194_L001_R2_001.fastq
F3D7 F3D7_S195_L001_R1_001.fastq F3D7_S195_L001_R2_001.fastq
F3D8 F3D8_S196_L001_R1_001.fastq F3D8_S196_L001_R2_001.fastq
F3D9 F3D9_S197_L001_R1_001.fastq F3D9_S197_L001_R2_001.fastq
Mock Mock_S280_L001_R1_001.fastq Mock_S280_L001_R2_001.fastq
Derived from an example from the Swedish University of Agricultural Sciences.
The example job is metabarcoding using a DADA2 pipeline. It uses the data of MiSeq SOP but analyses the data using DADA2.
The MiSeq SOP dataset was acquired with the following commands:
rm -r __MACOSX/
cd MiSeq_SOP
DADA2 analyses amplicon data which uses exact variants instead of OTUs. The advantages of the DADA2 method is described in the following paper:
The job submission script uses R Bioconductor which invokes a version of R. It then calls an R script in vanilla mode (producing a PDF for
the plots).
File added
This diff is collapsed.
#!/bin/bash
# SLURM batch script: pan-genome analysis of E. coli strains with Roary
# (download reads -> assemble -> annotate -> pan-genome -> tree).
# To give your job a name, replace "MyJob" with an appropriate name
#SBATCH --job-name=Roary-test.slurm
# Allocate four CPUs
#SBATCH --ntasks-per-node=4
# Set your minimum acceptable walltime=days-hours:minutes:seconds
# Yes, this is a fair-sized job
#SBATCH -t 10:00:00
# Specify your email address to be notified of progress.
# SBATCH --mail-type=ALL
# Load the environment variables
module purge
module load foss/2019b
module load fastqc/0.11.9-java-11.0.2
module load megahit/1.1.4-python-3.7.4
module load prokka/1.14.5
module load enabrowsertool/1.5.4-python-3.7.4
module load roary/3.12.0-perl-5.30.0
module load web_proxy
# Get some random strains from the dataset
cut -d',' -f 1 journal.pcbi.1006258.s010.csv | tail -n +2 | shuf | head -10 > strains.txt
# Download the FASTQ reads for each selected strain from ENA
cat strains.txt | parallel enaGroupGet -f fastq {}
mkdir reads
mv ERS*/*/*.fastq.gz reads/
# rm -r ERS*
# Assemble with Megahit
mkdir assemblies
for r1 in reads/*_1.fastq.gz
do
    prefix=$(basename "$r1" _1.fastq.gz)
    # the reverse-read file sits next to the forward one; derive its name
    # (the original script used $r2 without ever defining it)
    r2=reads/${prefix}_2.fastq.gz
    megahit -1 "$r1" -2 "$r2" -o "${prefix}" --out-prefix "${prefix}"
    mv "${prefix}/${prefix}.contigs.fa" assemblies/
    rm -r "${prefix}"
done
# Prokka to annotate
mkdir annotation
for assembly in assemblies/*.fa
do
    prefix=$(basename "$assembly" .contigs.fa)
    prokka --usegenus --genus Escherichia --species coli --strain "${prefix}" \
        --outdir "${prefix}" --prefix "${prefix}" "${assembly}"
    mv "${prefix}/${prefix}.gff" annotation/
    # rm -r ${prefix}
done
# Run roary for pan-genomic analysis
roary -f roary -e -n -v annotation/*.gff
# Prepare for visualising plots
cd roary
FastTree -nt -gtr core_gene_alignment.aln > my_tree.newick
This example is partially derived from the Swedish University of Agricultural Sciences and the Genome annotation and Pangenome Analysis from the CBIB in
Santiago, Chile. It illustrates how to determine a pan-genome from a collection of isolate genomes.
It starts with a dataset of Escherichia coli from the article "Prediction of antibiotic resistance in Escherichia coli from large-scale
pan-genome data", available at
The CSV file has been obtained from
Random strains are selected from the file and read through enaGroupGet and fastq for quality and placed in their own directory (`reads`).
The strains are assembled with MEGAHIT and then annotated with Prokka.
All the .gff files are placed in the same folder and roary is run against them, getting the coding sequences, converting them into protein,
creating pre-clusters, and checking for paralogs. Every isolate is taken and ordered by presence or absence of orthologs. The summary is in
the file summary_statistics.txt along with a gene_presence_absence.csv includes the gene name and gene annotation, and whether a gene is
present in a genome or not. Finally, a tree file is created.
#!/usr/bin/env python
# Copyright (C) <2015> EMBL-European Bioinformatics Institute
# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# GNU General Public License for more details.
# Neither the institution name nor the name roary_plots
# can be used to endorse or promote products derived from
# this software without prior written permission.
# For written permission, please contact <>.
# Products derived from this software may not be called roary_plots
# nor may roary_plots appear in their names without prior written
# permission of the developers. You should have received a copy
# of the GNU General Public License along with this program.
# If not, see <>.
__author__ = "Marco Galardini"
__version__ = '0.1.0'
def get_options():
    """Parse command-line options for the Roary plotting script.

    Returns the argparse Namespace; downstream code reads ``tree``,
    ``spreadsheet``, ``labels``, ``format`` and ``skipped_columns``.
    """
    import argparse

    # create the top-level parser
    description = "Create plots from roary outputs"
    parser = argparse.ArgumentParser(description = description,
                                     prog = '')
    parser.add_argument('tree', action='store',
                        help='Newick Tree file', default='accessory_binary_genes.fa.newick')
    parser.add_argument('spreadsheet', action='store',
                        help='Roary gene presence/absence spreadsheet', default='gene_presence_absence.csv')
    parser.add_argument('--labels', action='store_true',
                        help='Add node labels to the tree (up to 10 chars)')
    # NOTE(review): the '--format' add_argument line was lost in this diff view,
    # leaving its help string orphaned; restored here because options.format is
    # consumed by the plt.savefig calls later in the script.
    parser.add_argument('--format', action='store',
                        choices=('png', 'tiff', 'pdf', 'svg'),
                        default='png',
                        help='Output format [Default: png]')
    # type=int/default=14 restored: options.skipped_columns is used in integer
    # arithmetic (column slicing) later in the script.
    parser.add_argument('-N', '--skipped-columns', action='store',
                        type=int, default=14,
                        help='First N columns of Roary\'s output to exclude [Default: 14]')
    parser.add_argument('--version', action='version',
                        version='%(prog)s '+__version__)
    return parser.parse_args()
if __name__ == "__main__":
options = get_options()
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd
import numpy as np
from Bio import Phylo
t =, 'newick')
# Max distance to create better plots
mdist = max([t.distance(t.root, x) for x in t.get_terminals()])
# Load roary
roary = pd.read_csv(options.spreadsheet, low_memory=False)
# Set index (group name)
roary.set_index('Gene', inplace=True)
# Drop the other info columns
roary.drop(list(roary.columns[:options.skipped_columns-1]), axis=1, inplace=True)
# Transform it in a presence/absence matrix (1/0)
roary.replace('.{2,100}', 1, regex=True, inplace=True)
roary.replace(np.nan, 0, regex=True, inplace=True)
# Sort the matrix by the sum of strains presence
idx = roary.sum(axis=1).sort_values(ascending=False).index
roary_sorted = roary.loc[idx]
# Pangenome frequency plot
plt.figure(figsize=(7, 5))
plt.hist(roary.sum(axis=1), roary.shape[1],
histtype="stepfilled", alpha=.7)
plt.xlabel('No. of genomes')
plt.ylabel('No. of genes')
plt.savefig('pangenome_frequency.%s'%options.format, dpi=300)
# Sort the matrix according to tip labels in the tree
roary_sorted = roary_sorted[[ for x in t.get_terminals()]]
# Plot presence/absence matrix against the tree
with sns.axes_style('whitegrid'):
fig = plt.figure(figsize=(17, 10))
ax1=plt.subplot2grid((1,40), (0, 10), colspan=30)
vmin=0, vmax=1,
ax = fig.add_subplot(1,2,1)
# matplotlib v1/2 workaround
ax=plt.subplot2grid((1,40), (0, 0), colspan=10, facecolor='white')
except AttributeError:
ax=plt.subplot2grid((1,40), (0, 0), colspan=10, axisbg='white')
fig.subplots_adjust(wspace=0, hspace=0)
ax1.set_title('Roary matrix\n(%d gene clusters)'%roary.shape[0])
if options.labels:
fsize = 12 - 0.1*roary.shape[1]
if fsize < 7:
fsize = 7
with plt.rc_context({'font.size': fsize}):
Phylo.draw(t, axes=ax,
label_func=lambda x: str(x)[:10],
xticks=([],), yticks=([],),
ylabel=('',), xlabel=('',),
title=('Tree\n(%d strains)'%roary.shape[1],),
Phylo.draw(t, axes=ax,
label_func=lambda x: None,
xticks=([],), yticks=([],),
ylabel=('',), xlabel=('',),
title=('Tree\n(%d strains)'%roary.shape[1],),
plt.savefig('pangenome_matrix.%s'%options.format, dpi=300)
# Plot the pangenome pie chart
plt.figure(figsize=(10, 10))
core = roary[(roary.sum(axis=1) >= roary.shape[1]*0.99) & (roary.sum(axis=1) <= roary.shape[1] )].shape[0]
softcore = roary[(roary.sum(axis=1) >= roary.shape[1]*0.95) & (roary.sum(axis=1) < roary.shape[1]*0.99)].shape[0]
shell = roary[(roary.sum(axis=1) >= roary.shape[1]*0.15) & (roary.sum(axis=1) < roary.shape[1]*0.95)].shape[0]
cloud = roary[roary.sum(axis=1) < roary.shape[1]*0.15].shape[0]
total = roary.shape[0]