Commit fe028630 authored by root's avatar root
Browse files

Add KRAKEN, RBio and Roary samples

parent 2e6fbc31
# To give your job a name, replace "MyJob" with an appropriate name
#SBATCH --job-name=KRAKEN-test.slurm
# Run multicore
#SBATCH --ntasks-per-node=1
# Set your minimum acceptable walltime=days-hours:minutes:seconds
#SBATCH -t 0:15:00
# Specify your email address to be notified of progress.
# SBATCH --mail-type=ALL
# Load the environment variables
module purge
module load fastqc/0.11.9-java-11.0.2
module load kraken/1.1-perl-5.30.0
# Use FastQC to check the quality of our data
cd wms/data/sub_100000
fastqc *.fastq
# Assign the database
# Running the KRAKEN loop
# Please change the directory path to suit your location
for i in *_1.fastq
prefix=$(basename $i _1.fastq)
# print which sample is being processed
echo $prefix
kraken --db $KRAKEN_DB \
${prefix}_1.fastq ${prefix}_2.fastq > ~/KRAKEN/wms/results/${prefix}.tab
kraken-report --db $KRAKEN_DB \
~/KRAKEN/wms/results/${prefix}.tab > ~/KRAKEN/wms/results/${prefix}_tax.txt
Derived from a tutorial from the Swedish Univeristy of Agricultural Sciences.
Kraken is a system for assigning taxonomic labels to short DNA sequences (i.e. reads) Kraken aims to achieve high sensitivity and
high speed by utilizing exact alignments of k-mers and a novel classification algorithm.
Kraken uses a new approach with exact k-mer matching to assign taxonomy to short reads. It is extremely fast compared to
traditional approaches (i.e. BLAST).
In this example samples are compared from the Pig Gut Microbiome to samples from the Human Gut Microbiome.
Whole Metagenome Sequencing, "shotgun metagenome sequencing", is used, a relatively new and powerful sequencing approach that
provides insight into community biodiversity and function. On the contrary of Metabarcoding, where only a specific region of the
bacterial community (the 16s rRNA) is sequenced, WMS aims at sequencing all the genomic material present in the environment.
The dataset (subset_wms.tar.gz) uses a sample of 3 pigs and 3 humans.
FastQC is used to check the quality of our data.
Kraken includes a script called kraken-report to transform this file into a "tree" view with the percentage of reads assigned to
each taxa. Once the job is run take a look at the _tax.txt files in the results directory.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
sample organism
ERR1135369 pig
ERR1135370 pig
ERR1135375 pig
ERR321573 human
ERR321574 human
ERR321578 human
# To give your job a name, replace "MyJob" with an appropriate name
#SBATCH --job-name=RBio-test.slurm
# Run multicore
#SBATCH --ntasks-per-node=4
# Set your minimum acceptable walltime=days-hours:minutes:seconds
#SBATCH -t 5:00:00
# Specify your email address to be notified of progress.
# SBATCH --mail-type=ALL
# Load the environment variables
module purge
module load foss/2019b
module load r-bundle-bioconductor/3.10-r-3.6.2
module load web_proxy
# Running the R script
R --vanilla < Metabarcoding.R
# Load packages
# Check the dataset
path <- 'MiSeq_SOP'
# Filter and Trim
raw_forward <- sort(list.files(path, pattern="_R1_001.fastq",
raw_reverse <- sort(list.files(path, pattern="_R2_001.fastq",
# we also need the sample names
sample_names <- sapply(strsplit(basename(raw_forward), "_"),
`[`, # extracts the first element of a subset
# Initial plot read quality. Forward reads are good quality while the reverse are way worse.
# Dada2 requires us to define the name of our output files
# place filtered files in filtered/ subdirectory
filtered_path <- file.path(path, "filtered")
filtered_forward <- file.path(filtered_path,
paste0(sample_names, "_R1_trimmed.fastq.gz"))
filtered_reverse <- file.path(filtered_path,
paste0(sample_names, "_R2_trimmed.fastq.gz"))
# Use standard filtering parameters: maxN=0 (DADA22 requires no Ns), truncQ=2, rm.phix=TRUE and maxEE=2. The maxEE parameter sets the maximum
# number of “expected errors” allowed in a read, which according to the USEARCH authors is a better filter than simply averaging quality
# scores.
out <- filterAndTrim(raw_forward, filtered_forward, raw_reverse,
filtered_reverse, truncLen=c(240,160), maxN=0,
maxEE=c(2,2), truncQ=2, rm.phix=TRUE, compress=TRUE,
# Learn and plot the Error Rates
# The DADA2 algorithm depends on a parametric error model and every amplicon dataset has a slightly different error rate. The learnErrors of
# Dada2 learns the error model from the data and will help DADA2 to fits its method to your data.
errors_forward <- learnErrors(filtered_forward, multithread=TRUE)
errors_reverse <- learnErrors(filtered_reverse, multithread=TRUE)
plotErrors(errors_forward, nominalQ=TRUE) +
# Dereplication
# From the Dada2 documentation: Dereplication combines all identical sequencing reads into into “unique sequences” with a corresponding
# “abundance”: the number of reads with that unique sequence. Dereplication substantially reduces computation time by eliminating redundant
# comparisons.
derep_forward <- derepFastq(filtered_forward, verbose=TRUE)
derep_reverse <- derepFastq(filtered_reverse, verbose=TRUE)
# name the derep-class objects by the sample names
names(derep_forward) <- sample_names
names(derep_reverse) <- sample_names
# Sample inference
# We are now ready to apply the core sequence-variant inference algorithm to the dereplicated data.
dada_forward <- dada(derep_forward, err=errors_forward, multithread=TRUE)
dada_reverse <- dada(derep_reverse, err=errors_reverse, multithread=TRUE)
# inspect the dada-class object
# Merge Paired-end Reads
# Now that the reads are trimmed, dereplicated and error-corrected we can merge them together
merged_reads <- mergePairs(dada_forward, derep_forward, dada_reverse,
derep_reverse, verbose=TRUE)
# inspect the merger data.frame from the first sample
# Construct Sequence Table
# Construct a sequence table of our mouse samples, a higher-resolution version of the OTU table produced by traditional methods.
seq_table <- makeSequenceTable(merged_reads)
# inspect distribution of sequence lengths
# Remove Chimeras
# The dada method used earlier removes substitutions and indel errors but chimeras remain.
seq_table_nochim <- removeBimeraDenovo(seq_table, method='consensus',
multithread=TRUE, verbose=TRUE)
# which percentage of our reads did we keep?
sum(seq_table_nochim) / sum(seq_table)
# Check the number of reads that made it through each step in the pipeline
get_n <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dada_forward, get_n), sapply(merged_reads, get_n),
rowSums(seq_table), rowSums(seq_table_nochim))
colnames(track) <- c('input', 'filtered', 'denoised', 'merged', 'tabled',
rownames(track) <- sample_names
# Assign Taxonomy
# Aassign taxonomy to the sequences using the SILVA database
taxa <- assignTaxonomy(seq_table_nochim,
taxa <- addSpecies(taxa, 'MiSeq_SOP/silva_species_assignment_v128.fa.gz')
# for inspecting the classification
taxa_print <- taxa # removing sequence rownames for display only
rownames(taxa_print) <- NULL
# Phylogenetic Tree
# DADA2 is reference-free so we have to build the tree ourselves
# Align our sequences
sequences <- getSequences(seq_table)
names(sequences) <- sequences # this propagates to the tip labels of the tree
alignment <- AlignSeqs(DNAStringSet(sequences), anchor=NA)
# Build a neighbour-joining tree then fit a maximum likelihood tree using the neighbour-joining tree as a starting point.
phang_align <- phyDat(as(alignment, 'matrix'), type='DNA')
dm <-
treeNJ <- NJ(dm) # note, tip order != sequence order
fit = pml(treeNJ, data=phang_align)
## negative edges length changed to 0!
fitGTR <- update(fit, k=4, inv=0.2)
fitGTR <- optim.pml(fitGTR, model='GTR', optInv=TRUE, optGamma=TRUE,
rearrangement = 'stochastic',
control = pml.control(trace = 0))
detach('package:phangorn', unload=TRUE)
# Phyloseq. First load the metadata
sample_data <- read.table(
header=TRUE, row.names="sample_name")
# Construct a phyloseq object from our output and newly created metadata
physeq <- phyloseq(otu_table(seq_table_nochim, taxa_are_rows=FALSE),
# remove mock sample
physeq <- prune_samples(sample_names(physeq) != 'Mock', physeq)
# Look at the alpha diversity of our samples
plot_richness(physeq, x='day', measures=c('Shannon', 'Fisher'), color='when') +
# Ordination methods (beta diversity)
# Perform an MDS with euclidean distance (mathematically equivalent to a PCA)
ord <- ordinate(physeq, 'MDS', 'euclidean')
plot_ordination(physeq, ord, type='samples', color='when',
title='PCA of the samples from the MiSeq SOP') +
# With the Bray-Curtis distance
ord <- ordinate(physeq, 'NMDS', 'bray')
plot_ordination(physeq, ord, type='samples', color='when',
title='PCA of the samples from the MiSeq SOP') +
# Distribution of the most abundant families
top20 <- names(sort(taxa_sums(physeq), decreasing=TRUE))[1:20]
physeq_top20 <- transform_sample_counts(physeq, function(OTU) OTU/sum(OTU))
physeq_top20 <- prune_taxa(top20, physeq_top20)
plot_bar(physeq_top20, x='day', fill='Family') +
facet_wrap(~when, scales='free_x') +
# Place them in a tree
bacteroidetes <- subset_taxa(physeq, Phylum %in% c('Bacteroidetes'))
plot_tree(bacteroidetes, ladderize='left', size='abundance',
group dpw
F3D0 0
F3D1 1
F3D141 141
F3D142 142
F3D143 143
F3D144 144
F3D145 145
F3D146 146
F3D147 147
F3D148 148
F3D149 149
F3D150 150
F3D2 2
F3D3 3
F3D5 5
F3D6 6
F3D7 7
F3D8 8
F3D9 9
group time
F3D0 Early
F3D1 Early
F3D141 Late
F3D142 Late
F3D143 Late
F3D144 Late
F3D145 Late
F3D146 Late
F3D147 Late
F3D148 Late
F3D149 Late
F3D150 Late
F3D2 Early
F3D3 Early
F3D5 Early
F3D6 Early
F3D7 Early
F3D8 Early
F3D9 Early
pcr.seqs(fasta=silva.bacteria.fasta, start=11894, end=25319, keepdots=F)
system(mv silva.bacteria.pcr.fasta silva.v4.fasta)
#change the name of the file from stability.files to whatever suits your study
make.contigs(file=stability.files, processors=8)
screen.seqs(fasta=current, group=current, maxambig=0, maxlength=275)
count.seqs(name=current, group=current)
align.seqs(fasta=current, reference=silva.v4.fasta)
screen.seqs(fasta=current, count=current, start=1968, end=11550, maxhomop=8)
filter.seqs(fasta=current, vertical=T, trump=.)
unique.seqs(fasta=current, count=current)
pre.cluster(fasta=current, count=current, diffs=2)
chimera.uchime(fasta=current, count=current, dereplicate=t)
remove.seqs(fasta=current, accnos=current)
classify.seqs(fasta=current, count=current, reference=trainset9_032012.pds.fasta,, cutoff=80)
remove.lineage(fasta=current, count=current, taxonomy=current, taxon=Chloroplast-Mitochondria-unknown-Archaea-Eukaryota)
remove.groups(count=current, fasta=current, taxonomy=current, groups=Mock)
cluster.split(fasta=current, count=current, taxonomy=current, splitmethod=classify, taxlevel=4, cutoff=0.15)
make.shared(list=current, count=current, label=0.03)
classify.otu(list=current, count=current, taxonomy=current, label=0.03)
make.shared(list=current, count=current, label=1)
classify.otu(list=current, count=current, taxonomy=current, label=1)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment