Commit 3ce0c5c1 authored by root

Update for 6/1/2021

parent 9f2c5178
ABAQUS/Door.odb
AdvLinux/NAMD
AFNI/ARzs_data.tgz
Beast/Dengue4.env.trees
BLAST/dbs
BLAST/rat-ests
......@@ -10,11 +9,8 @@ Delft3D/*
!Delft3D/run_all_examples.sh
!Delft3D/run_all_examples_sp.sh
!Delft3D/sed_in_file.tcl
digits/digits.img
FreeSurfer/buckner_data
FreeSurfer/buckner_data-tutorial_subjs.tar.gz
FSL/intro
FSL/preCourse.tar.gz
FSL/fmri
Gaussian/g16
Gaussian/tests
......@@ -24,9 +20,24 @@ NAMD/apoa1
NAMD/NAMD_BENCHMARKS_SPARTAN
NAMD/stmv
Python/minitwitter.csv
SAMtools/sample.sam.gz
Singularity/vsoch-hello-world-master.simg
Trimmomatic/*.gz
Trimmomatic/*.fa
Trimmomatic/.backup
*.fastq
*.fastq.gz
*.fasta
*.faa
*.tar
*.tar.gz
*.sam
*.sam.gz
*.simg
*.gz
*.fa
*.img
*.tgz
*.zip
*.jpg
*.png
*.jpeg
*.JPG
*.PNG
*.JPEG
#!/bin/bash
#SBATCH --job-name=ansys-multithread-test
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH --output=output.%j.ansys-multithread-test
#SBATCH --time=01:00:00
#### SLURM: 1 node, 4 processors per node Ansys test to run for 1 hour.
module load ansys_cfd/20.1
ANSYS_OPTS="-p aa_r -dir $(pwd) -b -np $SLURM_NTASKS"
time ansys201 $ANSYS_OPTS < ansys-test.in
#!/bin/bash
#SBATCH --job-name=ansys-serial-test
#SBATCH --ntasks=1
#SBATCH --output=output.%j.ansys-serial-test
#SBATCH --time=01:00:00
#### SLURM 1 processor Ansys test to run for 1 hour.
module load ansys_cfd/20.1
ANSYS_OPTS="-p aa_r -dir $(pwd) -b"
time ansys201 $ANSYS_OPTS < ansys-test.in
#!/bin/bash
# Add your project account details here.
# SBATCH --account=XXXX
#SBATCH --partition=snowy
#SBATCH --ntasks=1
#SBATCH --time=1:00:00
# Using a CUDA version of Amber and GPUs?
# You'll need to invoke the correct directives instead:
# SBATCH --partition=gpgpu
# SBATCH --gres=gpu:p100:4
# Change this as appropriate.
# SBATCH --qos=gpgpuhpcadmin
# Invoke the old build system.
module purge
source /usr/local/module/spartan_old.sh
module load Amber/16-gompi-2017b-CUDA-mpi
srun /usr/local/easybuild/software/Amber/16-gompi-2017b-CUDA-mpi/amber16/bin/pmemd.cuda_DPFP.MPI -O -i mdin -o mdout -inf mdinfo -x mdcrd -r restrt
# Select a version of Amber
# module load Amber/16-GCC-6.2.0
# module load Amber/16-GCC-6.2.0-CUDA
# module load Amber/16-gompi-2017b-CUDA-mpi
module load AmberTools/18-spartan_intel-2017.u2
ambpdb -p 0.15_80_10_pH6.5_1ODX.top -c 0.15_80_10_pH6.5_1ODX.crd > 0.15_80_10_pH6.5_1ODX.pdb
# LL 20190805
Amber (originally Assisted Model Building with Energy Refinement) is software for performing molecular dynamics and structure prediction.
TUTORIAL NOT YET COMPLETE
The sample job is a step from the more extensive tutorial at: https://ambermd.org/tutorials/pengfei/index.htm
It generates a protein PDB file with the ambpdb program from the topology and coordinate files of the protein system.
#!/bin/bash
# To give your job a name, replace "MyJob" with an appropriate name
#SBATCH --job-name=Bowtie-test.slurm
# Run on single CPU
#SBATCH --ntasks=1
# set your minimum acceptable walltime=days-hours:minutes:seconds
#SBATCH -t 0:15:00
# Specify your email address to be notified of progress.
# SBATCH --mail-user=youremailaddress@unimelb.edu.au
# SBATCH --mail-type=ALL
# Load the environment variables
module purge
module load foss/2019b
module load bowtie2/2.3.5.1
module load samtools/1.9
module load bcftools/1.9
# Build an index
bowtie2-build pO157_Sakai.fasta.gz pO157_Sakai
# Map the reads, using the trimmed data from fastqc (qv)
bowtie2 -x pO157_Sakai -1 SRR957824_trimmed_R1.fastq -2 SRR957824_trimmed_R2.fastq -S SRR957824.sam
# Convert the SAM file into BAM, a compressed version of SAM that can be indexed.
samtools view -hSbo SRR957824.bam SRR957824.sam
# Sort the bam file per position in the genome and index it
samtools sort SRR957824.bam -o SRR2584857.sorted.bam
samtools index SRR2584857.sorted.bam
# Set up an interactive session with X-windows forwarding or use FastX and visualise with samtools tview
# samtools tview SRR2584857.sorted.bam pO157_Sakai.fasta.gz
# A frequent application for mapping reads is variant calling, i.e. finding positions where the reads are systematically different
# from the reference genome. Single nucleotide polymorphism (SNP)-based typing is particularly popular and used for a broad range of
# applications. For an EHEC O157 outbreak you could use it to identify the source, for instance.
samtools mpileup -uD -f pO157_Sakai.fasta.gz SRR2584857.sorted.bam | bcftools view - > variants.vcf
# In a graphical interactive session examine one of the variants with tview
# samtools tview SRR2584857.sorted.bam pO157_Sakai.fasta.gz -p 'gi|10955266|ref|NC_002128.1|:43071'
Derived from the Swedish University of Agricultural Sciences.
This file contains the sequence of the pO157 plasmid from the Sakai outbreak strain of E. coli O157.
Available from: curl -O -J -L https://osf.io/rnzbe/download
The example maps a prepared read set against the reference sequence of the virulence plasmid to determine whether pO157 is present in the St. Louis
outbreak strain.
#!/bin/bash
# To give your job a name, replace "MyJob" with an appropriate name
#SBATCH --job-name=Busco-test.slurm
# Run on single CPU
#SBATCH --ntasks=1
# set your minimum acceptable walltime=days-hours:minutes:seconds
#SBATCH -t 0:15:00
# Specify your email address to be notified of progress.
# SBATCH --mail-user=youremailaddress@unimelb.edu.au
# SBATCH --mail-type=ALL
# Load the environment variables
module purge
module load foss/2019b
module load busco/4.0.5-python-3.7.4
module load web_proxy
# You will need to create a busco configuration file. See README.md
# Then modify and uncomment the following line.
# export BUSCO_CONFIG_FILE="/path/to/myconfig.ini"
export BUSCO_CONFIG_FILE="/home/lev/Busco/my-busco.conf"
# The m_genitalium.fasta file is from final.contigs.fa generated in the MEGAHIT example.
busco -i m_genitalium.fasta -l bacteria_odb10 -o busco_genitalium -m genome
NOTA BENE: TUTORIAL IS INCOMPLETE. RUNS BUT WITH ERRORS.
Some content derived from the Swedish University of Agricultural Sciences.
Busco (Benchmarking Universal Single-Copy Orthologs) can be used to find marker genes in an assembly. Marker genes are conserved
across a range of species, and finding intact conserved genes in an assembly is a good indication of its quality.
The file `m_genitalium.fasta` is from the MEGAHIT job example.
The file bacteria_odb9.tar.gz is available from:
http://busco.ezlab.org/v2/datasets/bacteria_odb9.tar.gz
Busco requires editing of a configuration file to operate. A suggested process is as follows:
mkdir Busco
cd Busco
module load foss/2019b ; module load busco/4.0.5-python-3.7.4
cp /usr/local/easybuild-2019/easybuild/software/mpi/gcc/8.3.0/openmpi/3.1.4/busco/4.0.5-python-3.7.4/config/config.ini my-busco.conf
busco_configurator.py /usr/local/easybuild-2019/easybuild/software/mpi/gcc/8.3.0/openmpi/3.1.4/busco/4.0.5-python-3.7.4/config/config.ini my-busco.conf
#include <stdio.h>

/*
 * Refactor `loop` to be a CUDA kernel. The new kernel should
 * only do the work of 1 iteration of the original loop.
 */
void loop(int N)
{
  for (int i = 0; i < N; ++i)
  {
    printf("This is iteration number %d\n", i);
  }
}

/*
 * This kernel does the work of only 1 iteration of the
 * original for loop. Notice the absence of the previously
 * expected argument `N`; an indication of which "iteration"
 * is being executed is still available via `threadIdx.x`.
 */
__global__ void loop_gpu()
{
  int time_loop = threadIdx.x;
  printf("This is gpu iteration number %d\n", time_loop);
}

int main()
{
  /*
   * It is the execution configuration that sets how many
   * "iterations" of the "loop" will be done. When refactoring
   * `loop` to launch as a kernel, be sure to use the execution
   * configuration to control how many "iterations" to perform.
   *
   * For this exercise, only use 1 block of threads.
   */
  int N = 10;
  loop(N);
  loop_gpu<<<1, N>>>();
  cudaDeviceSynchronize();
}
#include <stdio.h>
__global__ void printSuccessForCorrectExecutionConfiguration()
{
if(threadIdx.x == 1023 && blockIdx.x == 255)
{
printf("Success!\n");
} else {
printf("Failure. Update the execution configuration as necessary.\n");
}
}
int main()
{
/*
* Update the execution configuration so that the kernel
* will print `"Success!"`.
*/
printSuccessForCorrectExecutionConfiguration<<<1, 1>>>();
cudaDeviceSynchronize();
}
#include <stdio.h>

/*
 * Refactor `loop` to be a CUDA kernel. The new kernel should
 * only do the work of 1 iteration of the original loop.
 */
void loop(int N)
{
  for (int i = 0; i < N; ++i)
  {
    printf("This is iteration number %d\n", i);
  }
}

/*
 * The idiomatic expression blockIdx.x * blockDim.x + threadIdx.x
 * gives each thread a unique index within the entire grid.
 */
__global__ void loop_gpu()
{
  int time_loop = blockIdx.x * blockDim.x + threadIdx.x;
  printf("This is gpu iteration number %d\n", time_loop);
}

int main()
{
  /*
   * When refactoring `loop` to launch as a kernel, be sure
   * to use the execution configuration to control how many
   * "iterations" to perform.
   *
   * For this exercise, be sure to use more than 1 block in
   * the execution configuration. Configurations that also meet
   * the exercise constraints include <<<10, 1>>>, <<<5, 2>>>
   * and <<<2, 5>>>.
   */
  int N = 10;
  loop(N);
  loop_gpu<<<N, 1>>>();
  cudaDeviceSynchronize();
}
# Setup
All examples in this directory with a numerical prefix (01-, 02-, etc.) are from NVIDIA.
To run a sample CUDA job, start with an interactive job. Change the qos as appropriate.
`sinteractive --partition=gpgpu --gres=gpu:p100:4 --qos=gpgpuhpcadmin`
Invoke the old (2015-2020) build system
`source /usr/local/module/spartan_old.sh`
Load a CUDA module
`module load CUDA/8.0.44-GCC-4.9.2`
Copy the CUDA directory to home and enter:
`cd ~; cp -r /usr/local/common/CUDA . ; cd CUDA`
# Structure of CUDA Code
As with all parallel programming, start with serial code, engage in decomposition, then generate parallel code.
......@@ -19,49 +39,37 @@ int main()
cudaDeviceSynchronize();
}
The __global__ keyword indicates that the following function will run on the GPU and can be invoked globally, which in this context means by either the CPU or the GPU.
Code executed on the CPU is often referred to as host code, and code running on the GPU is referred to as device code.
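As a minimal sketch of this split (illustrative only; this is not one of the numbered example files), the following program runs one function on the host and one on the device:
#include <stdio.h>
void helloCPU()
{
  // Host code: runs on the CPU.
  printf("Hello from the CPU.\n");
}
__global__ void helloGPU()
{
  // Device code: runs on the GPU.
  printf("Hello from the GPU.\n");
}
int main()
{
  helloCPU();
  helloGPU<<<1, 1>>>();        // launch the kernel with 1 block of 1 thread
  cudaDeviceSynchronize();     // wait for the kernel to finish before exiting
}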
# Hello World from CPU and GPU
A CUDA program can run portions of the code on the CPU and portions on the GPU.
Review the non-CUDA code:
`01-hello-gpu.cu`
Compile and execute:
`nvcc 01-hello-gpu.cu -o helloCUDA -gencode arch=compute_60,code=sm_60`
`./helloCUDA`
Refactor the code to take advantage of CUDA functions, recompile, and execute:
`nvcc 01-hello-gpu-solution.cu -o helloCUDA -gencode arch=compute_60,code=sm_60`
`./helloCUDA`
Or, as an alternative, compile with `-run` at the end of the compilation line, which will run the compiled binary right away.
# Supported Gencode variations for sm and compute
What are those gencode requirements?
Below are the supported sm variations and sample cards from that generation
Supported on CUDA 7 and later
Fermi (CUDA 3.2 until CUDA 8) (deprecated from CUDA 9):
......@@ -95,6 +103,111 @@ Turing (CUDA 10 and later)
(c.f., http://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/)
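For example, a single binary can target more than one generation by passing several `-gencode` flags on the same compile line (the Pascal and Volta architectures chosen here are only an illustration):
`nvcc 01-hello-gpu.cu -o helloCUDA -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70`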
# Parallel Kernels
Review the non-CUDA code:
`01-first-parallel.cu`
Compile and execute:
`nvcc 01-first-parallel.cu -o firstCUDA -gencode arch=compute_60,code=sm_60`
`./firstCUDA`
Refactor the code to take advantage of CUDA functions, recompile, and execute.
`nvcc 01-first-parallel-solution.cu -o firstCUDA -gencode arch=compute_60,code=sm_60`
`./firstCUDA`
Modify the distribution of kernels as desired.
# Thread and Block Indices
Currently, the 01-thread-and-block-idx.cu file contains a kernel function that fails.
`nvcc 01-thread-and-block-idx.cu -o indexCUDA -gencode arch=compute_60,code=sm_60`
Refactor the code so that the index is correct, recompile, and execute.
`nvcc 01-thread-and-block-idx-solution.cu -o indexCUDA -gencode arch=compute_60,code=sm_60`
# Accelerating For Loops
Consider the non-accelerated (CPU-based) loop, compile and run.
`nvcc 01-single-block-loop.cu -o loopCUDA -gencode arch=compute_60,code=sm_60`
`./loopCUDA`
Refactor, recompile, and execute.
`nvcc 01-single-block-loop-solution.cu -o loopCUDA -gencode arch=compute_60,code=sm_60`
`./loopCUDA`
Modify the number of iterations as desired.
# Multiple Blocks of Threads
Consider the non-accelerated (CPU-based) loop, compile and run.
`nvcc 02-multi-block-loop.cu -o loop2CUDA -gencode arch=compute_60,code=sm_60`
`./loop2CUDA`
Refactor, recompile, and execute for multiple blocks.
`nvcc 02-multi-block-loop-solution.cu -o loop2CUDA -gencode arch=compute_60,code=sm_60`
`./loop2CUDA`
Note the order of the printed output; with multiple blocks, the order in which threads print is not guaranteed.
# Loop with a Mismatched Execution
The program in 02-mismatched-config-loop.cu uses cudaMallocManaged to allocate memory for an integer array of 1000 elements, and
then attempts to initialize all the values in the array in parallel using a CUDA kernel function.
`nvcc 02-mismatched-config-loop.cu -o mismatchCUDA -gencode arch=compute_60,code=sm_60`
`./mismatchCUDA`
Refactor, recompile, and execute for multiple blocks.
`nvcc 02-mismatched-config-loop-solution.cu -o mismatchCUDA -gencode arch=compute_60,code=sm_60`
`./mismatchCUDA`
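The general pattern is sketched below (the kernel and variable names are illustrative, not the contents of the exercise file): the kernel guards against indices beyond the end of the array, so the execution configuration may launch more threads than there are elements.
#include <stdio.h>
__global__ void initializeElements(int *a, int N)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < N)          // guard: more threads may be launched than there are elements
  {
    a[i] = i;
  }
}
int main()
{
  int N = 1000;
  int *a;
  cudaMallocManaged(&a, N * sizeof(int));   // memory accessible from both host and device
  int threadsPerBlock = 256;
  int numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;   // round up
  initializeElements<<<numberOfBlocks, threadsPerBlock>>>(a, N);
  cudaDeviceSynchronize();
  printf("a[999] = %d\n", a[999]);
  cudaFree(a);
}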
# Grid-Stride Loops
The number of data elements is often greater than the number of threads in the grid. In that case each thread must process more
than one element, or the work will not be completed. One way to solve this programmatically is the grid-stride loop: a thread
computes its first element with threadIdx.x + blockIdx.x * blockDim.x, and then strides forward by the total number of threads in
the grid (blockDim.x * gridDim.x) until every element has been processed.
`nvcc 03-grid-stride-double.cu -o gridstrideCUDA -gencode arch=compute_60,code=sm_60`
`./gridstrideCUDA`
`nvcc 03-grid-stride-double-solution.cu -o gridstrideCUDA -gencode arch=compute_60,code=sm_60`
`./gridstrideCUDA`
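A sketch of the idiom, assuming an array-doubling kernel similar in spirit to the exercise file (the names here are illustrative):
__global__ void doubleElements(int *a, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;  // first element handled by this thread
  int stride = blockDim.x * gridDim.x;                // total number of threads in the grid
  for (int i = index; i < N; i += stride)             // grid-stride loop over all elements
  {
    a[i] *= 2;
  }
}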
# Debug with printf
Calling printf from a CUDA kernel function is no different from calling printf in CPU code. In the vector addition example, edit vec_add.cu and insert the following code after line 18:
if(threadIdx.x == 10)
    printf("c[%d] = %d\n", id, c[id]);
# CUDA Error Handling
Most CUDA functions return a value of type cudaError_t, which can be used to check for errors when calling a function.
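A minimal sketch of the pattern, using an illustrative initialisation kernel rather than the collapsed example code:
#include <stdio.h>
__global__ void initKernel(int *a, int N)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < N) a[i] = i;
}
int main()
{
  int N = 1000;
  int *a;
  cudaError_t err = cudaMallocManaged(&a, N * sizeof(int));
  if (err != cudaSuccess)                       // most CUDA API calls return cudaError_t
    printf("Error: %s\n", cudaGetErrorString(err));
  initKernel<<<4, 256>>>(a, N);
  err = cudaGetLastError();                     // kernel launches return void; check them with cudaGetLastError
  if (err != cudaSuccess)
    printf("Error: %s\n", cudaGetErrorString(err));
  err = cudaDeviceSynchronize();                // errors during kernel execution surface here
  if (err != cudaSuccess)
    printf("Error: %s\n", cudaGetErrorString(err));
  cudaFree(a);
}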
......
#!/bin/bash
# To give your job a name, replace "MyJob" with an appropriate name
#SBATCH --job-name=FastQC-test1
# Run on single CPU
#SBATCH --ntasks=1
# set your minimum acceptable walltime=days-hours:minutes:seconds
#SBATCH -t 0:15:00
# Specify your email address to be notified of progress.
# SBATCH --mail-user=youremailaddress@unimelb.edu.au
# SBATCH --mail-type=ALL
# To check the quality of the sequence data we will use a tool called FastQC; load its module.
module purge
module load fastqc/0.11.9-java-11.0.2
# Run FastQC
fastqc SRR957824_500K_R1.fastq.gz SRR957824_500K_R2.fastq.gz
#!/bin/bash
# To give your job a name, replace "MyJob" with an appropriate name
#SBATCH --job-name=FastQC-test1
# Run on single CPU
#SBATCH --ntasks=1
# set your minimum acceptable walltime=days-hours:minutes:seconds
#SBATCH -t 0:15:00
# Specify your email address to be notified of progress.
# SBATCH --mail-user=youremailaddress@unimelb.edu.au
# SBATCH --mail-type=ALL
# To check the quality of the sequence data we will use a tool called FastQC; load its module.
module purge
module load fastqc/0.11.9-java-11.0.2
# Run FastQC
fastqc SRR957824_trimmed_R1.fastq SRR957824_trimmed_R2.fastq
Derived from the Swedish University of Agricultural Sciences.
The first dataset you will be working with is from an Illumina MiSeq dataset. The sequenced organism is an enterohaemorrhagic E.
coli (EHEC) of the serotype O157, a potentially fatal gastrointestinal pathogen. The sequenced bacterium was part of an outbreak
investigation in the St. Louis area, USA in 2011. The sequencing was done as paired-end 2x150bp.
The raw data were deposited at the European Nucleotide Archive under the accession number SRR957824. A subset of the original
dataset is used for this tutorial.
FastQC is used to check the quality of the data. There are two example job submission scripts. The first is on the raw data, the
second after running Scythe (qv) and Sickle (qv) to remove poor quality bases.
......@@ -60,7 +60,7 @@ gdb gdbtest
cp gdbtest.c gdbtest2.c
gcc -Wall -g gdbtest2.c -o gdbtest2
$ ./gdbtest2
# There is still another bug! Can you find it? Use GDB to help.
......
#!/bin/bash
# To give your job a name, replace "MyJob" with an appropriate name
#SBATCH --job-name=Gretl-test.slurm
# Run on single CPU
#SBATCH --ntasks=1
# set your minimum acceptable walltime=days-hours:minutes:seconds
#SBATCH -t 0:15:00
# Load the environment variables
module purge
source /usr/local/module/spartan_old.sh
module load gretl/2018c-GCC-6.2.0-LAPACK-3.8.0-OpenBLAS-0.3.5
module load LAPACK/3.8.0-GCC-6.2.0-OpenBLAS-0.3.5
module load OpenSSL/1.0.2l-GCC-6.2.0
module load gnuplot/5.0.0-GCC-6.2.0
# Run a basic Gretl example
gretlcli -b first_ex.inp
open AWM.gdt --quiet # load data from disk
/* data transformations and visualisation */
# the "series" concept: operate on
# vectors on an element-by-element basis
# (but you also have special functions)
series y = 100 * hpfilt(ln(YER))
series u = 100 * URX
series r = STN - 100*sdiff(ln(HICP))
scatters y r u --output=display # command example with an option: graph data
/* in-house VAR */
scalar p = 2 # strong typing: a scalar is not a matrix nor a series
var p y r u # estimation command
A = $coeff # and corresponding accessor
/* by iterated OLS */
list X = y r u # the list is yet another variable type
matrix B = {} # initialize an empty matrix
# loop over the 3 var equations
# using native OLS command
# and store the estimated coefficients
loop foreach i X