Commit 44a3e773 authored by root

Update for August 2021

parent 0780dff0
......@@ -18,6 +18,7 @@ NAMD/apoa1
NAMD/NAMD_BENCHMARKS_SPARTAN
NAMD/stmv
Trimmomatic/.backup
PyTorch/data/*
*.fastq
*.fastq.gz
*.fasta
......@@ -45,3 +46,4 @@ Trimmomatic/.backup
*.data
*.inp
*.fast5
*.pt
Spacing in shell scripts is important!
$ UPDATE_STATUS=0
$ if [[ ${UPDATE_STATUS}=1 ]]; then echo Why does this work; fi
The echo runs even though UPDATE_STATUS is 0. Without spaces around the `=`, the whole expression expands to the single non-empty string `0=1`, and `[[ ... ]]` treats a bare non-empty string as true, so no comparison is performed at all.
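With spaces around the operator the test becomes a genuine string comparison. A minimal sketch using the same variable:
$ if [[ ${UPDATE_STATUS} = 1 ]]; then echo "Now this does not print"; fi
$ if [[ ${UPDATE_STATUS} = 0 ]]; then echo "But this does"; fi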
What's wrong with spaces in filenames?
$ touch "This is a long file name"
$ for item in $(ls ./*); do echo ${item}; done
Command line arguments are separated by whitespace unless they are quoted. When an unquoted variable or command substitution contains spaces, the shell splits it into separate fields; in the loop above, the single filename "This is a long file name" is split and echoed as six separate words.
One way around this is to change the Internal Field Separator (IFS) in the shell script, for example by setting a newline as the delimiter before the loop construct, as in the script below. An alternative that avoids changing IFS is sketched after the script.
$ mkdir test; cd test
$ touch "This is a long file name"
$ cat ifs.sh
#!/bin/bash
IFS=$'\n'
for item in $(ls ./*)
do echo ${item}
done
unset IFS
exit 0
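As an alternative sketch that avoids modifying IFS (assuming the same test directory as above), the loop can iterate over the glob itself and quote the variable, so each filename stays a single word:
#!/bin/bash
# Loop over the glob rather than the word-split output of ls;
# quoting "${item}" preserves spaces within each filename.
for item in ./*
do echo "${item}"
done
exit 0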
......@@ -8,7 +8,7 @@ To run a sample CUDA job start with interactive job. Change the qos as appropria
Invoke the old (2015-2020) build system
`source /usr/local/module/spartan_new.sh`
`source /usr/local/module/spartan_old.sh`
Load a CUDA module
......@@ -133,7 +133,6 @@ Refactor the code so that the index is correct, recompile, and execute.
`nvcc 01-thread-and-block-idx-solution.cu -o indexCUDA -gencode arch=compute_60,code=sm_60`
# Accelerating For Loops
Consider the non-accelerated (CPU-based) loop, compile and run.
......@@ -150,7 +149,6 @@ Refactor, recompile, and execute.
Modify the number of iterations as desired.
# Multiple Blocks of Threads
Consider the non-accelerated (CPU-based) loop, compile and run.
......
......@@ -26,7 +26,6 @@ sbatch -q gpgpudeeplearn -p deeplearn
scontrol show partition deeplearn
# Derived from:
# https://stackoverflow.com/questions/7663343/simplest-possible-example-to-show-gpu-outperform-cpu-using-cuda
......
#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --nodelist=spartan-bm005
#SBATCH --nodelist=spartan-bm055
# Alternative to exclude specific nodes.
# SBATCH --exclude=spartan-bm005
# SBATCH --exclude=spartan-bm055
echo $(hostname) $SLURM_JOB_NAME running $SLURM_JOBID >> hostname.txt
#!/bin/bash
#SBATCH -p physical
#SBATCH --ntasks=8
module load MATLAB
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --time=1:0:0
module purge
module load MATLAB/2016a
time matlab -nodesktop -nodisplay -nosplash < tictoc.m
time matlab -nodesktop -nodisplay -nosplash < tictoc-p.m
#!/bin/bash
#SBATCH --ntasks=8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --time=1:0:0
module purge
......
......@@ -9,3 +9,8 @@ $ matlab
To install packages for your own environment, follow the instructions here:
https://www.mathworks.com/help/matlab/matlab_env/get-add-ons.html
Remember to use the addpath function to ensure that new packages (or modules) are included in your environment.
http://www.ece.northwestern.edu/local-apps/matlabhelp/techdoc/ref/addpath.html
poolobj = parpool;
fprintf('Number of workers: %g\n', poolobj.NumWorkers);
tic
n = 400;
A = 1000;
a = zeros(n);
parfor i = 1:n
a(i) = max(abs(eig(rand(A))));
end
......
#!/bin/bash
#SBATCH --partition gpgpu
# Use a project ID that has access.
# SBATCH --account=test
#SBATCH --qos=gpgpuhpcadmin
# For example, if you wish to access up to four GPUs in a single job use:
#SBATCH --gres=gpu:4
module load fosscuda/2020b
module load numba/0.53.1
python3 gpu.py
#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
module load foss/2020b
module load numba/0.53.1
export OMP_NUM_THREADS=4
time python3 monte-carlo.py
#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
module load foss/2020b
module load numba/0.53.1
export OMP_NUM_THREADS=4
time python3 lennardjones.py
sleep 60
python compile.py
from numba import jit
import numpy as np
import time
x = np.arange(100).reshape(10, 10)
@jit(nopython=True)
def go_fast(a):  # Function is compiled and runs in machine code
    trace = 0.0
    for i in range(a.shape[0]):
        trace += np.tanh(a[i, i])
    return a + trace
# DO NOT REPORT THIS... COMPILATION TIME IS INCLUDED IN THE EXECUTION TIME!
start = time.time()
go_fast(x)
end = time.time()
print("Elapsed (with compilation) = %s" % (end - start))
# NOW THE FUNCTION IS COMPILED, RE-TIME IT EXECUTING FROM CACHE
start = time.time()
go_fast(x)
end = time.time()
print("Elapsed (after compilation) = %s" % (end - start))
from numba import cuda, float32
# Controls threads per block and shared memory usage.
# The computation will be done on blocks of TPBxTPB elements.
TPB = 16
@cuda.jit
def fast_matmul(A, B, C):
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x  # blocks per grid
    if x >= C.shape[0] and y >= C.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return
    # Each thread computes one element in the result matrix.
    # The dot product is chunked into dot products of TPB-long vectors.
    tmp = 0.
    for i in range(bpg):
        # Preload data into shared memory
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]
        # Wait until all threads finish preloading
        cuda.syncthreads()
        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]
        # Wait until all threads finish computing
        cuda.syncthreads()
    C[x, y] = tmp
import numba
@numba.njit
def lj_numba_scalar_prange(r):
    sr6 = (1./r)**6
    pot = 4.*(sr6*sr6 - sr6)
    return pot

@numba.njit
def distance_numba_scalar_prange(atom1, atom2):
    dx = atom2[0] - atom1[0]
    dy = atom2[1] - atom1[1]
    dz = atom2[2] - atom1[2]
    r = (dx * dx + dy * dy + dz * dz) ** 0.5
    return r

@numba.njit(parallel=True)
def potential_numba_scalar_prange(cluster):
    energy = 0.0
    # numba.prange requires parallel=True flag to compile.
    # It causes the loop to run in parallel in multiple threads.
    for i in numba.prange(len(cluster)-1):
        for j in range(i + 1, len(cluster)):
            r = distance_numba_scalar_prange(cluster[i], cluster[j])
            e = lj_numba_scalar_prange(r)
            energy += e
    return energy
# Example derived from
# https://www.analyticsvidhya.com/blog/2021/04/numba-for-data-science-make-your-py-code-run-1000x-faster/
import random
from numba import jit,njit,vectorize
import time
def monte_carlo_pi(nsamples):
    acc = 0
    for i in range(nsamples):
        x = random.random()
        y = random.random()
        if (x ** 2 + y ** 2) < 1.0:
            acc += 1
    return 4.0 * acc / nsamples
# Standard Python
start = time.time()
monte_carlo_pi( 10000 )
end = time.time()
print(end - start)
# Numba JIT compilation, example actually slower!
start = time.time()
monte_carlo_pi_jit = jit()(monte_carlo_pi)
monte_carlo_pi_jit( 10000 )
end = time.time()
print(end - start)
# Object now compiled, now 1000x faster!
start = time.time()
monte_carlo_pi_jit( 10000 )
end = time.time()
print(end - start)
#!/bin/bash
#SBATCH --partition=gpgpu
# Change the qos for your gpu access
#SBATCH --qos=gpgpuhpcadmin
#SBATCH --time=04:00:00
#SBATCH --gres=gpu:p100:4
module load fosscuda/2019b
module load pytorch/1.5.1-python-3.7.4
module load web_proxy
python3 quickstart_tutorial.py
sleep 60
python3 tensorqs_tutorial.py
# Introduction
The PyTorch project offers some excellent tutorials, which are replicated here for the Spartan environment.
https://github.com/pytorch/tutorials
A typical PyTorch workflow for machine learning (a subset of A.I.) involves working with data, creating models, optimising model parameters, and training and saving models.
In this example, the FashionMNIST dataset is used to train a neural network that predicts whether an input image belongs to one of
the following classes: T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt, Sneaker, Bag, or Ankle boot.
## What is PyTorch?
PyTorch is a Python-based scientific computing package serving two broad purposes:
* A replacement for NumPy to use the power of GPUs and other accelerators.
* An automatic differentiation library that is useful to implement neural networks.
## Slurm Requirements
Start an interactive job
$ sinteractive --partition=gpgpu --gres=gpu:p100:4 --qos=gpgpuhpcadmin --time=04:00:00
Load modules
$ module load fosscuda/2019b
$ module load pytorch/1.5.1-python-3.7.4
Note that the fosscuda/2019b module consists of gcc-cuda/8.3.0-10.1.243 with OpenMPI 3.1.4
See what else is loaded
$ module list
The web_proxy module is also required for internet access (for example, to download the dataset) when operating on a compute node.
$ module load web_proxy
Run the machine-learning example.
$ python3 quickstart_tutorial.py
Or submit the Slurm script
$ sbatch 2019pytorch.slurm
## Step-by-Step Explanation of Basic Tutorial
Start Python and import the libraries
$ python3
..
>>> import torch
>>> from torch import nn
>>> from torch.utils.data import DataLoader
>>> from torchvision import datasets
>>> from torchvision.transforms import ToTensor, Lambda, Compose
>>> import matplotlib.pyplot as plt
PyTorch has two primitives to work with data: torch.utils.data.DataLoader and torch.utils.data.Dataset. The Dataset primitive stores
the samples and their corresponding labels, whereas the DataLoader primitive wraps an iterable around the Dataset.
The torchvision.datasets module contains Dataset objects for many real-world vision data. In this tutorial, we use the FashionMNIST
dataset. Every TorchVision Dataset includes two arguments: transform and target_transform to modify the samples and labels
respectively.
- - -
# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)
- - -
The Dataset is passed as an argument to DataLoader, which wraps an iterable over the dataset and supports automatic batching,
sampling, shuffling and multiprocess data loading. A batch size of 64 is defined, i.e. each element in the dataloader iterable
will return a batch of 64 features and labels.
- - -
batch_size = 64
# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
for X, y in test_dataloader:
    print("Shape of X [N, C, H, W]: ", X.shape)
    print("Shape of y: ", y.shape, y.dtype)
    break
- - -
## Creating Models
To define a neural network in PyTorch, we create a class that inherits from nn.Module. We define the layers of the network in the
__init__ function and specify how data will pass through the network in the forward function. To accelerate operations in the neural
network, we move it to the GPU if available.
- - -
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))
# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
model = NeuralNetwork().to(device)
print(model)
- - -
## Optimizing the Model Parameters
To train a model, we need a loss function and an optimizer.
- - -
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
- - -
In a single training loop, the model makes predictions on the training dataset (fed to it in batches), and backpropagates the
prediction error to adjust the model's parameters.
- - -
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
- - -
Check the model's performance against the test dataset to ensure it is learning.
- - -
def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
- - -
The training process is conducted over several iterations (epochs). During each epoch, the model learns parameters to make better
predictions. We print the model's accuracy and loss at each epoch; we’d like to see the accuracy increase and the loss decrease with
every epoch.
- - -
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)
print("Done!")
- - -
## Saving Models
A common way to save a model is to serialize the internal state dictionary (containing the model parameters).
- - -
torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Model State to model.pth")
- - -
## Loading Models
The process for loading a model includes re-creating the model structure and loading the state dictionary into it.
- - -
model = NeuralNetwork()
model.load_state_dict(torch.load("model.pth"))
- - -
This model can now be used to make predictions.
- - -
classes = [
"T-shirt/top",
"Trouser",
"Pullover",
"Dress",
"Coat",
"Sandal",
"Shirt",
"Sneaker",
"Bag",
"Ankle boot",
]
model.eval()
x, y = test_data[0][0], test_data[0][1]
with torch.no_grad():
    pred = model(x)
    predicted, actual = classes[pred[0].argmax(0)], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')
- - -
"""
`Learn the Basics <intro.html>`_ ||
**Quickstart** ||
`Tensors <tensorqs_tutorial.html>`_ ||
`Datasets & DataLoaders <data_tutorial.html>`_ ||
`Transforms <transforms_tutorial.html>`_ ||
`Build Model <buildmodel_tutorial.html>`_ ||
`Autograd <autogradqs_tutorial.html>`_ ||
`Optimization <optimization_tutorial.html>`_ ||
`Save & Load Model <saveloadrun_tutorial.html>`_
Quickstart
===================
This section runs through the API for common tasks in machine learning. Refer to the links in each section to dive deeper.
Working with data
-----------------
PyTorch has two `primitives to work with data <https://pytorch.org/docs/stable/data.html>`_:
``torch.utils.data.DataLoader`` and ``torch.utils.data.Dataset``.
``Dataset`` stores the samples and their corresponding labels, and ``DataLoader`` wraps an iterable around
the ``Dataset``.
"""
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
######################################################################
# PyTorch offers domain-specific libraries such as `TorchText <https://pytorch.org/text/stable/index.html>`_,
# `TorchVision <https://pytorch.org/vision/stable/index.html>`_, and `TorchAudio <https://pytorch.org/audio/stable/index.html>`_,
# all of which include datasets. For this tutorial, we will be using a TorchVision dataset.
#
# The ``torchvision.datasets`` module contains ``Dataset`` objects for many real-world vision data like
# CIFAR, COCO (`full list here <https://pytorch.org/vision/stable/datasets.html>`_). In this tutorial, we
# use the FashionMNIST dataset. Every TorchVision ``Dataset`` includes two arguments: ``transform`` and
# ``target_transform`` to modify the samples and labels respectively.
# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)
######################################################################
# We pass the ``Dataset`` as an argument to ``DataLoader``. This wraps an iterable over our dataset, and supports
# automatic batching, sampling, shuffling and multiprocess data loading. Here we define a batch size of 64, i.e. each element
# in the dataloader iterable will return a batch of 64 features and labels.
batch_size = 64
# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
for X, y in test_dataloader:
    print("Shape of X [N, C, H, W]: ", X.shape)
    print("Shape of y: ", y.shape, y.dtype)
    break
######################################################################
# Read more about `loading data in PyTorch <data_tutorial.html>`_.
#
######################################################################
# --------------
#
################################
# Creating Models
# ------------------
# To define a neural network in PyTorch, we create a class that inherits
# from `nn.Module <https://pytorch.org/docs/stable/generated/torch.nn.Module.html>`_. We define the layers of the network
# in the ``__init__`` function and specify how data will pass through the network in the ``forward`` function. To accelerate
# operations in the neural network, we move it to the GPU if available.
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))
# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
model = NeuralNetwork().to(device)
print(model)
######################################################################
# Read more about `building neural networks in PyTorch <buildmodel_tutorial.html>`_.
#
######################################################################
# --------------
#
#####################################################################
# Optimizing the Model Parameters
# ----------------------------------------
# To train a model, we need a `loss function <https://pytorch.org/docs/stable/nn.html#loss-functions>`_
# and an `optimizer <https://pytorch.org/docs/stable/optim.html>`_.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
#######################################################################
# In a single training loop, the model makes predictions on the training dataset (fed to it in batches), and
# backpropagates the prediction error to adjust the model's parameters.
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
##############################################################################
# We also check the model's performance against the test dataset to ensure it is learning.