Commit 9f2c5178 authored by root's avatar root

Nov 4 update

parent c4b3a28a
#!/bin/bash
# Handy Extract Program
# Modified Lev Lafayette 20201006 for order and LZMA2 xz files
# Extracts the archive given as $1 based on its file extension.
# Fixes: compound patterns (*.tar.gz, *.tar.bz2, *.tar.xz) now appear
# BEFORE their shorter suffixes (*.gz, *.bz2) -- case takes the first
# match, so the original gunzipped foo.tar.gz instead of untarring it.
# Duplicate unreachable arms removed; "$1" quoted throughout.
if [[ -f "$1" ]]; then
case "$1" in
*.tar.bz2) tar xvjf "$1" ;;
*.tar.gz) tar xvzf "$1" ;;
*.tar.xz) tar xvf "$1" ;;
*.tar) tar xvf "$1" ;;
*.tbz2) tar xvjf "$1" ;;
*.tgz) tar xvzf "$1" ;;
*.bz2) bunzip2 "$1" ;;
*.gz) gunzip "$1" ;;
*.rar) unrar x "$1" ;;
*.zip) unzip "$1" ;;
*.Z) uncompress "$1" ;;
*.7z) 7z x "$1" ;;
......
To run comsol in GUI:
start FastX
load the module
run command:
comsol -3drend sw
......@@ -30,9 +30,7 @@ Often, code executed on the CPU is referred to as host code, and code running on
To run a sample CUDA job start with interactive job.
sinteractive --partition=gpgputest -A hpcadmingpgpu --gres=gpu:p100:4
Change "hpcadmingpgpu" to another gpgpu project.
sinteractive --partition=gpgpu --gres=gpu:p100:4
Load a CUDA module
......
......@@ -20,9 +20,7 @@ sinteractive --x11=first --partition=shortgpgpu --gres=gpu:p100:1
sinteractive --x11=first --partition=deeplearn --qos=gpgpudeeplearn --gres=gpu:v100:1
sinteractive --partition=gpgpu --account=hpcadmingpgpu --gres=gpu:2
# (Change hpcadmingpgpu to another gpgpu-enabled account)
sinteractive --partition=gpgpu --gres=gpu:2
# If the user is not using a Linux local machine they will need to install an X-windows client, such as Xming for MS-Windows or X11 on Mac OSX from the XQuartz project.
......
#!/bin/bash
#SBATCH --account=hpcadmingpgpu # Use a project ID that has access.
#SBATCH --partition=gpgputest
#SBATCH --account=hpcadmin # Use a project ID that has access.
#SBATCH --partition=gpgpu
#SBATCH --gres=gpu:2
#SBATCH --time=0:10:00
#SBATCH --ntasks=2
......
......@@ -8,7 +8,7 @@
#SBATCH --job-name namdgpu
# Which partition
#SBATCH --partition=gpgpu-test
#SBATCH --partition=gpgpu
# How many cores ?
#SBATCH --nodes=1
......
#!/bin/bash
# This is a sample job template for Gadi.
# Change the project to your project
#PBS -P vp61
# Standard Queue
#PBS -q normal
#PBS -l walltime=0:10:00
# PBS -l mem=5GB
# This is for local compute disk.
# PBS -l jobfs=1GB
# Merge standard error into standard output (one .o file per job).
#PBS -j oe
#PBS -l ncpus=2
# Change to working directory
#PBS -l wd
module load openmpi/4.0.2
# Launch the MPI hello-world binary across the allocated cpus.
mpiexec ./mpi-helloworld
# Basic Scheduler Commands
To submit a job use `qsub $JobName`. Job status can be determined by `qstat $JobID`, `qstat -s $JobID` or `qstat -u $Username`, or `qdel $JobID` to delete a job. To review a job's details use `qstat -f $JobID`.
Standard output and error streams are collected by PBSPro and saved in `<Jobname>.o<Jobid>` for standard output and `<Jobname>.e<Jobid>` for standard error.
To put a user hold on a job use `qhold $JobID`, and `qrls -h u $JobID` to release.
A job can be terminated and relaunched with `qrerun $JobID`
A job selection can be shown e.g., `qselect -u $username -l ncpus.gt. $number`
# PBS Directives
`#PBS -N job_name` for job name
`#PBS -j oe` or `eo` to combine output and error files; also `-e directory` or `-o $directory` for specific locations.
`#PBS -m abe` for mail when job aborts, begins, ends. Combine with `-M $email` directive.
# Gadi Directives
Jobs must explicitly declare the file systems accessed. Files in `/scratch/$project` and `/g/data/$project` directories must include the directive `-lstorage=scratch/$project+gdata/$project`.
# Example Scripts
# About NCI and Gadi
National Computational Infrastructure (NCI) is Australia's peak facility for computational research and is located at ANU, Canberra.
Main HPC system is Gadi, Australia's peak research supercomputer; 9 PetaFLOP peak compute performance, 15 PFs theoretical. Number 24 in the Top 500 in June 2020.
# Getting An Account
Getting an account on Gadi is not as easy as Spartan.
There are merit allocations schemes, collaborator schemes, a start-up scheme for new users and an industry access scheme.
Register for an account or new project at the MyNCI portal. `https://my.nci.org.au/`
The NCI Flagship Allocation Scheme provides for projects identified by the NCI Board as being of high-impact or national strategic
Main access through National Computational Merit Allocation Scheme (NCMAS). `https://ncmas.nci.org.au`. Includes NCI (Gadi), Pawsey Centre (Magnus), Monash (MASSIVE), and UQ (FlashLite).
NCI Start-up Scheme, much smaller compute quota, used primarily for evaluation. Follow the 'propose a project' link on MyNCI portal to submit a start-up proposal.
# Accessing Gadi
The hostname for Gadi is gadi.nci.org.au. As with similar systems, logins are via SSH. The command `ssh username@gadi.nci.org.au` will put the user on one of the login nodes; use -Y for X-Windows forwarding.
Do consider using an SSH config and/or passwordless SSH, it will make things a lot easier
# Shell Environment
Gadi login configuration is located at `.config/gadi-login.conf`
This can be used to change the default project and the CLI shell that Gadi initiates, which is bash by default (e.g., `SHELL /bin/tcsh`). If you try to use a shell not registered it will default to bash.
# Modules and Scheduler
Gadi has software under a TCL environment modules scheme.
Gadi uses PBSPro for workload management, not Slurm. See PBS_Commands file.
There are sample job submission scripts in this directory for multicore, multinode, job dependencies, and job arrays on Gadi.
#!/bin/bash
# Example script to submit two jobs in a dependency
# Directives include; `after`, `afterok`, `afternotok`, `afterany`, `before`, `beforeok`, `beforenotok`, `beforeany`
# Submit the first job and capture qsub's output (which ends in the job ID).
FIRST=$(qsub job1-1.pbs)
echo "$FIRST"
# The job ID is the last whitespace-separated word; plain parameter
# expansion replaces the original's useless echo-in-a-subshell.
SUB1=${FIRST##* }
# Submit the second job, held until the first finishes with any status.
SECOND=$(qsub -W "depend=afterany:${SUB1}" job1-2.pbs)
echo "$SECOND"
#!/bin/bash
# NCI is not fond of job arrays and they are restricted on Gadi.
# Create the equivalent through multiple jobs using a heredoc.
# Put this in its own directory; run the herescript
# Then submit with
# for item in {1..5}; do qsub helloworld-${item}.pbs; done
# Generates helloworld-1.pbs .. helloworld-5.pbs.  The unquoted EOF
# delimiter lets ${item} expand inside the heredoc body; `<<-` strips
# leading tabs (not spaces) from the body and the delimiter.
for item in {1..5}
do
cat <<- EOF > helloworld-${item}.pbs
#!/bin/bash
#PBS -P vp61
#PBS -q normal
#PBS -l walltime=0:10:00
#PBS -j oe
#PBS -l wd
#PBS -l ncpus=2
mpiexec mpihelloworld-${item}
EOF
done
#!/bin/bash
# MPI hello-world job for Gadi (PBS Pro).
#PBS -N HelloWorld
#PBS -P vp61
#PBS -q normal
#PBS -l walltime=0:10:00
# PBS -l mem=5GB
# PBS -l jobfs=1GB
#PBS -j oe
#PBS -l ncpus=2
#PBS -l wd
module load openmpi/4.0.2
mpiexec ./mpi-helloworld
# Keep the job alive briefly so it can be observed in the queue.
sleep 120
#!/bin/bash
# MPI hello-world job for Gadi (PBS Pro).
#PBS -N HelloWorld
#PBS -P vp61
#PBS -q normal
#PBS -l walltime=0:10:00
# PBS -l mem=5GB
# PBS -l jobfs=1GB
#PBS -j oe
#PBS -l ncpus=2
#PBS -l wd
module load openmpi/4.0.2
mpiexec ./mpi-helloworld
# Keep the job alive briefly so it can be observed in the queue.
sleep 120
#!/bin/bash
# MPI hello-world job for Gadi (PBS Pro).
#PBS -N HelloWorld
#PBS -P vp61
#PBS -q normal
#PBS -l walltime=0:10:00
# PBS -l mem=5GB
# PBS -l jobfs=1GB
#PBS -j oe
#PBS -l ncpus=2
#PBS -l wd
module load openmpi/4.0.2
mpiexec ./mpi-helloworld
#!/bin/bash
# MPI hello-world job for Gadi (PBS Pro).
#PBS -N HelloWorld
#PBS -P vp61
#PBS -q normal
#PBS -l walltime=0:10:00
# PBS -l mem=5GB
# PBS -l jobfs=1GB
#PBS -j oe
#PBS -l ncpus=2
#PBS -l wd
module load openmpi/4.0.2
mpiexec ./mpi-helloworld
#!/bin/bash
#PBS -q normal
#PBS -l walltime=00:30:00,ncpus=4,mem=8GB
#PBS -l jobfs=100GB
# Stage input data onto fast node-local jobfs storage, run the program
# there, then copy the results back to project storage.
INPUT_DIR="${PBS_O_WORKDIR}"
# NOTE(review): $projectid is not set anywhere in this script -- it is a
# placeholder; substitute the real project ID before use.
OUTPUT_DIR="/g/data/$projectid"
cp -r "${INPUT_DIR}" "${PBS_JOBFS}/mydata"
# Abort rather than run (and tar) the wrong directory if cd fails.
cd "${PBS_JOBFS}/mydata" || exit 1
myprogramme
# Bundle everything the job produced and copy it back to project storage.
tar -cf "${PBS_JOBID}.tar" .
cp "${PBS_JOBID}.tar" "$OUTPUT_DIR"
#include <stdio.h>
#include "mpi.h"
/*
 * MPI hello world: each process reports its rank out of the world size.
 * Fix: the original used obsolescent K&R-style parameter declarations
 * ("int main( argc, argv ) int argc; char **argv;"), replaced with an
 * ANSI prototype (K&R definitions are removed in C23).
 */
int main(int argc, char **argv)
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("Hello world from process %d of %d\n", rank, size);
    MPI_Finalize();
    return 0;
}
# Build cg.x (conjugate-gradient demo) with the PGI C++ compiler.
CXX=pgc++
# -fast: aggressive optimisation; -Minfo emits optimisation/intensity/ccff reports.
CXXFLAGS=-fast -Minfo=all,intensity,ccff
LDFLAGS=${CXXFLAGS}
cg.x: main.o
${CXX} $^ -o $@ ${LDFLAGS}
# main.o is rebuilt whenever main.cpp or any of the headers change.
main.o: main.cpp matrix.h matrix_functions.h vector.h vector_functions.h
.SUFFIXES: .o .cpp .h
.PHONY: clean
clean:
rm -Rf cg.x pgprof* *.o core
/*
* Copyright 2016 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstdlib>
#include <cstdio>
#include <omp.h>
#include "vector.h"
#include "vector_functions.h"
#include "matrix.h"
#include "matrix_functions.h"
#define N 200
#define MAX_ITERS 100
#define TOL 1e-12
// Unpreconditioned conjugate-gradient solve of A*x = b for the 27-point
// Poisson test matrix, timed with omp_get_wtime().
// Fix: vector b was allocated but never freed (memory leak).
int main() {
  vector x, b;
  vector r, p, Ap;
  matrix A;
  double one = 1.0, zero = 0.0;
  double normr, rtrans, oldtrans, p_ap_dot, alpha, beta;
  int iter = 0;

  // Build the test matrix and the work vectors.
  allocate_3d_poisson_matrix(A, N);
  printf("Rows: %d, nnz: %d\n", A.num_rows, A.row_offsets[A.num_rows]);
  allocate_vector(x, A.num_rows);
  allocate_vector(Ap, A.num_rows);
  allocate_vector(r, A.num_rows);
  allocate_vector(p, A.num_rows);
  allocate_vector(b, A.num_rows);
  initialize_vector(x, 100000);
  initialize_vector(b, 1);

  // Initial residual r = b - A*x; p starts as a copy of x.
  waxpby(one, x, zero, x, p);
  matvec(A, p, Ap);
  waxpby(one, b, -one, Ap, r);
  rtrans = dot(r, r);
  normr = sqrt(rtrans);

  double st = omp_get_wtime();
  // Standard CG iteration: update search direction, step, residual.
  do {
    if (iter == 0) {
      waxpby(one, r, zero, r, p);
    } else {
      oldtrans = rtrans;
      rtrans = dot(r, r);
      beta = rtrans / oldtrans;
      waxpby(one, r, beta, p, p);
    }
    normr = sqrt(rtrans);
    matvec(A, p, Ap);
    p_ap_dot = dot(Ap, p);
    alpha = rtrans / p_ap_dot;
    waxpby(one, x, alpha, p, x);
    waxpby(one, r, -alpha, Ap, r);
    if (iter % 10 == 0)
      printf("Iteration: %d, Tolerance: %.4e\n", iter, normr);
    iter++;
  } while (iter < MAX_ITERS && normr > TOL);
  double et = omp_get_wtime();
  printf("Total Iterations: %d Total Time: %lfs\n", iter, (et - st));

  // Release all allocations, including b, which the original leaked.
  free_vector(x);
  free_vector(b);
  free_vector(r);
  free_vector(p);
  free_vector(Ap);
  free_matrix(A);
  return 0;
}
/*
* Copyright 2016 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include<cstdlib>
// Sparse matrix in CSR (compressed sparse row) format.
struct matrix {
unsigned int num_rows; // number of rows
unsigned int nnz; // number of stored (non-zero) entries
unsigned int *row_offsets; // row i occupies [row_offsets[i], row_offsets[i+1])
unsigned int *cols; // column index of each stored entry
double *coefs; // value of each stored entry
};
// Builds a CSR matrix for a 27-point stencil on an (N+1)^3-node grid:
// centre coefficient 27, each of the 26 neighbours -1.  Neighbours whose
// linearised index falls outside [0, num_rows) are simply dropped.
// NOTE(review): num_rows is (N+1)^3 but the strides use N (zstride=N*N,
// ystride=N), so offsets do not line up with an (N+1)-wide grid.  This
// matches the upstream NVIDIA sample, but confirm it is intended.
// NOTE(review): the malloc results are not checked for NULL.
void allocate_3d_poisson_matrix(matrix &A, int N) {
int num_rows=(N+1)*(N+1)*(N+1);
int nnz=27*num_rows;
A.num_rows=num_rows;
A.row_offsets=(unsigned int*)malloc((num_rows+1)*sizeof(unsigned int));
A.cols=(unsigned int*)malloc(nnz*sizeof(unsigned int));
A.coefs=(double*)malloc(nnz*sizeof(double));
// Precompute the 27 linearised neighbour offsets and their coefficients.
int offsets[27];
double coefs[27];
int zstride=N*N;
int ystride=N;
int i=0;
for(int z=-1;z<=1;z++) {
for(int y=-1;y<=1;y++) {
for(int x=-1;x<=1;x++) {
offsets[i]=zstride*z+ystride*y+x;
if(x==0 && y==0 && z==0)
coefs[i]=27;
else
coefs[i]=-1;
i++;
}
}
}
// Fill the CSR arrays row by row, keeping only in-range neighbours;
// nnz is recomputed as the true stored-entry count.
nnz=0;
for(int i=0;i<num_rows;i++) {
A.row_offsets[i]=nnz;
for(int j=0;j<27;j++) {
int n=i+offsets[j];
if(n>=0 && n<num_rows) {
A.cols[nnz]=n;
A.coefs[nnz]=coefs[j];
nnz++;
}
}
}
A.row_offsets[num_rows]=nnz;
A.nnz=nnz;
}
// Release the three heap arrays owned by a CSR matrix.
void free_matrix(matrix &A) {
  free(A.row_offsets);
  free(A.cols);
  free(A.coefs);
}
/*
* Copyright 2016 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "vector.h"
#include "matrix.h"
// y = A*x for a CSR matrix A; y must be pre-allocated with A.num_rows
// entries.  Each row's value is the dot product of its stored entries
// with the corresponding entries of x.
void matvec(const matrix& A, const vector& x, const vector &y) {
unsigned int num_rows=A.num_rows;
unsigned int *row_offsets=A.row_offsets;
unsigned int *cols=A.cols;
double *Acoefs=A.coefs;
double *xcoefs=x.coefs;
double *ycoefs=y.coefs;
for(int i=0;i<num_rows;i++) {
double sum=0;
// Row i's entries live in [row_offsets[i], row_offsets[i+1]).
int row_start=row_offsets[i];
int row_end=row_offsets[i+1];
for(int j=row_start;j<row_end;j++) {
unsigned int Acol=cols[j];
double Acoef=Acoefs[j];
double xcoef=xcoefs[Acol];
sum+=Acoef*xcoef;
}
ycoefs[i]=sum;
}
}
/*
* Copyright 2016 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include<cmath>
// Dense vector: n coefficients stored contiguously on the heap.
struct vector {
unsigned int n; // number of entries
double *coefs; // heap array of length n
};
// Allocate storage for an n-entry vector (contents left uninitialised).
// NOTE(review): the malloc result is not checked for NULL.
void allocate_vector(vector &v, unsigned int n) {
v.n=n;
v.coefs=(double*)malloc(n*sizeof(double));
}
// Release the coefficient array; v.n is left unchanged.
void free_vector(vector &v) {
free(v.coefs);
}
// Set every entry of v to the same value.
void initialize_vector(vector &v, double val) {
  for (unsigned int idx = 0; idx < v.n; idx++) {
    v.coefs[idx] = val;
  }
}
/*
* Copyright 2016 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include<cstdlib>
#include "vector.h"
// Inner product of x and y over the first x.n entries.
double dot(const vector& x, const vector& y) {
  const unsigned int len = x.n;
  const double *xv = x.coefs;
  const double *yv = y.coefs;
  double acc = 0;
  for (unsigned int k = 0; k < len; k++)
    acc += xv[k] * yv[k];
  return acc;
}
// Elementwise w = alpha*x + beta*y; w must be allocated with x.n entries.
void waxpby(double alpha, const vector &x, double beta, const vector &y, const vector& w) {
  const unsigned int len = x.n;
  for (unsigned int k = 0; k < len; k++) {
    w.coefs[k] = alpha * x.coefs[k] + beta * y.coefs[k];
  }
}
......@@ -15,13 +15,27 @@ main()
Examples exercises and solutions from Pawsey Supercomputing Centre.
1. Start an interactive job. Use a project ID that has gpgpu access.
`sinteractive --partition=gpgputest -A hpcadmingpgpu --gres=gpu:p100:4`
$ sinteractive --partition=gpgpu -A hpcadmin --gres=gpu:p100:4
$ cd ~ ; cp -r /usr/local/common/OpenACC .
$ source /usr/local/module/spartan_old.sh
$ module load PGI/19.10-GCC-8.3.0-2.32
2. Start with the serial code
cd ~/OpenACC/Exercise/exe1
module load PGI/19.10-GCC-8.3.0-2.32
make
time ./heat_eq_serial
2. The Importance of Profiling
Example here from Compute Canada
$ cd ~/OpenACC/Profile
$ make
Check profile information.
3. Run serial code
Examples here from Pawsey Supercomputing Centre.
$ cd ~/OpenACC/Exercise/exe1
$ make
$ time ./heat_eq_serial
The output should be something like:
......@@ -32,7 +46,7 @@ real 0m5.062s
user 0m5.051s
sys 0m0.008s
3. Identify parallel blocks.
4. Identify parallel blocks.
PGI has inbuilt profiling tools. Nice!
......
......@@ -17,8 +17,8 @@ $ module load gcc/8.3.0
$ export OMP_NUM_THREADS=8
# Compile with OpenMP directives. These examples use free-form for Fortran e.g.,
$ gcc -fopenmp helloomp.c -o helloompc
$ gfortran -fopenmp helloomp.f90 -o helloompf
$ gcc -fopenmp helloomp1.c -o helloompc
$ gfortran -fopenmp helloomp1.f90 -o helloompf
# Execute the programs
......
# Don't do this on the head node.
# Many of these examples are from Lev Lafayette, Sequential and Parallel Programming with C and Fortran, VPAC, 2015-2016, ISBN 978-0-9943373-1-3, https://github.com/VPAC/seqpar
$ sinteractive --time=6:00:00 --ntasks=1 --cpus-per-task=8
# 2015 modules system ..
$ module purge
$ source /usr/local/module/spartan_old.sh
$ module load GCC/4.9.2
# .. or 2019 modules system
$ module purge
$ module load spartan_2019
$ module load gcc/8.3.0
# Export with the number of threads desired. Note that it is most efficient to have a number of cpus equal to the number of threads.
$ export OMP_NUM_THREADS=8
# Compile with OpenMP directives. These examples use free-form for Fortran e.g.,
$ gcc -fopenmp helloomp.c -o helloompc
$ gfortran -fopenmp helloomp.f90 -o helloompf
# Execute the programs
$ ./helloompc
$ ./helloompf
# Note that creating executables with different compilers requires a different compiler command OpenMP flag. For example:
$ module load intel/2017.u2
$ icc -qopenmp helloomp.c -o hellompc
$ ifort -qopenmp helloomp.f90 -o hellompf
$ ./helloompc
$ ./helloompf
$ module load PGI/18.5
$ pgcc -mp helloomp.c -o hellompc
$ pgf90 -mp helloomp.f90 -o hellompf
$ ./helloompc
$ ./helloompf
# Parallel regions can call functions within them with parallel regions. By default, these have 1 thread unless an environment variable is set.
# This example from Oracle's Sun Studio 12: OpenMP API User's Guide
$ gcc -fopenmp nested.c -o nestedc
$ export OMP_NESTED=true
$ ./nestedc
$ export OMP_NESTED=false
$ ./nestedc
# The same variable name can have different values with the parallel section and outside it.
$ gcc -fopenmp sharedhello.c -o sharedhelloompc
$ gfortran -fopenmp sharedhello.f90 -o sharedhelloompf
$ ./sharedhelloompc
$ ./sharedhelloompf
# One of the most typical applications is the parallelisation of loops. This includes a worksharing construct, which distributes the execution of the parallel region among the thread team members. There is an implicit barrier at the end of a loop construct, unless a `nowait` clause has been stated. Loop iteration variables are private by default.
# Note that this example makes use of "parallel for" and "parallel do". In most cases they are mostly equivalent; parallel spawns a group of threads, while the for/do divides loop iterations between the spawned threads.
$ gcc -fopenmp hello1millomp.c -o hello1millc
$ gfortran -fopenmp hello1millomp.f90 -o hello1millf
$ ./hello1millc
$ ./hello1millf
# Sometimes separating them is a good idea for "thread aware" constructions. e.g.,
#pragma omp parallel
{
#pragma omp for
for(1...10) // first parallel block
{
}
#pragma omp single
{} // single thread processing
#pragma omp for // second parallel block
for(1...10)
{
}
#pragma omp single
{} // make some single thread processing again
}
# There is also the simd directive; this allows loop iterations to be executed on SIMD lanes that are available to the thread.
# OpenMP was once used only to exploit multiple threads across multiple cores; the newer simd extension allows use of SIMD instructions on modern CPUs, such as Intel's AVX/SSE and ARM's NEON etc.
# On Spartan, the AVX-512 instructions are on all of the physical nodes, phi nodes, and bm[053-066].
# Use sbatch --constraint=avx512 to run specifically on the bm nodes with this.
$ gfortran -fopenmp hello1millsimd.f90 -o hello1millsimdf
$ gcc -fopenmp hello1millsimd.c -o hello1millsimdc
$ ./hello1millsimdf
$ ./hello1millsimdc
# The sections construct distributes threads among structured blocks. Note the threadids
gfortran -fopenmp hello3versomp.f90 -o hello3versompf
gcc -fopenmp hello3versomp.c -o hello3versompc
$ ./hello3versompf
$ ./hello3versompc
# The `task` constructs are very useful to most efficiently implement parallelism. The general principle is that a thread generates tasks which are then executed according to the runtime system, either immediately or delayed.
$ gfortran -fopenmp colourless-3.f90 -o colourless-3f
$ gcc -fopenmp colourless-3.c -o colourless-3c
$ ./colourless-3f
$ ./colourless-3c
# Internal control variables and their interactions with runtime library routines are illustrated by the examples icv1.f90 and icv1.c.
# Four ICV's - nest-var, mex-active-levels-var, dyn-var, and nthreads-var - are modified by calls their respective library routines (omp_set_nested(), omp_set_max_active_levels(), omp_set_dynamic(), and omp_set_num_threads()).
$ gcc -fopenmp icv1.c -o icv1c
$ gfortran -fopenmp icv1.f90 -o icv1f
$ ./icv1c
$ ./icv1f
# When submitting OpenMP jobs to the cluster don't forget to include the environment variables in the job script!
# See: hello3vers.slurm
#include <stdio.h>
#include <stdlib.h>
#include "omp.h"
int main (void)
{
#pragma omp parallel
{
/* single: exactly one thread of the team executes this block, so the
 * four words always print once, in order. */
#pragma omp single
{
printf("Colourless ");
printf("green ");
printf("ideas ");
printf("sleep furiously ");
}
}
printf("\n");
return(0);
}
! Prints the sentence exactly once: "single" restricts the block to one
! thread of the parallel team.
program colourless
include "omp_lib.h"
!$omp parallel
!$omp single
print *, "Colourless "
print *, "green "
print *, "ideas "
print *, "sleep furiously "
!$omp end single
!$omp end parallel
end program colourless
......@@ -6,7 +6,7 @@ int main (void)
{
#pragma omp parallel
{
#pragma omp single
#pragma nowait
{
printf("Noam Chomsky said ");
#pragma omp task
......
#include <stdio.h>
#include <stdlib.h>
#include "omp.h"
int main (void)
{
/* Printed once, before the parallel region starts. */
printf("Noam Chomsky said ");
/* Fix: the original wrote "#pragma parallel" (missing "omp"), which
 * compilers silently ignore -- no parallel region was ever created. */
#pragma omp parallel
{
/* Each section may be executed by a different thread; nowait removes
 * the implied barrier at the end of the sections construct. */
#pragma omp sections nowait
{
#pragma omp section
printf("Colourless ");
#pragma omp section
printf("green ");
#pragma omp section
printf("ideas ");
#pragma omp section
printf("sleep furiously ");
}
}
printf("\n");
return(0);
}
#include <stdio.h>
#include <stdlib.h>
#include "omp.h"
int main (int argc, char *argv[])
{
#pragma omp parallel
{
#pragma omp single
{
printf("Colourless ");
#pragma omp task
{
printf("green ");
}
#pragma omp task
{
printf("ideas ");
}
#pragma omp taskwait
printf("sleep furiously ");
}
}
printf("\n");
return(0);
}
#include <stdio.h>
#include <omp.h>
#define N 1000
#define CHUNKSIZE 100
// Vector-add program
// Arrays A, B, C, and variable N will be shared by all threads.
// Variable I will be private to each thread; each thread will have its own unique copy.
// The iterations of the loop will be distributed dynamically in CHUNK sized pieces.
// Threads will not synchronize upon completing their individual pieces of work (NOWAIT).
/* Fix: the original declared main with a pre-C99 implicit int return
 * type, which is invalid in C99 and later. */
int main(int argc, char *argv[]) {
int i, chunk;
float a[N], b[N], c[N];
/* Some initializations */
for (i=0; i < N; i++)
a[i] = b[i] = i * 1.0;
chunk = CHUNKSIZE;
/* a, b, c and chunk are shared; each thread has a private i. */
#pragma omp parallel shared(a,b,c,chunk) private(i)
{
/* Dynamic schedule in chunk-sized pieces; nowait drops the implied
 * barrier at the end of the loop. */
#pragma omp for schedule(dynamic,chunk) nowait
for (i=0; i < N; i++)
c[i] = a[i] + b[i];
} /* end of parallel region */
return 0;
}
PROGRAM VEC_ADD_DO
! Vector-add program
! Arrays A, B, C, and variable N will be shared by all threads.
! Variable I will be private to each thread; each thread will have its own unique copy.
! The iterations of the loop will be distributed dynamically in CHUNK sized pieces.
! Threads will not synchronize upon completing their individual pieces of work (NOWAIT).
INTEGER N, CHUNKSIZE, CHUNK, I
PARAMETER (N=1000)
PARAMETER (CHUNKSIZE=100)
REAL A(N), B(N), C(N)
! Some initializations
DO I = 1, N
A(I) = I * 1.0
B(I) = A(I)
ENDDO
CHUNK = CHUNKSIZE
!$OMP PARALLEL SHARED(A,B,C,CHUNK) PRIVATE(I)
!$OMP DO SCHEDULE(DYNAMIC,CHUNK)
DO I = 1, N
C(I) = A(I) + B(I)
ENDDO
! NOWAIT: threads skip the barrier implied at the end of the DO construct.
!$OMP END DO NOWAIT
!$OMP END PARALLEL
END
#include <stdio.h>
int main(void)
{
    char greetings[] = "Hello world!";
    int count;

    /* Print the greeting one million times, serially. */
    for (count = 0; count < 1000000; ++count) {
        printf("%s\n", greetings);
    }
    return 0;
}
#include <stdio.h>
#include "omp.h"
int main(void)
{
char greetings[] = "Hello world!";
int a;
/* The million iterations are divided among the team's threads (the
 * loop variable of an omp for is private by default), so line order
 * is not deterministic. */
#pragma omp parallel
{
#pragma omp for
for ( a = 0; a < 1000000; a = a + 1 )
{
printf("%s\n", greetings);
}
}
return 0;
}
#include <stdio.h>
#include "omp.h"
int main(void)
{
int id;
/* Four threads; each prints its own thread number.  The outer "id" is
 * declared private, and the inner declaration shadows it anyway. */
#pragma omp parallel num_threads(4) private(id)
{
int id = omp_get_thread_num();
printf("Hello world %d\n", id);
}
return 0;
}
! Each of 17 threads prints its own thread number (order is arbitrary).
program hello2omp
include "omp_lib.h"
integer :: id
!$omp parallel num_threads(17) private(id)
id = omp_get_thread_num()
print *, "Hello world", id
!$omp end parallel
end program hello2omp
#include <stdio.h>
int main(void)
{
char greetings[] = "Hello world!";
int a;
/* "parallel" without a "for" worksharing clause: every thread executes
 * the entire 10-iteration loop, so the greeting prints once per
 * iteration per thread.
 * NOTE(review): "a" has no private clause here, so the threads race on
 * the shared loop counter -- iteration counts are not deterministic. */
#pragma omp parallel
for ( a = 1; a < 11; a = a + 1 )
{
printf("%s\n", greetings);
}
return 0;
}
! Demonstrates private(): each thread gets its own copy of "greetings"
! inside the region, so the assignment there does not affect the value
! printed after the region.
program SharedHello
implicit none
character(len=16) :: greetings
greetings = "Hello World!"
print *, "Before parallel section: ", greetings
!$omp parallel private(greetings)
greetings = "Saluton mondo!"
print *, "Inside parallel section: ", greetings
!$omp end parallel
print *, "After parallel section: ", greetings
end program SharedHello
#include <stdio.h>
#include <omp.h>
int main(void)
{
......
#include <omp.h>
#include <stdio.h>
#include <omp.h>
main(int argc, char *argv[]) {
int nthreads, threadid;
/* Fork a team of threads with each thread having a private tid variable */
#pragma omp parallel private(tid)
#pragma omp parallel private(threadid)
{
/* Obtain and print thread id */
......@@ -13,7 +14,7 @@
printf("Hello World from thread = %d\n", threadid);
/* Only master thread does this */
if (tid == 0)
if (threadid == 0)
{
nthreads = omp_get_num_threads();
printf("Number of threads = %d\n", nthreads);
......
PROGRAM HELLO
INTEGER NTHREADS, TID, OMP_GET_NUM_THREADS, OMP_GET_THREAD_NUM
INTEGER NTHREADS, THREADID, OMP_GET_NUM_THREADS, OMP_GET_THREAD_NUM
! Fork a team of threads with each thread having a private TID variable
!$OMP PARALLEL PRIVATE(THREADID)
! Obtain and print thread id
TID = OMP_GET_THREAD_NUM()
THREADID = OMP_GET_THREAD_NUM()
PRINT *, 'Hello World from thread = ', THREADID
! Only master thread does this
......
! @@name: icv.1f
! @@type: F-fixed
! @@compilable: yes
! @@linkable: yes
! @@expect: success
! Demonstrates how the nest-var, max-active-levels-var, dyn-var and
! nthreads-var internal control variables (ICVs) interact with calls to
! omp_set_nested, omp_set_max_active_levels, omp_set_dynamic and
! omp_set_num_threads at different nesting levels.
program icv
use omp_lib
call omp_set_nested(.true.)
call omp_set_max_active_levels(8)
call omp_set_dynamic(.false.)
call omp_set_num_threads(2)
!$omp parallel
call omp_set_num_threads(3)
!$omp parallel
call omp_set_num_threads(4)
!$omp single
! The following should print:
! Inner: max_act_lev= 8 , num_thds= 3 , max_thds= 4
! Inner: max_act_lev= 8 , num_thds= 3 , max_thds= 4
print *, "Inner: max_act_lev=", omp_get_max_active_levels(),
& ", num_thds=", omp_get_num_threads(),
& ", max_thds=", omp_get_max_threads()
!$omp end single
!$omp end parallel
!$omp barrier
!$omp single
! The following should print:
! Outer: max_act_lev= 8 , num_thds= 2 , max_thds= 3
print *, "Outer: max_act_lev=", omp_get_max_active_levels(),
& ", num_thds=", omp_get_num_threads(),
& ", max_thds=", omp_get_max_threads()
!$omp end single
!$omp end parallel
end
#include <omp.h>
#define CHUNKSIZE 100
#define N 1000
/* Vector add with a dynamic-schedule worksharing loop.
 * Fix: the original used a pre-C99 implicit-int "main ()", which is
 * invalid in C99 and later. */
int main ()
{
int i, chunk;
float a[N], b[N], c[N];
/* Some initializations */
for (i=0; i < N; i++)
a[i] = b[i] = i * 1.0;
chunk = CHUNKSIZE;
/* a, b, c and chunk shared; each thread has a private i. */
#pragma omp parallel shared(a,b,c,chunk) private(i)
{
#pragma omp for schedule(dynamic,chunk) nowait
for (i=0; i < N; i++)
c[i] = a[i] + b[i];
} /* end of parallel section */
return 0;
}
! Vector add with NOWAIT on the worksharing loop.
PROGRAM VEC_ADD_DO
INTEGER N, CHUNKSIZE, CHUNK, I
PARAMETER (N=1000)
PARAMETER (CHUNKSIZE=100)
REAL A(N), B(N), C(N)
! Some initializations
DO I = 1, N
A(I) = I * 1.0
B(I) = A(I)
ENDDO
CHUNK = CHUNKSIZE
!$OMP PARALLEL SHARED(A,B,C,CHUNK) PRIVATE(I)
!$OMP DO SCHEDULE(DYNAMIC,CHUNK)
DO I = 1, N
C(I) = A(I) + B(I)
ENDDO
!$OMP END DO NOWAIT
! NOTE(review): after the loop each thread's private I is one past the
! bound of its last chunk, so C(I) can read past the end of C (e.g.
! C(N+1)); every thread also executes this PRINT.  Looks unintended --
! confirm before reuse.
print *, C(I)
!$OMP END PARALLEL
END
/******************************************************************************
* FILE: omp_mm.c
* DESCRIPTION:
* OpenMp Example - Matrix Multiply - C Version
* Demonstrates a matrix multiply using OpenMP. Threads share row iterations
* according to a predefined chunk size.
* AUTHOR: Blaise Barney
* LAST REVISED: 06/28/05
******************************************************************************/
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#define NRA 62 /* number of rows in matrix A */
#define NCA 15 /* number of columns in matrix A */
#define NCB 7 /* number of columns in matrix B */
int main (int argc, char *argv[])
{
int tid, nthreads, i, j, k, chunk;
double a[NRA][NCA], /* matrix A to be multiplied */
b[NCA][NCB], /* matrix B to be multiplied */
c[NRA][NCB]; /* result matrix C */
chunk = 10; /* set loop iteration chunk size */
/*** Spawn a parallel region explicitly scoping all variables ***/
#pragma omp parallel shared(a,b,c,nthreads,chunk) private(tid,i,j,k)
{
tid = omp_get_thread_num();
/* Only thread 0 records and reports the team size. */
if (tid == 0)
{
nthreads = omp_get_num_threads();
printf("Starting matrix multiple example with %d threads\n",nthreads);
printf("Initializing matrices...\n");
}
/*** Initialize matrices ***/
/* Each omp for below shares its row iterations among the team in
 * static chunks of 10, with an implied barrier at the end. */
#pragma omp for schedule (static, chunk)
for (i=0; i<NRA; i++)
for (j=0; j<NCA; j++)
a[i][j]= i+j;
#pragma omp for schedule (static, chunk)
for (i=0; i<NCA; i++)
for (j=0; j<NCB; j++)
b[i][j]= i*j;
#pragma omp for schedule (static, chunk)
for (i=0; i<NRA; i++)
for (j=0; j<NCB; j++)
c[i][j]= 0;
/*** Do matrix multiply sharing iterations on outer loop ***/
/*** Display who does which iterations for demonstration purposes ***/
printf("Thread %d starting matrix multiply...\n",tid);
#pragma omp for schedule (static, chunk)
for (i=0; i<NRA; i++)
{
printf("Thread=%d did row=%d\n",tid,i);
for(j=0; j<NCB; j++)
for (k=0; k<NCA; k++)
c[i][j] += a[i][k] * b[k][j];
}
} /*** End of parallel region ***/
/*** Print results ***/
printf("******************************************************\n");
printf("Result Matrix:\n");
for (i=0; i<NRA; i++)
{
for (j=0; j<NCB; j++)
printf("%6.2f ", c[i][j]);
printf("\n");
}
printf("******************************************************\n");
printf ("Done.\n");
}
C******************************************************************************
C FILE: omp_mm.f
C DESCRIPTION:
C OpenMp Example - Matrix Multiply - Fortran Version
C Demonstrates a matrix multiply using OpenMP. Threads share row iterations
C according to a predefined chunk size.
C AUTHOR: Blaise Barney
C LAST REVISED: 1/5/04 Blaise Barney
C******************************************************************************
! Matrix multiply C = A*B; each !$OMP DO shares its row iterations among
! the team in static chunks.  The nested DO loops ending on one labelled
! CONTINUE are legacy fixed-form style.
PROGRAM MATMULT
INTEGER NRA, NCA, NCB, TID, NTHREADS, I, J, K, CHUNK,
+ OMP_GET_NUM_THREADS, OMP_GET_THREAD_NUM
C number of rows in matrix A
PARAMETER (NRA=62)
C number of columns in matrix A
PARAMETER (NCA=15)
C number of columns in matrix B
PARAMETER (NCB=7)
REAL*8 A(NRA,NCA), B(NCA,NCB), C(NRA,NCB)
C Set loop iteration chunk size
CHUNK = 10
C Spawn a parallel region explicitly scoping all variables
!$OMP PARALLEL SHARED(A,B,C,NTHREADS,CHUNK) PRIVATE(TID,I,J,K)
TID = OMP_GET_THREAD_NUM()
C Only the master thread records and reports the team size
IF (TID .EQ. 0) THEN
NTHREADS = OMP_GET_NUM_THREADS()
PRINT *, 'Starting matrix multiple example with', NTHREADS,
+ 'threads'
PRINT *, 'Initializing matrices'
END IF
C Initialize matrices
!$OMP DO SCHEDULE(STATIC, CHUNK)
DO 30 I=1, NRA
DO 30 J=1, NCA
A(I,J) = (I-1)+(J-1)
30 CONTINUE
!$OMP DO SCHEDULE(STATIC, CHUNK)
DO 40 I=1, NCA
DO 40 J=1, NCB
B(I,J) = (I-1)*(J-1)
40 CONTINUE
!$OMP DO SCHEDULE(STATIC, CHUNK)
DO 50 I=1, NRA
DO 50 J=1, NCB
C(I,J) = 0
50 CONTINUE
C Do matrix multiply sharing iterations on outer loop
C Display who does which iterations for demonstration purposes
PRINT *, 'Thread', TID, 'starting matrix multiply...'
!$OMP DO SCHEDULE(STATIC, CHUNK)
DO 60 I=1, NRA
PRINT *, 'Thread', TID, 'did row', I
DO 60 J=1, NCB
DO 60 K=1, NCA
C(I,J) = C(I,J) + A(I,K) * B(K,J)
60 CONTINUE
C End of parallel region
!$OMP END PARALLEL
C Print results
PRINT *, '******************************************************'
PRINT *, 'Result Matrix:'
DO 90 I=1, NRA
DO 80 J=1, NCB
WRITE(*,70) C(I,J)
70 FORMAT(2x,f8.2,$)
80 CONTINUE
PRINT *, ' '
90 CONTINUE
PRINT *, '******************************************************'
PRINT *, 'Done.'
END
C******************************************************************************
C FILE: omp_mm.f
C DESCRIPTION:
C OpenMp Example - Matrix Multiply - Fortran Version
C Demonstrates a matrix multiply using OpenMP. Threads share row iterations
C according to a predefined chunk size.
C AUTHOR: Blaise Barney
C LAST REVISED: 1/5/04 Blaise Barney
C******************************************************************************
! Matrix multiply C = A*B (duplicate of the preceding sample); each
! !$OMP DO shares its row iterations among the team in static chunks.
PROGRAM MATMULT
INTEGER NRA, NCA, NCB, TID, NTHREADS, I, J, K, CHUNK,
+ OMP_GET_NUM_THREADS, OMP_GET_THREAD_NUM
C number of rows in matrix A
PARAMETER (NRA=62)
C number of columns in matrix A
PARAMETER (NCA=15)
C number of columns in matrix B
PARAMETER (NCB=7)
REAL*8 A(NRA,NCA), B(NCA,NCB), C(NRA,NCB)
C Set loop iteration chunk size
CHUNK = 10
C Spawn a parallel region explicitly scoping all variables
!$OMP PARALLEL SHARED(A,B,C,NTHREADS,CHUNK) PRIVATE(TID,I,J,K)
TID = OMP_GET_THREAD_NUM()
C Only the master thread records and reports the team size
IF (TID .EQ. 0) THEN
NTHREADS = OMP_GET_NUM_THREADS()
PRINT *, 'Starting matrix multiple example with', NTHREADS,
+ 'threads'
PRINT *, 'Initializing matrices'
END IF
C Initialize matrices
!$OMP DO SCHEDULE(STATIC, CHUNK)
DO 30 I=1, NRA
DO 30 J=1, NCA
A(I,J) = (I-1)+(J-1)
30 CONTINUE
!$OMP DO SCHEDULE(STATIC, CHUNK)
DO 40 I=1, NCA
DO 40 J=1, NCB
B(I,J) = (I-1)*(J-1)
40 CONTINUE
!$OMP DO SCHEDULE(STATIC, CHUNK)
DO 50 I=1, NRA
DO 50 J=1, NCB
C(I,J) = 0
50 CONTINUE
C Do matrix multiply sharing iterations on outer loop
C Display who does which iterations for demonstration purposes
PRINT *, 'Thread', TID, 'starting matrix multiply...'
!$OMP DO SCHEDULE(STATIC, CHUNK)
DO 60 I=1, NRA
PRINT *, 'Thread', TID, 'did row', I
DO 60 J=1, NCB
DO 60 K=1, NCA
C(I,J) = C(I,J) + A(I,K) * B(K,J)
60 CONTINUE
C End of parallel region
!$OMP END PARALLEL
C Print results
PRINT *, '******************************************************'
PRINT *, 'Result Matrix:'
DO 90 I=1, NRA
DO 80 J=1, NCB
WRITE(*,70) C(I,J)
70 FORMAT(2x,f8.2,$)
80 CONTINUE
PRINT *, ' '
90 CONTINUE
PRINT *, '******************************************************'
PRINT *, 'Done.'
END
! Integrand f(a) = 2 / sqrt(1 - a^2); its integral over [0,1) is pi.
FUNCTION f(a)
IMPLICIT NONE
double precision a
double precision f
f = 2.d0 / SQRT(1.d0 - a*a)
END
! ===================================================
! Estimate pi by midpoint-rule integration of f(x) = 2/sqrt(1 - x^2)
! over [0, 1).  Each thread accumulates a private partial sum (mypi);
! the partial sums are combined inside a CRITICAL section.
PROGRAM Compute_PI
IMPLICIT NONE
interface
FUNCTION f(a)
double precision a
double precision f
END FUNCTION
end interface
INTEGER N, i
DOUBLE PRECISION w, x
DOUBLE PRECISION pi, mypi
N = 50000000 !! Number of intervals
w = 1.0d0/N !! width of each interval
pi = 0.0d0 !! BUG FIX: pi was read below without ever being initialized
!$OMP PARALLEL PRIVATE(x, mypi)
mypi = 0.0d0 !! per-thread partial result
!$OMP DO
DO i = 0, N-1 !! Parallel Loop
x = w * (i + 0.5d0) !! midpoint of interval i
mypi = mypi + w*f(x)
END DO
!$OMP END DO
!$OMP CRITICAL
pi = pi + mypi !! serialized accumulation of the partial sums
!$OMP END CRITICAL
!$OMP END PARALLEL
PRINT *, "Pi = ", pi
END PROGRAM
#include <stdio.h>
#include "omp.h"

/*
 * Print every integer in [2, 10000) that is prime, using trial division
 * up to sqrt(i) (the j <= i/j test avoids computing a square root).
 *
 * BUG FIX: in the original, j was declared outside the parallel region
 * and left shared, so every thread raced on the same j and the primality
 * test was wrong under more than one thread.  j must be private.
 */
int main () {
  int i, j;
  #pragma omp parallel for private(j)
  for (i = 2; i < 10000; i++) {        /* i is the loop index: implicitly private */
    for (j = 2; j <= (i / j); j++)
      if (!(i % j)) break;             /* factor found: i is not prime */
    if (j > (i / j))
      printf("%d is prime\n", i);
  }
  return 0;
}
#include <stdio.h>
#include <omp.h>

/*
 * Dot product of two 100-element vectors (a[i] = i, b[i] = 2i) computed
 * with an OpenMP parallel-for and a '+' reduction on result.
 *
 * FIX: gave main an explicit 'int' return type and a return statement;
 * implicit int is invalid in C99 and later.
 */
int main(int argc, char *argv[]) {
  int i, n, chunk;
  float a[100], b[100], result;
  /* Some initializations */
  n = 100;
  chunk = 10;
  result = 0.0;
  for (i = 0; i < n; i++) {
    a[i] = i * 1.0;
    b[i] = i * 2.0;
  }
  /* Each thread gets a private copy of 'result'; the copies are summed
     with '+' when the work-shared loop finishes. */
  #pragma omp parallel for \
    default(shared) private(i) \
    schedule(static,chunk) \
    reduction(+:result)
  for (i = 0; i < n; i++)
    result = result + (a[i] * b[i]);
  printf("Final result= %f\n",result);
  return 0;
}
PROGRAM DOT_PRODUCT
! Dot product of two N-element vectors (A(I) = I, B(I) = 2*I) using an
! OpenMP PARALLEL DO with a '+' reduction on RESULT.
INTEGER N, CHUNKSIZE, CHUNK, I
PARAMETER (N=100)
PARAMETER (CHUNKSIZE=10)
REAL A(N), B(N), RESULT
! Some initializations
DO I = 1, N
A(I) = I * 1.0
B(I) = I * 2.0
ENDDO
RESULT= 0.0
CHUNK = CHUNKSIZE
! The '&' sentinel lines below are continuations of the single
! PARALLEL DO directive.  Each thread accumulates a private copy of
! RESULT; the copies are summed on exit from the construct.
!$OMP PARALLEL DO
!$OMP& DEFAULT(SHARED) PRIVATE(I)
!$OMP& SCHEDULE(STATIC,CHUNK)
!$OMP& REDUCTION(+:RESULT)
DO I = 1, N
RESULT = RESULT + (A(I) * B(I))
ENDDO
!$OMP END PARALLEL DO
PRINT *, 'Final Result= ', RESULT
END
#include <stdio.h>
#include <omp.h>
#define N 1000
/* Vector Add with Sections directive.
 * Section 1 computes the element-wise sum c = a + b; section 2 computes
 * the element-wise product d = a * b.  The two sections may execute on
 * different threads concurrently; 'nowait' removes the barrier at the
 * end of the sections construct.
 *
 * FIX: gave main an explicit 'int' return type and a return statement;
 * implicit int is invalid in C99 and later.
 */
int main(int argc, char *argv[]) {
  int i;
  float a[N], b[N], c[N], d[N];
  /* Some initializations */
  for (i = 0; i < N; i++) {
    a[i] = i * 1.5;
    b[i] = i + 22.35;
  }
  #pragma omp parallel shared(a,b,c,d) private(i)
  {
    #pragma omp sections nowait
    {
      #pragma omp section
      for (i = 0; i < N; i++)
        c[i] = a[i] + b[i];
      #pragma omp section
      for (i = 0; i < N; i++)
        d[i] = a[i] * b[i];
    } /* end of sections */
  } /* end of parallel region */
  return 0;
}
PROGRAM VEC_ADD_SECTIONS
! Simple vector add with sections
! Each SECTION below executes exactly once, by whichever thread reaches
! it: section 1 fills C with the element-wise sum A + B, section 2 fills
! D with the element-wise product A * B.
INTEGER N, I
PARAMETER (N=1000)
REAL A(N), B(N), C(N), D(N)
! Some initializations
DO I = 1, N
A(I) = I * 1.5
B(I) = I + 22.35
ENDDO
!$OMP PARALLEL SHARED(A,B,C,D), PRIVATE(I)
!$OMP SECTIONS
!$OMP SECTION
DO I = 1, N
C(I) = A(I) + B(I)
ENDDO
!$OMP SECTION
DO I = 1, N
D(I) = A(I) * B(I)
ENDDO
! NOWAIT: threads skip the implicit barrier at the end of SECTIONS
!$OMP END SECTIONS NOWAIT
!$OMP END PARALLEL
END
/******************************************************************************
* FILE: omp_workshare2.c
* DESCRIPTION:
*   OpenMP Example - Sections Work-sharing - C Version
*   In this example, the OpenMP SECTION directive is used to assign
*   different array operations to each thread that executes a SECTION.
* AUTHOR: Blaise Barney  5/99
* LAST REVISED: 07/16/07
******************************************************************************/
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#define N 50
int main (int argc, char *argv[])
{
int i, nthreads, tid;
float a[N], b[N], c[N], d[N];
/* Some initializations */
for (i=0; i<N; i++) {
  a[i] = i * 1.5;
  b[i] = i + 22.35;
  c[i] = d[i] = 0.0;
  }
/* nthreads is shared but written only by thread 0, so there is no race */
#pragma omp parallel shared(a,b,c,d,nthreads) private(i,tid)
  {
  tid = omp_get_thread_num();
  if (tid == 0)
    {
    nthreads = omp_get_num_threads();
    printf("Number of threads = %d\n", nthreads);
    }
  printf("Thread %d starting...\n",tid);
  /* Each section executes once, by whichever thread picks it up;
     'nowait' removes the barrier at the end of the sections construct,
     so "done" messages may appear before both sections finish. */
  #pragma omp sections nowait
    {
    #pragma omp section
      {
      printf("Thread %d doing section 1\n",tid);
      for (i=0; i<N; i++)
        {
        c[i] = a[i] + b[i];
        printf("Thread %d: c[%d]= %f\n",tid,i,c[i]);
        }
      }
    #pragma omp section
      {
      printf("Thread %d doing section 2\n",tid);
      for (i=0; i<N; i++)
        {
        d[i] = a[i] * b[i];
        printf("Thread %d: d[%d]= %f\n",tid,i,d[i]);
        }
      }
    }  /* end of sections */
  printf("Thread %d done.\n",tid);
  }  /* end of parallel section */
/* reaching the end of main without a return is well-defined in C99+:
   main implicitly returns 0 */
}
C******************************************************************************
C FILE: omp_workshare2.f
C DESCRIPTION:
C OpenMP Example - Sections Work-sharing - Fortran Version
C In this example, the OpenMP SECTION directive is used to assign
C different array operations to each thread that executes a SECTION.
C AUTHOR: Blaise Barney 5/99
C LAST REVISED: 07/16/07
C******************************************************************************
PROGRAM WORKSHARE2
INTEGER N, I, NTHREADS, TID, OMP_GET_NUM_THREADS,
+ OMP_GET_THREAD_NUM
PARAMETER (N=50)
REAL A(N), B(N), C(N), D(N)
! Some initializations.
! BUG FIX: the original zeroed only the LAST element (C(N) and D(N)) on
! every pass of this loop, leaving C(1..N-1)/D(1..N-1) uninitialized.
! Zero each element instead, matching the companion C version, which
! does c[i] = d[i] = 0.0.
DO I = 1, N
A(I) = I * 1.5
B(I) = I + 22.35
C(I) = 0.0
D(I) = 0.0
ENDDO
! NTHREADS is shared but written only by thread 0
!$OMP PARALLEL SHARED(A,B,C,D,NTHREADS), PRIVATE(I,TID)
TID = OMP_GET_THREAD_NUM()
IF (TID .EQ. 0) THEN
NTHREADS = OMP_GET_NUM_THREADS()
PRINT *, 'Number of threads =', NTHREADS
END IF
PRINT *, 'Thread',TID,' starting...'
! Each SECTION executes once, by whichever thread picks it up
!$OMP SECTIONS
!$OMP SECTION
PRINT *, 'Thread',TID,' doing section 1'
DO I = 1, N
C(I) = A(I) + B(I)
WRITE(*,100) TID,I,C(I)
100 FORMAT(' Thread',I2,': C(',I2,')=',F8.2)
ENDDO
!$OMP SECTION
PRINT *, 'Thread',TID,' doing section 2'
DO I = 1, N
D(I) = A(I) * B(I)
WRITE(*,100) TID,I,D(I)
ENDDO
!$OMP END SECTIONS NOWAIT
PRINT *, 'Thread',TID,' done.'
!$OMP END PARALLEL
END
#include <stdio.h>
#include <omp.h>

/* File-scope variables: a and x are THREADPRIVATE, i.e. each thread keeps
 * its own persistent copy across parallel regions; b and tid stay shared
 * unless scoped otherwise on a directive. */
int a, b, i, tid;
float x;
#pragma omp threadprivate(a, x)

/* Demonstrates that THREADPRIVATE values assigned in the first parallel
 * region survive into the second one.  Dynamic thread adjustment is
 * turned off so the same thread team is reused.
 *
 * FIX: gave main an explicit 'int' return type and a return statement;
 * implicit int is invalid in C99 and later.
 */
int main(int argc, char *argv[]) {
  /* Explicitly turn off dynamic threads */
  omp_set_dynamic(0);
  printf("1st Parallel Region:\n");
  #pragma omp parallel private(b,tid)
  {
    tid = omp_get_thread_num();
    a = tid;
    b = tid;
    x = 1.1 * tid +1.0;
    printf("Thread %d: a,b,x= %d %d %f\n",tid,a,b,x);
  } /* end of parallel region */
  printf("************************************\n");
  printf("Master thread doing serial work here\n");
  printf("************************************\n");
  printf("2nd Parallel Region:\n");
  /* a and x retain each thread's values from the first region;
     b was PRIVATE there, so the value printed for b here is the
     (never-assigned) file-scope b, i.e. 0. */
  #pragma omp parallel private(tid)
  {
    tid = omp_get_thread_num();
    printf("Thread %d: a,b,x= %d %d %f\n",tid,a,b,x);
  } /* end of parallel region */
  return 0;
}
PROGRAM THREADPRIV
! Fortran counterpart of the C threadprivate example: the common block
! /C1/ (holding A) and the variable X are THREADPRIVATE, so each thread
! keeps persistent private copies across both parallel regions below.
INTEGER A, B, I, TID, OMP_GET_THREAD_NUM
REAL*4 X
COMMON /C1/ A
!$OMP THREADPRIVATE(/C1/, X)
! Explicitly turn off dynamic threads
! (so the same thread team - and hence the same threadprivate copies -
! is reused for the second parallel region)
CALL OMP_SET_DYNAMIC(.FALSE.)
PRINT *, '1st Parallel Region:'
!$OMP PARALLEL PRIVATE(B, TID)
TID = OMP_GET_THREAD_NUM()
A = TID
B = TID
X = 1.1 * TID + 1.0
PRINT *, 'Thread',TID,': A,B,X=',A,B,X
!$OMP END PARALLEL
PRINT *, '************************************'
PRINT *, 'Master thread doing serial work here'
PRINT *, '************************************'
PRINT *, '2nd Parallel Region: '
! A and X retain the per-thread values assigned in the first region;
! B was PRIVATE there, so the shared B printed here was never assigned
!$OMP PARALLEL PRIVATE(TID)
TID = OMP_GET_THREAD_NUM()
PRINT *, 'Thread',TID,': A,B,X=',A,B,X
!$OMP END PARALLEL
END
# A number of these examples come from Lev Lafayette, Sequential and Parallel Programming with C and Fortran, VPAC, 2015-2016, ISBN 978-0-9943373-1-3
# You will need to add a partition in each of these Slurm scripts; "physical" is recommended. e.g.,
# #SBATCH --partition=physical
module purge
module load spartan_2019
module load openmpi/3.1.4
......
......@@ -5,6 +5,7 @@
int main(argc,argv)
int argc;
char *argv[];
{
int myid, numprocs;
int tag, source, destination, count;
......@@ -18,6 +19,17 @@ char *argv[];
source=0;
destination=1;
count=1;
/*
volatile int i = 0;
char hostname[256];
gethostname(hostname, sizeof(hostname));
printf("PID %d on %s ready for attach\n", getpid(), hostname);
fflush(stdout);
while (0 == i)
sleep(5);
*/
if(myid == source){
printf( "I am the root 0 process of the group (total %d).\n", numprocs );
buffer=1729;
......
......@@ -5,6 +5,7 @@ Short for 'Virtual Environment', a virtualenv allows you to create an isolated w
## Load a python module and check that virtualenv is available
```
$ source /usr/local/module/spartan_old.sh
$ module load Python/3.7.1-GCC-6.2.0
$ which virtualenv
/usr/local/easybuild/software/Python/3.7.1-GCC-6.2.0/bin/virtualenv
......
#!/bin/bash
# Quantum ESPRESSO GPU job script.
#SBATCH --partition=gpgpu
#SBATCH --gres=gpu:4
# FIX: --account was declared twice (hpcadmingpgpu and hpcadmin) — a
# flattened old/new diff; only one directive should remain.
#SBATCH --account=hpcadmin
# Use a project ID that has gpgpu access.
module load QuantumESPRESSO/5.4.0-intel-2016.u3
module load CUDA/9.0.176-intel-2017.u2
#!/bin/bash
# Quantum ESPRESSO GPU job script.
#SBATCH --partition=gpgpu
#SBATCH --gres=gpu:4
# FIX: --account was declared twice (hpcadmingpgpu and hpcadmin) — a
# flattened old/new diff; only one directive should remain.
#SBATCH --account=hpcadmin
# Use a project ID that has gpgpu access.
module load QuantumESPRESSO/5.4.0-intel-2016.u3
module load CUDA/9.0.176-intel-2017.u2
This diff is collapsed.
# Simple four-function calculator helpers used by the menu script below.

# Sum of x and y.
add <- function(x, y) x + y

# Difference of x minus y.
subtract <- function(x, y) x - y

# Product of x and y.
multiply <- function(x, y) x * y

# Quotient of x over y (Inf/NaN on division by zero, as in base R).
divide <- function(x, y) x / y
# User input for the calculator: read an operation number and two
# integers from stdin, then dispatch to the matching helper function.
cat("Select the calculator operation.\n")
print("1.Add")
print("2.Subtract")
print("3.Multiply")
print("4.Divide")
cat("Please enter a number please: ")
choice <- as.integer(readLines("stdin", n = 1))
cat("You entered")
print(choice)
cat( "\n" )
# User input for integers
cat("Enter your first integer.\n")
num1 <- as.integer(readLines("stdin", n = 1))
cat("Enter your second integer.\n")
num2 <- as.integer(readLines("stdin", n = 1))
# FIX: 'operator' was assigned twice in the original; the first (symbol)
# assignment was dead code, immediately overwritten by the verbal labels.
# Keep the verbal labels, which are what the script actually printed.
operator <- switch(choice, "add", "subtract", "multiply", "divide")
result <- switch(choice, add(num1, num2), subtract(num1, num2), multiply(num1, num2), divide(num1, num2))
print(paste(num1, operator, num2, "=", result))
# Program to check if the input number is prime or not
# Take input from the user and works as a Rscript, Rstudio, interactive etc.
#
# Read in user input
#
# Lev Lafayette, 20200922
#
cat("Enter an integer please: ")
num <- as.integer(readLines("stdin", n = 1))
print(num)
# flag is 1 when num is prime, 0 otherwise
flag = 0
if (num == 2) {
  # 2 is the only even prime.  Handle it up front: in the original the
  # trial-division loop ran for num == 2 because R's ':' counts DOWN
  # (2:(num-1) is c(2, 1)), wrongly clearing the flag, which then had to
  # be patched back with a special case after the loop.
  flag = 1
} else if (num > 2) {
  # Trial division by every candidate factor in 2..(num-1)
  flag = 1
  for (i in 2:(num - 1)) {
    if ((num %% i) == 0) {
      flag = 0
      break
    }
  }
}
if (flag == 1) {
  print(paste(num, "is a prime number"))
} else {
  print(paste(num, "is not a prime number"))
}
1: Once you have the Relion GUI open, click the job type (e.g. 3D classification) you want to run, then select the Running tab on the right side:
fill in the values you need:
Number of MPI procs: 8 (equal --ntasks=8)
Number of threads: 4 (equal --cpus-per-task=4)
Submit to queue? Yes
Queue name: gpgpu (equal --partition=gpgpu)
Queue submit command: sbatch
Standard submission script: /usr/local/common/RELION/relion.sh (Where the relion.sh script is, See 2 for more details)
Minimum dedicated cores per node: 1
Additional arguments:
2: You can change the values for '--gres', '--time' and '--mem-per-cpu', but please leave the rest as it is.
3: To use GPU, you need to go to Compute tab, set 'Use GPU acceleration?' to 'Yes', and leave 'Which GPUs to use' blank.
For more detail, see:
https://hpc.nih.gov/apps/RELION/index.html
#!/bin/bash
# RELION queue-submission template; the XXX...XXX placeholders are
# substituted by the RELION GUI at submit time.
#SBATCH --ntasks=XXXmpinodesXXX
#SBATCH --partition=XXXqueueXXX
#SBATCH --qos=gpgpuhpcadmin
#SBATCH --cpus-per-task=XXXthreadsXXX
#SBATCH --time=1:00:00
#SBATCH --mem-per-cpu=12g
# FIX: --gres was declared twice (gpu:4 and gpu:2); only the last
# occurrence takes effect in sbatch, so keep a single unambiguous request.
#SBATCH --gres=gpu:2
srun XXXcommandXXX
Quote Characters
----------------
It is good practice to enclose the regular expression in single quotes, to prevent the shell from expanding the expression before it is passed to the grep process.
......@@ -39,6 +39,10 @@ time grep -F searchterm *
# The option `-l` with grep will print only the name of the each input file which matches the regular expression. This can be used with xargs to search multiple files for multiple search terms.
# Example: Search through multiple directories for a search term, starting from current working directory. Note that a recursive grep really should be used with `-l`.
grep -rl searchterm .
# Example: Search through a directory of build scripts for those files that use the Tarball block, have an installstep parameter, and use the dummy toolchain.
grep -l 'Tarball' * -R | xargs grep -l 'installstep' | xargs grep -l 'dummy'
......
#!/bin/bash
#SBATCH --nodes 1
#SBATCH --account hpcadmingpgpu
#SBATCH --account hpcadmin
# Use a project ID that has gpgpu access.
#SBATCH --partition gpgpu
#SBATCH --gres=gpu:p100:4
......
#!/bin/bash
#SBATCH --nodes 1
#SBATCH --account hpcadmingpgpu
#SBATCH --account hpcadmin
# Use a project ID that has gpgpu access.
#SBATCH --partition gpgpu
#SBATCH --gres=gpu:p100:4
......
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --account hpcadmingpgpu
#SBATCH --account hpcadmin
# Use a project ID that has gpgpu access.
#SBATCH --partition shortgpgpu
#SBATCH --gres=gpu:p100:1
......
#!/bin/bash
#SBATCH --nodes 1
# FIX: --partition was declared twice (shortgpgpu, then gpgpu); sbatch
# uses the last occurrence, so keep the single effective value.
#SBATCH --partition gpgpu
#SBATCH --gres=gpu:p100:1
#SBATCH --time 00:05:00
#SBATCH --cpus-per-task=1
#SBATCH -A hpcadmin
## Use an account that has GPGPU access
module purge
module load fosscuda/2019b
module load tensorflow/2.1.0-python-3.7.4
# NOTE(review): both example scripts are run below; confirm both the
# TF1-style and the compat-v1 example are intended to execute.
python tensor_flow.py
python3 tensor_flow3.py
......@@ -9,8 +9,7 @@ with tf.device('/gpu:0'):
c = tf.matmul(a, b)
# Creates a session with log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
sess = tf.compat.v1.Session((config=tf.ConfigProto(log_device_placement=True))
# Runs the op.
print(sess.run(c))
# Based on https://www.tensorflow.org/guide/using_gpu
import tensorflow as tf

# The tf.compat.v1 Session API is graph-mode only: eager execution (the
# TensorFlow 2.x default) must be disabled before the graph is built,
# otherwise sess.run() cannot evaluate the eager tensors created below.
tf.compat.v1.disable_eager_execution()

# Creates a graph -- force it to run on the GPU
with tf.device('/gpu:0'):
    a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
    b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
    c = tf.matmul(a, b)

# Creates a session with log_device_placement set to True.
# FIX: tf.ConfigProto was removed in TensorFlow 2.x; it now lives under
# tf.compat.v1 (the module loaded by the job script is tensorflow/2.1.0).
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))
# Runs the op.
print(sess.run(c))
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment