CUDA by example/chapter 4
Normal single-threaded programming
#include <stdio.h>

#define N   10
void add( int *a, int *b, int *c ) {
    int tid = 0;    // this is CPU zero, so we start at zero
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += 1;   // we have one CPU, so we increment by one
    }
}
int main( void ) {
    int a[N], b[N], c[N];
    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }
    add( a, b, c );
    // display the results
    for (int i=0; i<N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }
    return 0;
}
Basic multithreaded application
With two CPUs, the loop can be split so that each core handles every other element: CPU 1 sums the even indices and CPU 2 the odd ones. A sketch of launching both halves on host threads follows the listings.

CPU 1:

void add( int *a, int *b, int *c ) {
    int tid = 0;    // CPU 1 starts at the even indices
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += 2;   // we have two CPUs, so we increment by two
    }
}

CPU 2:

void add( int *a, int *b, int *c ) {
    int tid = 1;    // CPU 2 starts at the odd indices
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += 2;   // we have two CPUs, so we increment by two
    }
}
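To actually run the two halves in parallel, each variant would be started on its own host thread. Below is a minimal sketch using POSIX threads; the add_worker function, the args struct, and the start_tid field are illustrative names, not from the book:

#include <pthread.h>
#include <stdio.h>

#define N   10

// Arguments for one worker: the index it starts at and the arrays.
struct args { int start_tid; int *a, *b, *c; };

// Each thread sums every other element, starting at its own offset.
static void *add_worker( void *p ) {
    struct args *w = (struct args *)p;
    for (int tid = w->start_tid; tid < N; tid += 2)
        w->c[tid] = w->a[tid] + w->b[tid];
    return NULL;
}

int main( void ) {
    int a[N], b[N], c[N];
    for (int i=0; i<N; i++) { a[i] = -i; b[i] = i * i; }
    struct args w0 = { 0, a, b, c };   // "CPU 1": even indices
    struct args w1 = { 1, a, b, c };   // "CPU 2": odd indices
    pthread_t t0, t1;
    pthread_create( &t0, NULL, add_worker, &w0 );
    pthread_create( &t1, NULL, add_worker, &w1 );
    pthread_join( t0, NULL );
    pthread_join( t1, NULL );
    for (int i=0; i<N; i++)
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    return 0;
}

Compile with gcc -pthread. The two workers never write to the same element, so no locking is needed.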
CUDA threaded example
The __global__ qualifier tells the nvcc compiler that the add function should run on the device rather than on the host. The angle-bracket syntax in the call add<<<N,1>>>( ... ) launches N parallel copies of the kernel, as N blocks of one thread each, so each copy uses blockIdx.x to find the array element it is responsible for.
#include <stdio.h>

#define N   10
__global__ void add( int *a, int *b, int *c ) {
    int tid = blockIdx.x;    // this thread handles the data at its thread id
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}
int main( void ) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;
    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_a, N * sizeof(int) );
    cudaMalloc( (void**)&dev_b, N * sizeof(int) );
    cudaMalloc( (void**)&dev_c, N * sizeof(int) );
    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }
    // copy the arrays 'a' and 'b' to the GPU
    cudaMemcpy( dev_a, a, N * sizeof(int),
                              cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, N * sizeof(int),
                              cudaMemcpyHostToDevice );
    add<<<N,1>>>( dev_a, dev_b, dev_c );
    // copy the array 'c' back from the GPU to the CPU
    cudaMemcpy( c, dev_c, N * sizeof(int),
                              cudaMemcpyDeviceToHost );
    // display the results
    for (int i=0; i<N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }
    // free the memory allocated on the GPU
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );
    return 0;
}
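Note that the listing above ignores the status codes returned by cudaMalloc() and cudaMemcpy(). The book wraps every runtime call in a HANDLE_ERROR macro; a minimal sketch of such a macro follows (this variant is an assumption, not the book's exact definition):

#include <stdio.h>
#include <stdlib.h>

// Abort with a readable message if a CUDA runtime call fails.
// Modeled on the book's HANDLE_ERROR helper; the exact wording is ours.
#define HANDLE_ERROR( err )                                          \
    do {                                                             \
        cudaError_t e = (err);                                       \
        if (e != cudaSuccess) {                                      \
            printf( "%s in %s at line %d\n",                         \
                    cudaGetErrorString( e ), __FILE__, __LINE__ );   \
            exit( EXIT_FAILURE );                                    \
        }                                                            \
    } while (0)

// Usage: HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );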
Memory on the host and memory on the device
As a general rule: host pointers can access memory from host code, and device pointers can access memory from device code. The rules below summarize what you can and cannot do with a device pointer; a short sketch after the list illustrates them.
- You can pass pointers allocated with cudaMalloc() to functions that execute on the device.
- You can use pointers allocated with cudaMalloc() to read or write memory from code that executes on the device.
- You can pass pointers allocated with cudaMalloc() to functions that execute on the host.
- You cannot use pointers allocated with cudaMalloc() to read or write memory from code that executes on the host.
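For example, dereferencing a device pointer from host code is an error, even though the pointer value itself can be stored and passed around on the host. A minimal sketch of the wrong and the right way (variable names are illustrative):

#include <stdio.h>

int main( void ) {
    int *dev_ptr;
    int host_value = 0;
    cudaMalloc( (void**)&dev_ptr, sizeof(int) );

    // WRONG: dev_ptr points into device memory; the host cannot
    // dereference it (undefined behavior, typically a crash):
    // *dev_ptr = 42;

    // RIGHT: move data through cudaMemcpy() instead.
    int v = 42;
    cudaMemcpy( dev_ptr, &v, sizeof(int), cudaMemcpyHostToDevice );
    cudaMemcpy( &host_value, dev_ptr, sizeof(int), cudaMemcpyDeviceToHost );
    printf( "round trip: %d\n", host_value );

    cudaFree( dev_ptr );
    return 0;
}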