Saya menguji dua pendekatan berbeda untuk menyalin array 2D di kernel CUDA.
Yang pertama meluncurkan blok thread TILE_DIM x TILE_DIM. Setiap blok menyalin satu petak (tile) array, dengan satu thread per elemen:
// Naive tile copy: one thread per element, launched with
// TILE_DIM x TILE_DIM threads per block.
__global__ void simple_copy(float *outdata, const float *indata){
// Global (column, row) coordinates of the element this thread handles.
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
// Matrix row width in elements; assumes the grid exactly tiles the matrix.
int width = gridDim.x * TILE_DIM;
outdata[y*width + x] = indata[y*width + x];
}
Yang kedua diambil dari NVIDIA Blog. Ini mirip dengan kernel sebelumnya tetapi menggunakan thread TILE_DIM x BLOCK_ROWS per blok. Setiap thread mengulang beberapa elemen matriks:
// Tile copy with TILE_DIM x BLOCK_ROWS threads per block: each thread
// copies TILE_DIM / BLOCK_ROWS elements, one every BLOCK_ROWS rows.
__global__ void fast_copy(float *outdata, const float *indata)
{
// Global column and starting row for this thread.
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
// Matrix row width in elements; assumes the grid exactly tiles the matrix.
int width = gridDim.x * TILE_DIM;
// Walk down the tile in steps of BLOCK_ROWS so the block covers all
// TILE_DIM rows with only BLOCK_ROWS rows of threads.
for (int k = 0 ; k < TILE_DIM ; k += BLOCK_ROWS)
outdata[(y+k)*width + x] = indata[(y+k)*width + x];
}
Saya menjalankan tes untuk membandingkan kedua pendekatan ini. Kedua kernel melakukan akses gabungan ke memori global, namun kernel kedua tampaknya lebih cepat.
Profiler visual NVIDIA mengonfirmasi pengujian ini.
Jadi bagaimana kernel kedua berhasil mencapai salinan yang lebih cepat?
Ini adalah kode lengkap yang saya gunakan untuk menguji kernel:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <conio.h>
#define TILE_DIM 32
#define BLOCK_ROWS 8
/* KERNELS */
// Naive tile copy: launched with TILE_DIM x TILE_DIM threads per block,
// so every thread copies exactly one matrix element.
__global__ void simple_copy(float *outdata, const float *indata){
    const int col = blockIdx.x * TILE_DIM + threadIdx.x;
    const int row = blockIdx.y * TILE_DIM + threadIdx.y;
    // Row width of the whole matrix, assuming the grid exactly tiles it.
    const int width = gridDim.x * TILE_DIM;
    const int idx = row * width + col;
    outdata[idx] = indata[idx];
}
//###########################################################################
// Tile copy from the NVIDIA blog: TILE_DIM x BLOCK_ROWS threads per block,
// each thread copying TILE_DIM / BLOCK_ROWS elements spaced BLOCK_ROWS
// rows apart.
__global__ void fast_copy(float *outdata, const float *indata)
{
// Global column and starting row for this thread.
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
// Matrix row width in elements; assumes the grid exactly tiles the matrix.
int width = gridDim.x * TILE_DIM;
// Step down the tile by BLOCK_ROWS so the block still covers all
// TILE_DIM rows with fewer threads.
for (int k = 0 ; k < TILE_DIM ; k += BLOCK_ROWS)
outdata[(y+k)*width + x] = indata[(y+k)*width + x];
}
//###########################################################################
/* MAIN */
/* Benchmark driver: times n_iter launches of each copy kernel on an
 * N x N matrix, then verifies both kernels produced the same output.
 * N must be a multiple of TILE_DIM so the grid exactly tiles the matrix. */
int main(){
    float *indata,*dev_indata,*outdata1,*dev_outdata1,*outdata2,*dev_outdata2;
    cudaEvent_t start, stop;
    float time1,time2;
    int i,j,k;
    int n_iter = 100;   /* timed launches per kernel */
    int N = 2048;       /* matrix dimension */
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    dim3 grid(N/TILE_DIM, N/TILE_DIM);
    dim3 threads1(TILE_DIM,TILE_DIM);     /* one thread per element */
    dim3 threads2(TILE_DIM,BLOCK_ROWS);  /* TILE_DIM/BLOCK_ROWS elems/thread */
    // Allocations
    indata = (float *)malloc(N*N*sizeof(float));
    outdata1 = (float *)malloc(N*N*sizeof(float));
    outdata2 = (float *)malloc(N*N*sizeof(float));
    if(indata == NULL || outdata1 == NULL || outdata2 == NULL){
        printf("Host allocation failed\n");
        return 1;
    }
    cudaMalloc( (void**)&dev_indata,N*N*sizeof(float) );
    cudaMalloc( (void**)&dev_outdata1,N*N*sizeof(float) );
    cudaMalloc( (void**)&dev_outdata2,N*N*sizeof(float) );
    // Initialisation
    for(j=0 ; j<N ; j++){
        for(i=0 ; i<N ; i++){
            indata[i + N*j] = i + N*j;
        }
    }
    // Transfer to Device
    cudaMemcpy( dev_indata, indata, N*N*sizeof(float),cudaMemcpyHostToDevice );
    /* Warm-up launches: the very first kernel launch pays one-time setup
     * costs that would otherwise bias the first timed region in favour of
     * whichever kernel runs second. */
    simple_copy<<<grid, threads1>>>(dev_outdata1,dev_indata);
    fast_copy<<<grid, threads2>>>(dev_outdata2,dev_indata);
    cudaDeviceSynchronize();
    // Simple copy
    cudaEventRecord( start, 0 );
    for(k=0 ; k<n_iter ; k++){
        simple_copy<<<grid, threads1>>>(dev_outdata1,dev_indata);
    }
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time1, start, stop );
    printf("Elapsed time with simple copy: %f\n",time1);
    // Fast copy
    cudaEventRecord( start, 0 );
    for(k=0 ; k<n_iter ; k++){
        fast_copy<<<grid, threads2>>>(dev_outdata2,dev_indata);
    }
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time2, start, stop );
    printf("Elapsed time with fast copy: %f\n",time2);
    // Transfer to Host
    cudaMemcpy( outdata1, dev_outdata1, N*N*sizeof(float),cudaMemcpyDeviceToHost );
    cudaMemcpy( outdata2, dev_outdata2, N*N*sizeof(float),cudaMemcpyDeviceToHost );
    /* Check for error.  Accumulate |a - b| rather than the signed
     * difference: a signed sum lets opposite-sign mismatches cancel and
     * can report a false zero for differing outputs. */
    float error = 0;
    for(j=0 ; j<N ; j++){
        for(i=0 ; i<N ; i++){
            error += fabsf(outdata1[i + N*j] - outdata2[i + N*j]);
        }
    }
    printf("error: %f\n",error);
    /* Kernel launch failures are silent; surface any pending CUDA error
     * from the launches/copies above. */
    cudaError_t cuda_status = cudaGetLastError();
    if(cuda_status != cudaSuccess){
        printf("CUDA error: %s\n", cudaGetErrorString(cuda_status));
    }
    /*// Print the copied matrix
    printf("Copy\n");
    for(j=0 ; j<N ; j++){
        for(i=0 ; i<N ; i++){
            printf("%f\t",outdata1[i + N*j]);
        }
        printf("\n");
    }*/
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    free(indata);
    free(outdata1);
    free(outdata2);
    cudaFree(dev_indata);
    cudaFree(dev_outdata1);
    cudaFree(dev_outdata2);
    cudaDeviceReset();
    getch();   /* conio.h, Windows-only: keeps the console window open */
    return 0;
}
//###########################################################################