Below this CUDA program, I want to calculate the addition of two vectors on GPU on Ubuntu 20.04(my GPU is 3080Ti). Both vectors contain 64 elements. I set each block and each grid to 4×4 and 2×2 respectively, 64 threads in total. After having compiling, I execute the program. But the result vec3 is that the element 0-15 are 64, after that, the left elements are all 0. Why is it?
#include<iostream>
using namespace std;
__global__ void vector_mul(int *const c_vector,const int *const a_vector,const int *const b_vector){
const unsigned int idx=blockIdx.x*blockDim.x threadIdx.x;
const unsigned int idy=blockIdx.y*blockDim.y threadIdx.y;
const unsigned int thid=(idy*blockDim.x*gridDim.x) idx;
c_vector[thid]=a_vector[thid] b_vector[thid];
}
int vec1[64];
int vec2[64];
int vec3[64];
int main(void){
const dim3 thread_layout(4,4);
const dim3 block_layout(2,2);
for(int i=0;i<64;i ){
vec1[i]=i;
vec2[i]=64-i;
}
//declare gpu pointer
int *gpu_vec1;
int *gpu_vec2;
int *gpu_vec3;
//allocate gpu memory to gpu pointer
cudaMalloc((void**)&gpu_vec1,64);
cudaMalloc((void**)&gpu_vec2,64);
cudaMalloc((void**)&gpu_vec3,64);
//copy data from host to device
cudaMemcpy(gpu_vec1,vec1,64,cudaMemcpyHostToDevice);
cudaMemcpy(gpu_vec2,vec2,64,cudaMemcpyHostToDevice);
vector_mul<<<block_layout,thread_layout>>>(gpu_vec3,gpu_vec1,gpu_vec2);
cudaMemcpy(vec3,gpu_vec3,64,cudaMemcpyDeviceToHost);
for(int i=0;i<64;i )
cout << vec3[i] <<endl;
cudaFree(gpu_vec1);
cudaFree(gpu_vec2);
cudaFree(gpu_vec3);
return 0;
} 1,1 Top
CodePudding user response:
For an array that is intended to hold 64 int elements:
int vec1[64];
...
for(int i=0;i<64;i ){
vec1[i]=i;
These are not correct:
cudaMalloc((void**)&gpu_vec1,64);
cudaMalloc((void**)&gpu_vec2,64);
cudaMalloc((void**)&gpu_vec3,64);
...
cudaMemcpy(gpu_vec1,vec1,64,cudaMemcpyHostToDevice);
cudaMemcpy(gpu_vec2,vec2,64,cudaMemcpyHostToDevice);
...
cudaMemcpy(vec3,gpu_vec3,64,cudaMemcpyDeviceToHost);
All of the size parameters for those operatiosn are intended to be the size in bytes. So instead of 64, in each place it should be sizeof(int)*64.
There is a CUDA sample application called vectorAdd where you can see an example of this.
