Worklog: January 2015

This is essentially the code from here:

http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

I have expanded it to handle non-Square matrices whose sizes are not a multiple of the block size:

/**
* writtern By Amir Hossein Bakhtiary, use as you wish. Shouldn't have any copyright problems.
*/
#include
#include
#include

#include
#include
#include
#include
#include
#include
#include

const int TILE_DIM = 32;
const int BLOCK_ROWS = 8;

template
__global__ void transposeCoalesced(Dtype *odata, const Dtype *idata, int rows,int cols)
{
__shared__ Dtype tile[TILE_DIM][TILE_DIM+1];

int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;

// if (x >= cols||y >= rows){
//      return;
// }

int maxJ = TILE_DIM;
int maxJ2 = TILE_DIM;
int otherMaxJ = rows - y;
if (maxJ > otherMaxJ)
    maxJ = otherMaxJ;

if ( x < cols ){
    for (int j = 0; j < maxJ; j += BLOCK_ROWS)
    tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*cols + x];
}
__syncthreads();

x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
y = blockIdx.x * TILE_DIM + threadIdx.y;

int otherMaxJ2 = cols - y;
if (maxJ2 > otherMaxJ2){
      maxJ2 = otherMaxJ2;
}
if ( x < rows){
    for (int j = 0; j < maxJ2; j += BLOCK_ROWS)
       odata[(y+j)*rows + x] = tile[threadIdx.x][threadIdx.y + j];
}

}

template
thrust::device_vector invertMatrix(thrust::device_vector mat,unsigned rows,unsigned cols){
    thrust::device_vector retval(rows*cols);
    dim3 dimGrid((rows+TILE_DIM-1)/TILE_DIM,(cols+TILE_DIM-1)/TILE_DIM, 1);
    //dim3 dimGrid((nHashes)/TILE_DIM,(nSamples)/TILE_DIM, 1);
    dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);

    transposeCoalesced<<< dimGrid, dimBlock>>>(retval.data().get(),mat.data().get(),rows,cols);
    return retval;
}

template
void testInvert(){
    thrust::host_vector h_vec(1000);
    thrust::generate(h_vec.begin(), h_vec.end(), rand);
    int rows = 3 , cols = 4;
    thrust::device_vector a = h_vec;

    thrust::device_vector aInv = invertMatrix(a,rows,cols);
    for (int i = 0 ; i < rows; i++){
        for (int j = 0; j < cols; j++){
            std::cout << (a[i*cols+j]) << " " << aInv[j*rows+i] << std::endl;
        }
    }
}

int main()
{
    testInvert();
}

Worklog

Friday, January 16, 2015

cuda matrix transpose code and test

Labels

About Me

Friday, January 16, 2015

cuda matrix transpose code and test

Subscribe To

Labels

About Me