Friday, January 16, 2015

cuda matrix transpose code and test

This is essentially the code from here:

http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

I have expanded it to handle non-Square matrices whose sizes are not a multiple of the block size:

/**
 * writtern By Amir Hossein Bakhtiary, use as you wish. Shouldn't have any copyright problems.
 */
#include
#include
#include

#include
#include
#include
#include
#include
#include
#include


const int TILE_DIM = 32;
const int BLOCK_ROWS = 8;

template
__global__ void transposeCoalesced(Dtype *odata, const Dtype *idata, int rows,int cols)
{
  __shared__ Dtype tile[TILE_DIM][TILE_DIM+1];

  int x = blockIdx.x * TILE_DIM + threadIdx.x;
  int y = blockIdx.y * TILE_DIM + threadIdx.y;

//  if (x >= cols||y >= rows){
//      return;
//  }

  int maxJ = TILE_DIM;
  int maxJ2 = TILE_DIM;
  int otherMaxJ = rows - y;
  if (maxJ > otherMaxJ)
    maxJ = otherMaxJ;


  if ( x < cols ){
    for (int j = 0; j < maxJ; j += BLOCK_ROWS)
     tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*cols + x];
  }
  __syncthreads();

  x = blockIdx.y * TILE_DIM + threadIdx.x;  // transpose block offset
  y = blockIdx.x * TILE_DIM + threadIdx.y;

  int otherMaxJ2 = cols - y;
  if (maxJ2 > otherMaxJ2){
      maxJ2 = otherMaxJ2;
  }
  if ( x < rows){
    for (int j = 0; j < maxJ2; j += BLOCK_ROWS)
       odata[(y+j)*rows + x] = tile[threadIdx.x][threadIdx.y + j];
  }

}

template
thrust::device_vector invertMatrix(thrust::device_vector mat,unsigned rows,unsigned cols){
    thrust::device_vector retval(rows*cols);
    dim3 dimGrid((rows+TILE_DIM-1)/TILE_DIM,(cols+TILE_DIM-1)/TILE_DIM, 1);
    //dim3 dimGrid((nHashes)/TILE_DIM,(nSamples)/TILE_DIM, 1);
    dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);

    transposeCoalesced<<< dimGrid, dimBlock>>>(retval.data().get(),mat.data().get(),rows,cols);
    return retval;
}


template
void testInvert(){
    thrust::host_vector h_vec(1000);
    thrust::generate(h_vec.begin(), h_vec.end(), rand);
    int rows = 3 , cols = 4;
    thrust::device_vector a = h_vec;

    thrust::device_vector aInv = invertMatrix(a,rows,cols);
    for (int i = 0 ; i < rows; i++){
        for (int j = 0; j < cols; j++){
            std::cout << (a[i*cols+j]) << " " << aInv[j*rows+i] << std::endl;
        }
    }
}


int main()
{
    testInvert();
}