http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/
I have expanded it to handle non-Square matrices whose sizes are not a multiple of the block size:
/**
* writtern By Amir Hossein Bakhtiary, use as you wish. Shouldn't have any copyright problems.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
const int TILE_DIM = 32;
const int BLOCK_ROWS = 8;
template
__global__ void transposeCoalesced(Dtype *odata, const Dtype *idata, int rows,int cols)
{
__shared__ Dtype tile[TILE_DIM][TILE_DIM+1];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
// if (x >= cols||y >= rows){
// return;
// }
int maxJ = TILE_DIM;
int maxJ2 = TILE_DIM;
int otherMaxJ = rows - y;
if (maxJ > otherMaxJ)
maxJ = otherMaxJ;
if ( x < cols ){
for (int j = 0; j < maxJ; j += BLOCK_ROWS)
tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*cols + x];
}
__syncthreads();
x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
y = blockIdx.x * TILE_DIM + threadIdx.y;
int otherMaxJ2 = cols - y;
if (maxJ2 > otherMaxJ2){
maxJ2 = otherMaxJ2;
}
if ( x < rows){
for (int j = 0; j < maxJ2; j += BLOCK_ROWS)
odata[(y+j)*rows + x] = tile[threadIdx.x][threadIdx.y + j];
}
}
template
thrust::device_vector
thrust::device_vector
dim3 dimGrid((rows+TILE_DIM-1)/TILE_DIM,(cols+TILE_DIM-1)/TILE_DIM, 1);
//dim3 dimGrid((nHashes)/TILE_DIM,(nSamples)/TILE_DIM, 1);
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
transposeCoalesced<<< dimGrid, dimBlock>>>(retval.data().get(),mat.data().get(),rows,cols);
return retval;
}
template
void testInvert(){
thrust::host_vector
thrust::generate(h_vec.begin(), h_vec.end(), rand);
int rows = 3 , cols = 4;
thrust::device_vector
thrust::device_vector
for (int i = 0 ; i < rows; i++){
for (int j = 0; j < cols; j++){
std::cout << (a[i*cols+j]) << " " << aInv[j*rows+i] << std::endl;
}
}
}
int main()
{
testInvert
}