Parallel Programming (English) Course Project: Sparse Neural Network Inference
Sparse Neural Network Forward Propagation
Background and Requirements
This assignment comes from a 2019 parallel-computing competition (I no longer remember which one). The task is roughly the following: a 60000 * 1024 matrix A is the left operand and 120 sparse 1024 * 1024 matrices B[i] are the right operands. Multiplying A by B[i] gives the matrix C, of dimension 60000 * 1024; a bias term is added to C, the ReLU activation is applied, and the activated C becomes the matrix A for the next multiplication with B[i+1]. In other words, one forward pass of a neural network. Run time is measured with gettimeofday(&time,NULL);.
The original code does the multiplication with the classic triple loop, where each innermost loop computes one element of the result, so the complexity is O(n^3^) and a single matrix multiplication takes about 10 minutes, which is unbearable over 120 layers. The goal of this project is therefore to optimize the code as much as possible and shorten the time spent on the matrix multiplications and on bias + ReLU. The teacher rewarded last year's fastest group with a cherry-blossom keyboard; I have no idea whether our year will get one, and in any case I was not the fastest.
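For reference, a minimal sketch of what that baseline looks like (a hypothetical dense layer with the plain triple loop, bias + ReLU, and gettimeofday timing; the names and the simple ReLU here are illustrative, not the original course code):
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

// One dense layer: C = ReLU(A * B + bias), all matrices stored row-major.
// A is m x k, B is k x n, C is m x n. The triple loop is O(m*k*n).
// (Plain ReLU here; the project code additionally clamps activations at 32.)
void dense_layer_naive(const float *A, const float *B, float *C,
                       int m, int k, int n, float bias)
{
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            float sum = 0.0f;
            for (int e = 0; e < k; e++)
                sum += A[i * k + e] * B[e * n + j];
            sum += bias;
            C[i * n + j] = sum > 0.0f ? sum : 0.0f;   // ReLU
        }
    }
}

// Timing one layer the same way the assignment does, with gettimeofday.
double time_one_layer(const float *A, const float *B, float *C,
                      int m, int k, int n, float bias)
{
    struct timeval t1, t2;
    gettimeofday(&t1, NULL);
    dense_layer_naive(A, B, C, m, k, n, bias);
    gettimeofday(&t2, NULL);
    return (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0; // ms
}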
Where the Optimization Ideas Came From
There were 5 homework assignments this semester, 4 of which were about optimizing matrix multiplication, so I simply picked the best-performing algorithm among them and applied it to the course project.
- The first assignment multiplied two square matrices with sizes from 100 to 2000 in steps of 100, i.e. 20 test cases. The approaches were changing the loop order, adding OpenMP, and using OpenBLAS (a sketch of the reordered-loop OpenMP variant appears below).

The figure above shows the results of changing the loop order.
The figure above shows the results after adding OpenMP to the reordered loops; two of the orderings produced incorrect output once the OpenMP directive was added, so they are not plotted.
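A minimal sketch of the kind of variant tested here (i-k-j loop order plus an OpenMP pragma on the outer loop; this illustrates the idea and is not the exact homework code):
#include <omp.h>

// C += A * B with the i-k-j loop order, row-major n x n matrices.
// Keeping A[i*n + k] fixed in the middle loop turns the innermost loop into a
// streaming update over contiguous rows of B and C, which vectorizes well.
void gemm_ikj_omp(const double *A, const double *B, double *C, int n)
{
    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        for (int k = 0; k < n; k++) {
            double a = A[i * n + k];
            for (int j = 0; j < n; j++)
                C[i * n + j] += a * B[k * n + j];
        }
    }
}
Parallelizing over i is safe because each thread writes a disjoint set of rows of C; placing the pragma on a loop that does not own whole rows of C can race on C, which may be why two of the reordered versions produced wrong results with OpenMP.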

The figure above shows the OpenBLAS results.
// Row-major C = alpha*A*B + beta*C via OpenBLAS; with beta = 1 the existing
// contents of C are accumulated, so C should be zero-initialized beforehand.
void gemm_OpenBlas(double *A, double *B, double *C, int m, int k, int n)
{
    enum CBLAS_ORDER order = CblasRowMajor;
    enum CBLAS_TRANSPOSE transposeA = CblasNoTrans;
    enum CBLAS_TRANSPOSE transposeB = CblasNoTrans;
    double alpha = 1;
    double beta = 1;
    cblas_dgemm(order, transposeA, transposeB, m, n, k, alpha, A, k, B, n, beta, C, n);
}
gemm_OpenBlas(A, B, C_yours, m, k, n); is really fast: multiplying the two matrices takes only 0.25 s.
- Assignment 3 repeated assignment 1 using CUDA functions.
// cublasSgemm (legacy character-based interface; the handle is not actually used by this call)
cublasHandle_t s;
cublasCreate(&s);
cublasSgemm('N', 'N', m, n, k, 1.0f, d_A, m, d_B, k, 0, d_C, m);
cublasDestroy(s);
// cublasSgemm_v2 (handle-based interface; al and ve are the alpha and beta scalars, declared elsewhere)
cublasHandle_t s;
cublasCreate_v2(&s);
cublasSgemm_v2(s, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, &al, d_A, k, d_B, n, &ve, d_C, n);
cublasDestroy_v2(s);
//cblas_sgemm
void gemm_OpenBlas(float *A, float *B, float *C, int m, int k, int n) {
enum CBLAS_ORDER order = CblasRowMajor;
enum CBLAS_TRANSPOSE transposeA = CblasNoTrans;
enum CBLAS_TRANSPOSE transposeB = CblasNoTrans;
float alpha = 1;
float beta = 1;
cblas_sgemm(order,transposeA,transposeB,m,n,k,alpha,A,k,B,n,beta,C,n);
}
//cuda_shared
typedef struct {
int width;
int height;
int stride;
float *elements;
} Matrix;
__device__ float GetElement(const Matrix A, int row, int col) {
return A.elements[row * A.stride + col];
}
__device__ void SetElement(Matrix A, int row, int col,
float value) {
A.elements[row * A.stride + col] = value;
}
__device__ Matrix GetSubMatrix(Matrix A, int row, int col) {
    Matrix Asub;
    Asub.width  = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row
                                + BLOCK_SIZE * col];
    return Asub;
}
#define ifin(r, c, rb, cb, mat_A) \
((c + cb*BLOCK_SIZE < mat_A.width) && (r+rb*BLOCK_SIZE < mat_A.height))
__global__ void MatMulKernel_SharedMemory(Matrix A, Matrix B, Matrix C) {
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
int row = threadIdx.y;
int col = threadIdx.x;
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
Matrix subC = GetSubMatrix(C, blockRow, blockCol);
int tot = A.width / BLOCK_SIZE + (A.width % BLOCK_SIZE != 0);
float CValue = 0;
for (int i = 0; i < tot; ++i) {
Matrix Asub = GetSubMatrix(A, blockRow, i);
Matrix Bsub = GetSubMatrix(B, i, blockCol);
//if (i * BLOCK_SIZE + col < A.width)
if(ifin(row,col,blockRow,i,A)) {
As[row][col] = GetElement(Asub, row, col);
} else As[row][col] = 0;
if(ifin(row,col,i,blockCol,B)) {
Bs[row][col] = GetElement(Bsub, row, col);
}else Bs[row][col] = 0;
__syncthreads();
for (int e = 0; e < BLOCK_SIZE; ++e)
CValue += As[row][e] * Bs[e][col];
__syncthreads();
if (ifin(row, col, blockRow, blockCol, C))
SetElement(subC, row, col, CValue);
}
}
void gemm_cuda_shared(float *A, float *B, float *C, int m, int k, int n,double *time_value) {
Matrix d_A;
d_A.width = d_A.stride = k;
d_A.height = m;
size_t size = k * m * sizeof(float);
cudaMalloc(&d_A.elements, size);
cudaMemcpy(d_A.elements, A, size, cudaMemcpyHostToDevice);
Matrix d_B;
d_B.width = d_B.stride = n;
d_B.height = k;
size = n * k * sizeof(float);
cudaMalloc(&d_B.elements, size);
cudaMemcpy(d_B.elements, B, size, cudaMemcpyHostToDevice);
Matrix d_C;
d_C.width = d_C.stride = n;
d_C.height = m;
size = n * m * sizeof(float);
cudaMalloc(&d_C.elements, size);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid((n / dimBlock.x) + (n % dimBlock.x != 0), (m / dimBlock.y) + (m % dimBlock.y != 0));
timeval t1,t2;
*time_value = 0;
cudaDeviceSynchronize();
gettimeofday(&t1, nullptr);
MatMulKernel_SharedMemory<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
cudaDeviceSynchronize();
gettimeofday(&t2, nullptr);
*time_value+=(t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000000.0;
cudaMemcpy(C, d_C.elements, size, cudaMemcpyDeviceToHost);
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
//cuda_global
__global__ void MatrixMulKernel(float *A, float *B, float *C, int m, int k, int n) {
// Each thread computes one element of C
// by accumulating results into Cvalue
float Cvalue = 0;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < m && col < n) {
for (int e = 0; e < k; ++e)
Cvalue += A[row * k + e]
* B[e * n + col];
C[row * n + col] = Cvalue;
}
}
void gemm_cuda(float *A, float *B, float *C, int m, int k, int n,double *time_value) {
float *d_A, *d_B, *d_C;
size_t size = m * k * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = k * n * sizeof(float);
cudaMalloc(&d_B, size);
cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);
size = m * n * sizeof(float);
cudaMalloc(&d_C, size);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid((n / dimBlock.x) + (n % dimBlock.x != 0), (m / dimBlock.y) + (m % dimBlock.y != 0));
timeval t1,t2;
*time_value = 0;
cudaDeviceSynchronize();
gettimeofday(&t1, nullptr);
MatrixMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, m, k, n);
cudaDeviceSynchronize();
gettimeofday(&t2, nullptr);
*time_value += (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000000.0;
cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}
The fastest was cublasSgemm, which multiplied the matrices in only about 0.0075 s; next was cublasSgemm_v2 at around 0.0175 s.
- Assignment 4 multiplied a sparse square matrix by a tall, thin matrix: the square matrix is converted to CSR storage and the multiplication is carried out in CSR form. Although the CSR multiplication itself is cheap, it needs a lot of preparation, and when chaining many multiplications the matrix has to be converted and memory managed over and over; the CSR representation of the matrix entering each multiplication is different, the lengths of the three CSR arrays differ each time, so memory must be allocated and freed again and again, which costs a lot of time. A tiny CSR example is shown below for reference.
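As a quick reminder of the format, a small hand-written example (illustrative only, using float for the values) of the three CSR arrays that the code below works with:
// A 4 x 4 sparse matrix and its CSR representation:
//
//   [ 5 0 0 1 ]        value       = { 5, 1, 2, 3, 4, 6 }
//   [ 0 2 0 0 ]        columnindex = { 0, 3, 1, 2, 3, 0 }
//   [ 0 0 3 4 ]        rowpointer  = { 0, 2, 3, 5, 6 }
//   [ 6 0 0 0 ]
//
// rowpointer has m+1 entries; the nonzeros of row i live in
// value[rowpointer[i] .. rowpointer[i+1]-1], with their columns in columnindex.
float value[]       = { 5, 1, 2, 3, 4, 6 };
int   columnindex[] = { 0, 3, 1, 2, 3, 0 };
int   rowpointer[]  = { 0, 2, 3, 5, 6 };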
Despite these drawbacks, I still experimented with the CSR approach. The code below was pieced together from the internet; I do not fully understand how it works internally and only took over the functions, modifying the calling code. The matrices are converted to CSR storage and then multiplied with cusparseSpMM:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <stdbool.h>
#include "mmio.h"
#include "mmiohighlevel.h"
#include <omp.h>
#define nthreads 1024
#include <cusparse.h>
typedef struct
{
VALUE_TYPE *value;
int *columnindex;
int *rowpointer;
}SMatrix;
void dense2csr(int m, int n, VALUE_TYPE *C0, VALUE_TYPE *value,
int *columnindex, int *rowpointer){
VALUE_TYPE *d_C0;
size_t size = m * n * sizeof(VALUE_TYPE);
cudaMalloc(&d_C0, size);
cudaMemcpy(d_C0, C0, size,
cudaMemcpyHostToDevice);
size = m * sizeof(int);
int *nnzPerRow = 0;
cudaMalloc(&nnzPerRow, size);
int nzz = 0;
cusparseHandle_t handle = 0;
cusparseCreate(&handle);
cusparseMatDescr_t descrA = 0;
cusparseCreateMatDescr(&descrA);
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW;
cusparseSnnz(handle,dirA,m,n,descrA,
d_C0,m,nnzPerRow,&nzz);
float *csrValA;
size = nzz * sizeof(VALUE_TYPE);
cudaMalloc(&csrValA, size);
int *csrRowPtrA, *csrColIndA;
size = (m+1) * sizeof(int);
cudaMalloc(&csrRowPtrA, size);
size = nzz * sizeof(int);
cudaMalloc(&csrColIndA, size);
cusparseSdense2csr(handle,m,n,descrA,d_C0,
m,nnzPerRow,csrValA,csrRowPtrA,csrColIndA);
size = nzz * sizeof(VALUE_TYPE);
cudaMemcpy(value, csrValA, size,
cudaMemcpyDeviceToHost);
size = (m+1) * sizeof(int);
cudaMemcpy(rowpointer, csrRowPtrA, size,
cudaMemcpyDeviceToHost);
size = nzz * sizeof(int);
cudaMemcpy(columnindex, csrColIndA, size,
cudaMemcpyDeviceToHost);
cudaFree(d_C0);
cudaFree(csrValA);
cudaFree(csrRowPtrA);
cudaFree(csrColIndA);
}
void scan(int *array, int len)
{
int old, _new;
old = array[0];
array[0] = 0;
for (int i = 1; i < len; i++)
{
_new = array[i];
array[i] = old + array[i - 1];
old = _new;
}
}
void toRowIndx_(int line, int ld, VALUE_TYPE *val) {
VALUE_TYPE *temp = (VALUE_TYPE *) malloc(sizeof(VALUE_TYPE) * line * ld);
for (int i = 0; i < line; ++i) {
for (int j = 0; j < ld; ++j) {
temp[i * ld + j] = val[j * line + i];
}
}
memcpy(val, temp, sizeof(VALUE_TYPE) * line * ld);
free(temp);
}
void toColIndx_(int line, int ld, VALUE_TYPE *val) {
VALUE_TYPE *temp = (VALUE_TYPE *) malloc(sizeof(VALUE_TYPE) * line * ld);
for (int i = 0; i < ld; ++i) {
for (int j = 0; j < line; ++j) {
temp[i * line + j] = val[j * ld + i];
}
}
memcpy(val, temp, sizeof(VALUE_TYPE) * line * ld);
free(temp);
}
int main(int argc, char ** argv)
{
struct timeval t1, t2, t3, t4;
int size1 = 0;
int size2 = 0;
int *tc1;
int *tc2;
double bias = -0.3000;
int mA;
int nA;
int nnzA;
int isSymmetricA;
SMatrix A;
int mB;
int nB;
int nnzB;
int isSymmetricB;
SMatrix B[120];
int mC,nC;
omp_set_num_threads(nthreads);
// load matrix data from file
gettimeofday(&t3, NULL);
char filename1[]="sparse-images-1024.tsv";
mmio_info(&mA, &nA, &nnzA, &isSymmetricA, filename1);
A.value=(VALUE_TYPE*)malloc((nnzA)*sizeof(VALUE_TYPE));
A.columnindex=(int*)malloc((nnzA)*sizeof(int));
A.rowpointer=(int*)malloc((mA+1)*sizeof(int));
mmio_data(A.rowpointer, A.columnindex, A.value, filename1);
printf("input matrix A: ( %i, %i ) nnz = %i\n", mA, nA, nnzA);
/*
VALUE_TYPE *A0 = (VALUE_TYPE *)malloc(mA * nA * sizeof(VALUE_TYPE));
memset(A0, 0, sizeof(VALUE_TYPE) * mA * nA);
for (int i = 0; i < mA; i++)
{
for (int j = A.rowpointer[i]; j < A.rowpointer[i+1]; j++)
{
A0[i * nA + A.columnindex[j]] = A.value[j];
}
}
free(A.rowpointer);
free(A.columnindex);
free(A.value);
*/
char neuronfile1[] = "neuron1024/n1024-l";
char neuronfile2[] = ".tsv";
char filename3[60];
VALUE_TYPE *B0[120];
for (int k = 0; k < 120; k++)
{
char filenum[5];
int k1=k+1;
snprintf(filenum,sizeof(filenum),"%d",k1);
strcpy(filename3, neuronfile1);
strcat(filename3, filenum);
strcat(filename3, neuronfile2);
mmio_info(&mB, &nB, &nnzB, &isSymmetricB, filename3);
B[k].value=(VALUE_TYPE*)malloc((nnzB)*sizeof(VALUE_TYPE));
B[k].columnindex=(int*)malloc((nnzB)*sizeof(int));
B[k].rowpointer=(int*)malloc((mB+1)*sizeof(int));
mmio_data(B[k].rowpointer, B[k].columnindex, B[k].value, filename3);
B0[k] = (VALUE_TYPE *)malloc(mB * nB * sizeof(VALUE_TYPE));
memset(B0[k], 0, sizeof(VALUE_TYPE) * mB * nB);
for (int i = 0; i < mB; i++)
{
for (int j = B[k].rowpointer[i]; j < B[k].rowpointer[i+1]; j++)
{
B0[k][i * nB + B[k].columnindex[j]] = B[k].value[j];
}
}
free(B[k].rowpointer);
free(B[k].columnindex);
free(B[k].value);
}
gettimeofday(&t4,NULL);
double time_load = (t4.tv_sec - t3.tv_sec) * 1000.0 + (t4.tv_usec - t3.tv_usec) / 1000.0;
printf("Weight matrix load time: %f ms \n",time_load);
mC = mA;
nC = nB;
SMatrix C;
C.value=(VALUE_TYPE*)malloc((mA*nB)*sizeof(VALUE_TYPE));
C.columnindex=(int*)malloc((mA*nB)*sizeof(int));
C.rowpointer=(int*)malloc((mA+1)*sizeof(int));
VALUE_TYPE *C0 =(VALUE_TYPE *)malloc((mA*nB)*sizeof(VALUE_TYPE));
float *dRes;
VALUE_TYPE *d_B[120];
for (int k=0;k<120;k++)
{
size_t size = nA * nB * sizeof(VALUE_TYPE);
toColIndx_(nB,mB,B0[k]);
cudaMalloc(&d_B[k], size);
cudaMemcpy(d_B[k], B0[k], size,
cudaMemcpyHostToDevice);
}
gettimeofday(&t3, NULL);
for (int k = 0; k < 120; k++)
{
//memset(C0, 0, sizeof(VALUE_TYPE)*mC*nC);
int *dRow, *dCol;
size_t size = (mA + 1) * sizeof(int);
cudaMalloc(&dRow, size);
cudaMemcpy(dRow, A.rowpointer, size,
cudaMemcpyHostToDevice);
size = A.rowpointer[mA]* sizeof(int);
cudaMalloc(&dCol, size);
cudaMemcpy(dCol, A.columnindex, size,
cudaMemcpyHostToDevice);
VALUE_TYPE *dA, *dB0;
size = A.rowpointer[mA]* sizeof(VALUE_TYPE);
cudaMalloc(&dA, size);
cudaMemcpy(dA, A.value, size,
cudaMemcpyHostToDevice);
size = mB * nB* sizeof(VALUE_TYPE);
cudaMalloc(&dB0, size);
cudaMemcpy(dB0, d_B[k], size, cudaMemcpyDeviceToDevice);
size = mA* nA * sizeof(VALUE_TYPE);
cudaMalloc(&dRes, size);
cusparseHandle_t handle;
cusparseCreate(&handle);
cusparseOperation_t a = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t b = CUSPARSE_OPERATION_NON_TRANSPOSE;
VALUE_TYPE al = 1, be = 0;
cusparseSpMatDescr_t csrMtxA;
cusparseCreateCsr(&csrMtxA, (int64_t) mA, (int64_t) nA,
(int64_t) A.rowpointer[mA] /* nnz of the current A; it changes after every layer */,
dRow, dCol, dA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
cusparseDnMatDescr_t dnsMtxB;
cusparseCreateDnMat(&dnsMtxB, (int64_t) mB, (int64_t) nB,
(int64_t) nB, dB0, CUDA_R_32F, CUSPARSE_ORDER_COL);
cusparseDnMatDescr_t dnsMtxC;
cusparseCreateDnMat(&dnsMtxC, (int64_t) mA, (int64_t) nA,
(int64_t) mA, dRes, CUDA_R_32F, CUSPARSE_ORDER_COL);
gettimeofday(&t1,NULL);
cusparseSpMM(handle, a, b, &al, csrMtxA, dnsMtxB, &be, dnsMtxC, CUDA_R_32F, CUSPARSE_MM_ALG_DEFAULT, NULL);
gettimeofday(&t2,NULL);
double time_gemm = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
gettimeofday(&t1,NULL);
cudaMemcpy(C0, dRes,(mC*nC)*sizeof(VALUE_TYPE) , cudaMemcpyDeviceToHost);
cudaFree(dCol);
cudaFree(dRow);
cudaFree(dRes);
cudaFree(dA);
cudaFree(dB0);
#pragma omp parallel for
for (int i = 0; i < mC*nC; i++)
{
C0[i] += bias;
if (C0[i] <= 0)
{
C0[i] = 0;
}
else if (C0[i] >= 32)
{
C0[i] = 32;
}
}
int *rowpointer, *columnindex;
VALUE_TYPE* value;
columnindex = (int*)malloc((mA*nA)*sizeof(int));
rowpointer = (int*)malloc((mA+1)*sizeof(int));
value = (VALUE_TYPE*)malloc(mA*nA*sizeof(VALUE_TYPE));
dense2csr(mA, nB, C0, value, columnindex, rowpointer);
// free the CSR arrays of the previous A before building the new one (avoids a leak)
free(A.rowpointer);
free(A.columnindex);
free(A.value);
A.rowpointer = (int*)malloc((mA+1)*sizeof(int));
A.columnindex = (int*)malloc(rowpointer[mA]*sizeof(int));
A.value = (VALUE_TYPE*)malloc(rowpointer[mA]*sizeof(VALUE_TYPE));
memcpy(A.rowpointer, rowpointer, (mA+1)*sizeof(int));
memcpy(A.columnindex, columnindex, rowpointer[mA]*sizeof(int));
memcpy(A.value, value, rowpointer[mA]*sizeof(VALUE_TYPE));
// free the temporary buffers produced by dense2csr
free(rowpointer);
free(columnindex);
free(value);
gettimeofday(&t2,NULL);
double time_biasrelu = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
printf("k = %d, GEMM time: %4.5f ms, Bias+ReLU time: %4.5f ms\n", k+1, time_gemm, time_biasrelu);
}
//cudaMemcpy(A0, d_C, (mC*nC)*sizeof(VALUE_TYPE),cudaMemcpyDeviceToHost);
gettimeofday(&t4,NULL);
double time_inference = (t4.tv_sec - t3.tv_sec) * 1000.0 + (t4.tv_usec - t3.tv_usec) / 1000.0;
printf("Inference time: %f ms \n",time_inference);
//free(C0);
VALUE_TYPE *A0 = (VALUE_TYPE *)malloc(mA * nA * sizeof(VALUE_TYPE));
memset(A0, 0, sizeof(VALUE_TYPE) * mA * nA);
for (int i = 0; i < mA; i++)
{
for (int j = A.rowpointer[i]; j < A.rowpointer[i+1]; j++)
{
A0[i * nA + A.columnindex[j]] = A.value[j];
}
}
// check results
printf("test\n");
FILE* fs;
fs=fopen("sparse-images-1024-1.tsv","w+");
for (int i = 0; i <mA; i++)
{
VALUE_TYPE sum = 0; // accumulate in floating point so fractional values are not truncated to 0
for (int j = (i*nA); j < ((i+1)*nA); j++)
{
sum+=A0[j];
}
if(sum!=0)
{
fprintf(fs,"%d\n", i+1);
}
}
fclose(fs);
FILE* fp2=NULL;
fp2 = fopen("sparse-images-1024-1.tsv", "rb");
if (fp2 == NULL)
{
printf("Error:Open file fail!\n");
}
fseek(fp2, 0, SEEK_END);
size2 = ftell(fp2);
rewind(fp2);
tc2 = (int*)malloc(sizeof(int) * size2/4);
int readnum2 = fread(tc2, 4, size2/4, fp2);
fclose(fp2);
FILE* fp1;
fp1 = fopen("neuron1024-l120-categories.tsv", "rb");
if (fp1 == NULL)
{
printf("Error:Open file fail!\n");
}
fseek(fp1, 0, SEEK_END);
size1 = ftell(fp1);
rewind(fp1);
tc1 = (int*)malloc(sizeof(int) * size1/4);
int readnum1 = fread(tc1, 4, size1/4, fp1);
fclose(fp1);
int judge=0;
for(int i=0;i<size1/4;i++)
{
if(tc1[i]-tc2[i] != 0)
{
judge++;
}
}
printf("judge:%d\n",judge);
if (judge == 0) {
printf("CHALLENGE PASSED\n");
}
else
{
printf("CHALLENGE FAILED\n");
}
free(A0);
return 0;
}
This version takes about 24 s; doing bias and ReLU in a kernel saves a few more seconds. It is still much slower than the cublasSgemm approach described below.
- Assignment 5 was matrix multiplication with MPI. We were sent home because of the pandemic while learning it and I just muddled through, so I did not consider it here. Someone did reach 3 s with it, though, the fastest in the class.
Optimization Approach for the Matrix Multiplication
Choosing the fastest option: cublasSgemm
cublasSgemm is the matrix-multiplication API of NVIDIA's cuBLAS library. Because cuBLAS stores matrices in column-major order, the parameters of cublasSgemm are easy to configure incorrectly, so I first worked out the correct parameter settings for cublasSgemm and only then moved on to the forward-propagation task.
Row-major versus column-major storage of a 2-D matrix

A matrix is logically 2-dimensional, say M rows by K columns, but it is stored in memory as a 1-D layout; the difference between row-major and column-major storage is illustrated in the figure above.
As the figure shows, if a matrix is stored row-major and then read back in column-major order, the result is the transpose of the original matrix; the same holds for storing column-major and reading row-major.
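A small self-contained illustration of this point (hypothetical values, not from the assignment):
#include <stdio.h>

int main(void)
{
    // Logical matrix M (2 rows x 3 cols):
    //   [ 1 2 3 ]
    //   [ 4 5 6 ]
    // stored row-major as a flat array:
    float m[6] = { 1, 2, 3, 4, 5, 6 };

    // Read the same buffer back assuming column-major storage with 3 rows:
    // every column of the assumed layout is a row of M, so what we print is
    // M transposed (3 rows x 2 cols).
    for (int i = 0; i < 3; i++) {          // rows of the column-major view
        for (int j = 0; j < 2; j++)        // cols of the column-major view
            printf("%4.0f", m[j * 3 + i]); // element (i, j) in column-major, ld = 3
        printf("\n");
    }
    return 0;
}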
cublasSgemm function parameters
cublasStatus_t cublasSgemm(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const float *alpha,
const float *A, int lda,
const float *B, int ldb,
const float *beta,
float *C, int ldc)
This function performs the matrix-matrix multiplication
$$ C = \alpha\,\mathrm{op}(A)\,\mathrm{op}(B) + \beta C $$
where α and β are scalars, and A, B and C are matrices stored in column-major format with dimensions op(A) m×k, op(B) k×n and C m×n, respectively. Also, for matrix A
$$ \mathrm{op}(A) = \begin{cases} A & \text{if transa == CUBLAS\_OP\_N} \\ A^{T} & \text{if transa == CUBLAS\_OP\_T} \\ A^{H} & \text{if transa == CUBLAS\_OP\_C} \end{cases} $$
and op(B) is defined similarly for matrix B.
The parameters are explained in detail in the official documentation. From it we can extract the following key points:
- cublasSgemm performs the matrix multiply-add C = alpha * op ( A ) * op ( B ) + beta * C,
- where alpha and beta are scalars, and A, B, C are matrices stored in column-major order.
- If transa is CUBLAS_OP_N then op(A) = A; if it is CUBLAS_OP_T then op(A) is the transpose of A. CUBLAS_OP_C (conjugate transpose) is not needed for this assignment.
- Likewise, if transb is CUBLAS_OP_N then op(B) = B, and if it is CUBLAS_OP_T then op(B) is the transpose of B.
Since the API's matrix arguments are also named A, B and C, to avoid confusion with the matrices A and B in the example below we refer to the cublasSgemm arguments as follows:
- A is the left multiplicand,
- B is the right multiplicand,
- C is the result matrix.
So when alpha = 1 and beta = 0, cublasSgemm simply computes
$$ C = \mathrm{op}(A) \times \mathrm{op}(B) $$
that is, it solves C = A x B, where A has mA rows and nA columns and B has mB rows and nB columns, so C has mC rows and nC columns, and the six size parameters are related by
$$ mC = mA, \qquad nC = nB, \qquad nA = mB $$
Calling cublasSgemm without the transa and transb transpositions
Because A and B are stored row-major in the C/C++ program, with CUBLAS_OP_N what cuBLAS actually reads are their transposes A^T^ and B^T^.
From linear algebra, C^T^ = (A x B)^T^ = B^T^ x A^T^, so the cublasSgemm parameters are set as follows:
- transa and transb are set to CUBLAS_OP_N.
- The left multiplicand is B^T^, so the parameter is B; the right multiplicand is A^T^, so the parameter is A.
- The number of rows of the result is the number of rows of C^T^, so the parameter is nC.
- The number of columns of the result is the number of columns of C^T^, so the parameter is mC.
- The shared dimension (columns of the left multiplicand = rows of the right multiplicand) is nA (equivalently mB).
- The leading dimension of the left multiplicand B read column-major (i.e. the number of rows of B^T^) is nB.
- The leading dimension of the right multiplicand A read column-major (i.e. the number of rows of A^T^) is nA.
- The result is stored in parameter C; its leading dimension (the number of rows of C^T^) is nC.
cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,nC,mC,nA,&alpha,d_B,nB,d_A,nA,&beta,d_C,nC);
Calling the cublasSgemm API with these parameters (matrix A stored row-major at pointer d_a, matrix B stored row-major at pointer d_b, and d_c pointing to the storage for C), the data read back row-major from d_c is exactly C = A x B. The whole cublasSgemm computation is illustrated in the figure below.
Calling cublasSgemm with the transa and transb parameters
Again, because A and B are stored row-major in the C/C++ program, what cuBLAS reads by default are the transposes A^T^ and B^T^.
Setting the transa and transb parameters makes cublasSgemm transpose the A^T^ and B^T^ it has read once more before the multiplication, recovering A and B.
From linear algebra C = A x B, so the cublasSgemm parameters are set as follows.
transa and transb are set to CUBLAS_OP_T, so the matrices read in are transposed once before the multiplication:
- The left multiplicand is A, so the parameter is A; the right multiplicand is B, so the parameter is B.
- The number of rows of the result is the number of rows of C, so the parameter is mC.
- The number of columns of the result is the number of columns of C, so the parameter is nC.
- The shared dimension (columns of the left multiplicand = rows of the right multiplicand) is nA.
- The leading dimension of the left multiplicand A read column-major (i.e. the number of rows of A^T^) is nA.
- The leading dimension of the right multiplicand B read column-major (i.e. the number of rows of B^T^) is nB.
- The result is stored in parameter C; its leading dimension (the number of rows of C) is mC.
cublasSgemm(handle,CUBLAS_OP_T,CUBLAS_OP_T, mC, nC, nA,&alpha,d_a, nA, d_b, nB,&beta, d_c, mC);

Because the result is then read back row-major as N rows by M columns, reading C this way amounts to transposing it, so what we obtain is C^T^. If the transa and transb parameters are used, the result matrix therefore still has to be transposed after the computation, which lowers efficiency, so in this project we use the cublasSgemm call without the transa and transb transpositions.
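Putting the non-transpose setting together, a minimal sketch of a row-major GEMM helper built on this trick (a hypothetical wrapper, not the project code; it assumes d_A, d_B and d_C are device buffers already holding row-major data):
#include <cublas_v2.h>

// Computes the row-major product C = A * B (A: mA x nA, B: nA x nB, C: mA x nB)
// by asking cuBLAS for C^T = B^T * A^T, which is what it sees anyway, because it
// interprets our row-major buffers as column-major ones.
void gemm_rowmajor(cublasHandle_t handle,
                   const float *d_A, const float *d_B, float *d_C,
                   int mA, int nA, int nB)
{
    const float alpha = 1.0f, beta = 0.0f;
    // m = rows of C^T = nB, n = cols of C^T = mA, k = shared dimension = nA.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                nB, mA, nA,
                &alpha,
                d_B, nB,   // "left" operand B^T, leading dimension nB
                d_A, nA,   // "right" operand A^T, leading dimension nA
                &beta,
                d_C, nB);  // result C^T, leading dimension nB
}
In this project mA = 60000 and nA = nB = 1024, so a single call like this computes the entire 60000 x 1024 layer output.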
Code, Result Analysis, and Optimization
Code
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(ceil((float)nC / BLOCK_SIZE) , ceil((float)mC / BLOCK_SIZE) );
gettimeofday(&t3, NULL);
for (int k = 0; k < 120; k++)
{
memset(C0, 0, sizeof(VALUE_TYPE)*mC*nC);
VALUE_TYPE *d_A, *d_B, *d_C;
cudaMalloc((void**)&d_A, sizeof(VALUE_TYPE)*mA*nA);
cudaMalloc((void**)&d_B, sizeof(VALUE_TYPE)*mB*nB);
cudaMalloc((void**)&d_C, sizeof(VALUE_TYPE)*mC*nC);
cudaMemcpy(d_A, A0, sizeof(VALUE_TYPE)*mA*nA, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B0[k], sizeof(VALUE_TYPE)*mB*nB, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
gettimeofday(&t1, NULL);
// note: the leading dimensions are passed as mB here; this only works because mB == nB == nA == 1024
cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,nC,mC,nA,&alpha,d_B,mB,d_A,mB,&beta,d_C,nC);
//cublasSgemm_v2(s ,CUBLAS_OP_N,CUBLAS_OP_T,m, n, k, &al, d_A,k, d_B,n, &ve, d_C,n);
gettimeofday(&t2,NULL);
cublasDestroy(handle);
// cudaMemcpy(A0, d_C, sizeof(VALUE_TYPE)*mC*nC, cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
// cudaFree(d_C);
double time_gemm = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
gettimeofday(&t1, NULL);
RELUKernel<<<dimGrid, dimBlock>>>(d_C,nA);
gettimeofday(&t2,NULL);
double time_biasrelu = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
printf("k = %d, GEMM time: %4.5f ms, Bias+ReLU time: %4.5f ms\n",
k+1, time_gemm, time_biasrelu);
cudaMemcpy(A0, d_C, sizeof(VALUE_TYPE)*mC*nC, cudaMemcpyDeviceToHost);
cudaFree(d_C);
}
The run results are shown below:
A complete forward pass takes about 16.5 s.
Result analysis and optimization
We can see that the time spent on the matrix multiplications themselves and on bias + ReLU is tiny compared with the roughly 16 s inference time, so we should look at other parts of the code, such as memory allocation and device-memory access.
At the start of every iteration, memset zeroes C0 and the matrices d_A, d_B, d_C are declared; cudaMalloc allocates them and cudaMemcpy copies d_A and d_B from host to device. cublasSgemm is then called, d_A and d_B are freed, d_C is copied from device back to host, and d_C is freed. That is one iteration.
Things that can be optimized:
- At the end of every iteration d_C is copied back into A0, and at the start of the next iteration A0 is copied into d_A again. This can be simplified: declare d_A and d_C outside the loop, copy d_C directly into d_A at the end of each iteration, and do not free d_A and d_C inside the loop; free them only when the loop is finished (see the sketch after this list).
- Although a different matrix from B must be loaded into d_B each iteration, the d_B buffer itself can also be declared outside the loop.
- After all 120 multiplications, copy d_C back to A0 once outside the loop, and then free d_C, d_A and d_B.
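A minimal skeleton of this restructuring, reusing the variable names from the code below (illustrative only; as an assumption beyond the project code, the device-to-device copy from d_C to d_A is replaced here by a plain pointer swap, which avoids even that transfer):
cublasHandle_t handle;
cublasCreate(&handle);

// allocate the device buffers once and reuse them across all 120 layers
VALUE_TYPE *d_A, *d_B, *d_C;
cudaMalloc((void**)&d_A, sizeof(VALUE_TYPE)*mA*nA);
cudaMalloc((void**)&d_B, sizeof(VALUE_TYPE)*mB*nB);
cudaMalloc((void**)&d_C, sizeof(VALUE_TYPE)*mC*nC);
cudaMemcpy(d_A, A0, sizeof(VALUE_TYPE)*mA*nA, cudaMemcpyHostToDevice);

for (int k = 0; k < 120; k++)
{
    // only the weight matrix changes from layer to layer
    cudaMemcpy(d_B, B0[k], sizeof(VALUE_TYPE)*mB*nB, cudaMemcpyHostToDevice);
    cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,nC,mC,nA,&alpha,d_B,nB,d_A,nA,&beta,d_C,nC);
    RELUKernel<<<dimGrid, dimBlock>>>(d_C, nC);
    // the output of this layer is the input of the next one: swap the pointers
    VALUE_TYPE *tmp = d_A; d_A = d_C; d_C = tmp;
}

// after the last swap, d_A holds the final activations
cudaMemcpy(A0, d_A, sizeof(VALUE_TYPE)*mC*nC, cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cublasDestroy(handle);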
Optimized code
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <stdbool.h>
#include <cublas_v2.h>   // handle-based cuBLAS API (cublasCreate / cublasSgemm with a handle)
#include "mmio.h"
#include "mmiohighlevel.h"
#define BLOCK_SIZE 32
typedef struct
{
VALUE_TYPE *value;
int *columnindex;
int *rowpointer;
}SMatrix;
// Add the bias (-0.3) to every element of C, then apply a ReLU clamped to [0, 32].
// n is the row length (number of columns); no bounds check is needed here because
// mC (60000) and nC (1024) are both exact multiples of BLOCK_SIZE.
__global__ void RELUKernel(VALUE_TYPE *C, int n)
{
    VALUE_TYPE Cvalue;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    C[row * n + col] = C[row * n + col] - 0.3000;
    Cvalue = C[row * n + col];
    if (Cvalue <= 0) {
        C[row * n + col] = 0.0;
    }
    else if (Cvalue >= 32) {
        C[row * n + col] = 32.0;
    }
}
int main(int argc, char ** argv)
{
struct timeval t1, t2, t3, t4;
int size1 = 0;
int size2 = 0;
int *tc1;
int *tc2;
double bias = -0.3000;
const float alpha = 1.0, beta = 0.0;
int mA;
int nA;
int nnzA;
int isSymmetricA;
SMatrix A;
int mB;
int nB;
int nnzB;
int isSymmetricB;
SMatrix B[120];
int mC,nC;
int nnzC_golden = 0;
// load matrix data from file
gettimeofday(&t3, NULL);
char filename1[]="sparse-images-1024.tsv";
mmio_info(&mA, &nA, &nnzA, &isSymmetricA, filename1);
A.value=(VALUE_TYPE*)malloc((nnzA)*sizeof(VALUE_TYPE));
A.columnindex=(int*)malloc((nnzA)*sizeof(int));
A.rowpointer=(int*)malloc((mA+1)*sizeof(int));
mmio_data(A.rowpointer, A.columnindex, A.value, filename1);
printf("input matrix A: ( %i, %i ) nnz = %i\n", mA, nA, nnzA);
VALUE_TYPE *A0 = (VALUE_TYPE *)malloc(mA * nA * sizeof(VALUE_TYPE));
memset(A0, 0, sizeof(VALUE_TYPE) * mA * nA);
for (int i = 0; i < mA; i++)
{
for (int j = A.rowpointer[i]; j < A.rowpointer[i+1]; j++)
{
A0[i * nA + A.columnindex[j]] = A.value[j];
}
}
free(A.rowpointer);
free(A.columnindex);
free(A.value);
char neuronfile1[] = "neuron1024/n1024-l";
char neuronfile2[] = ".tsv";
char filename3[60];
VALUE_TYPE *B0[120];
for (int k = 0; k < 120; k++)
{
char filenum[5];
int k1=k+1;
snprintf(filenum,sizeof(filenum),"%d",k1);
strcpy(filename3, neuronfile1);
strcat(filename3, filenum);
strcat(filename3, neuronfile2);
mmio_info(&mB, &nB, &nnzB, &isSymmetricB, filename3);
B[k].value=(VALUE_TYPE*)malloc((nnzB)*sizeof(VALUE_TYPE));
B[k].columnindex=(int*)malloc((nnzB)*sizeof(int));
B[k].rowpointer=(int*)malloc((mB+1)*sizeof(int));
mmio_data(B[k].rowpointer, B[k].columnindex, B[k].value, filename3);
B0[k] = (VALUE_TYPE *)malloc(mB * nB * sizeof(VALUE_TYPE));
memset(B0[k], 0, sizeof(VALUE_TYPE) * mB * nB);
for (int i = 0; i < mB; i++)
{
for (int j = B[k].rowpointer[i]; j < B[k].rowpointer[i+1]; j++)
{
B0[k][i * nB + B[k].columnindex[j]] = B[k].value[j];
}
}
free(B[k].rowpointer);
free(B[k].columnindex);
free(B[k].value);
}
gettimeofday(&t4,NULL);
double time_load = (t4.tv_sec - t3.tv_sec) * 1000.0 + (t4.tv_usec - t3.tv_usec) / 1000.0;
printf("Weight matrix load time: %f ms \n",time_load);
mC = mA;
nC = nB;
VALUE_TYPE *C0 =(VALUE_TYPE *)malloc((mC*nC)*sizeof(VALUE_TYPE));
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(ceil((float)nC / BLOCK_SIZE) , ceil((float)mC / BLOCK_SIZE) );
gettimeofday(&t3, NULL);
VALUE_TYPE *d_A, *d_B, *d_C;
cudaMalloc((void**)&d_A, sizeof(VALUE_TYPE)*mA*nA);
cudaMalloc((void**)&d_B, sizeof(VALUE_TYPE)*mB*nB);
cudaMalloc((void**)&d_C, sizeof(VALUE_TYPE)*mC*nC);
cudaMemcpy(d_A, A0, sizeof(VALUE_TYPE)*mA*nA, cudaMemcpyHostToDevice);
for (int k = 0; k < 120; k++)
{
cudaMemcpy(d_B, B0[k], sizeof(VALUE_TYPE)*mB*nB, cudaMemcpyHostToDevice);
free(B0[k]);
cublasHandle_t handle;
cublasCreate(&handle);
gettimeofday(&t1, NULL);
// cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,nC,mC,nA,&alpha,d_B,mB,d_A,mB,&beta,d_C,nC);
cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,nC,mC,nA,&alpha,d_B,nB,d_A,nA,&beta,d_C,nC);
// cublasSgemm(handle,CUBLAS_OP_T,CUBLAS_OP_T, mC, nC, nA,&alpha,d_a, nA, d_b, nB,&beta, d_c, mC);
gettimeofday(&t2,NULL);
cublasDestroy(handle);
double time_gemm = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
gettimeofday(&t1, NULL);
RELUKernel<<<dimGrid, dimBlock>>>(d_C,nA);
gettimeofday(&t2,NULL);
double time_biasrelu = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
printf("k = %d, GEMM time: %4.5f ms, Bias+ReLU time: %4.5f ms\n",
k+1, time_gemm, time_biasrelu);
cudaMemcpy(d_A, d_C, sizeof(VALUE_TYPE)*mC*nC, cudaMemcpyDeviceToDevice);
}
cudaMemcpy(A0, d_A, sizeof(VALUE_TYPE)*mC*nC, cudaMemcpyDeviceToHost);
gettimeofday(&t4,NULL);
double time_inference = (t4.tv_sec - t3.tv_sec) * 1000.0 + (t4.tv_usec - t3.tv_usec) / 1000.0;
printf("Inference time: %f ms \n",time_inference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(C0);
// check results
printf("test\n");
FILE* fs;
fs=fopen("sparse-images-1024-1.tsv","w+");
for (int i = 0; i <mA; i++)
{
VALUE_TYPE sum = 0; // accumulate in floating point so fractional values are not truncated to 0
for (int j = (i*nA); j < ((i+1)*nA); j++)
{
sum+=A0[j];
}
if(sum!=0)
{
fprintf(fs,"%d\n", i+1);
}
}
fclose(fs);
FILE* fp2=NULL;
fp2 = fopen("sparse-images-1024-1.tsv", "rb");
if (fp2 == NULL)
{
printf("Error:Open file fail!\n");
}
fseek(fp2, 0, SEEK_END);
size2 = ftell(fp2);
rewind(fp2);
tc2 = (int*)malloc(sizeof(int) * size2/4);
int readnum2 = fread(tc2, 4, size2/4, fp2);
fclose(fp2);
FILE* fp1;
fp1 = fopen("neuron1024-l120-categories.tsv", "rb");
if (fp1 == NULL)
{
printf("Error:Open file fail!\n");
}
fseek(fp1, 0, SEEK_END);
size1 = ftell(fp1);
rewind(fp1);
tc1 = (int*)malloc(sizeof(int) * size1/4);
int readnum1 = fread(tc1, 4, size1/4, fp1);
fclose(fp1);
int judge=0;
for(int i=0;i<size1/4;i++)
{
if(tc1[i]-tc2[i] != 0)
{
judge++;
}
}
printf("judge:%d\n",judge);
if (judge == 0) {
printf("CHALLENGE PASSED\n");
}
else
{
printf("CHALLENGE FAILED\n");
}
free(A0);
return 0;
}
This code implements the optimizations described in the previous section; a screenshot of the run results follows:
6.7 s! Optimizing the memory handling alone made it about 10 s faster than the previous version; excluding the ReLU + bias improvement described next, that is still roughly 9 s saved. Quite remarkable!
Optimizing bias + ReLU
The bias + ReLU step simply adds the bias term to every element of the matrix and applies the ReLU activation, so it can obviously be parallelized with a kernel in which each thread processes one element, in order to speed it up.
A two-dimensional grid of two-dimensional thread blocks is used. The row index row is computed as blockIdx.y * blockDim.y + threadIdx.y, where blockIdx.y is the block index in the y direction, blockDim.y is the number of threads per block in the y direction, and threadIdx.y is the thread's index within its block in the y direction. col is computed in the same way from the x components.
The launch configuration and kernel are as follows:
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(ceil((float)nC / BLOCK_SIZE) , ceil((float)mC / BLOCK_SIZE) );
__global__ void RELUKernel(VALUE_TYPE *C,int n)
{
VALUE_TYPE Cvalue;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
C[row * n + col]=C[row * n + col]-0.3000;
Cvalue=C[row * n + col];
if (Cvalue<=0){
C[row * n + col]=0.0;}
else if(Cvalue>=32){
C[row * n + col]=32.0;
}
}
Time of the plain for-loop version:
Time of the kernel version:
A speedup of roughly 36000x!
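For reference, the for-loop version being compared against is essentially the host-side loop already shown in the cuSPARSE experiment (sketch; C0 is the mC x nC result on the host and bias = -0.3):
for (int i = 0; i < mC*nC; i++)
{
    C0[i] += bias;
    if (C0[i] <= 0)
    {
        C0[i] = 0;
    }
    else if (C0[i] >= 32)
    {
        C0[i] = 32;
    }
}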
Conclusion
The final optimized time is about 6.7 s.