I would expect the performance of hipBLAS GEMM operations to be comparable to that of cuBLAS GEMM operations.
The GEMM operations on floats are extremely slow.
See the attached test case. Compile it with -D CUDA and run on an NVIDIA GPU, then recompile with -D HIP and run the HIPified build on an AMD GPU, and compare the results.
I ran it on my two desktops -- one with a Radeon W6800 and the other with a GeForce RTX 2060. See the attached GemmTest.cpp; here are the results:
#include <unistd.h>
#include <iostream>
#include <stdlib.h>
#include <assert.h>
#if defined(CUDA)
// NVIDIA backend: map the vendor-neutral gpu*/gpuBlas* names onto the CUDA
// runtime and cuBLAS so the benchmark in main() compiles unchanged on either stack.
#include <cuda_runtime.h>
#include <cublas_v2.h>
// "No transpose" operation code forwarded to gpuBlasSgemm.
#define GPUBLAS_OP_N CUBLAS_OP_N
// Vendor-neutral aliases for the runtime error, event, stream, BLAS handle
// and BLAS status types.
typedef cudaError_t gpuError_t;
typedef cudaEvent_t gpuEvent_t;
typedef cudaStream_t gpuStream_t;
typedef cublasHandle_t gpuBlasHandle_t;
typedef cublasStatus_t gpuBlasStatus_t;
// Success codes compared against by the error checks in main().
const gpuBlasStatus_t gpuBlasSuccess = CUBLAS_STATUS_SUCCESS;
const gpuError_t gpuSuccess = cudaSuccess;
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
// The functions below are thin pass-through wrappers: each forwards its
// arguments verbatim to the corresponding CUDA / cuBLAS call and returns
// that call's status unchanged.
gpuError_t gpuMemcpy(void* dest, void* src, size_t size, cudaMemcpyKind flags) {
return cudaMemcpy(dest, src, size, flags);
}
// Allocates unified (managed) memory accessible from both host and device.
gpuError_t gpuMallocManaged(void** ret, size_t size) {
return cudaMallocManaged(ret, size);
}
gpuError_t gpuEventCreate(gpuEvent_t* pevent) {
return cudaEventCreate(pevent);
}
gpuError_t gpuEventRecord(gpuEvent_t event, gpuStream_t stream) {
return cudaEventRecord(event, stream);
}
gpuError_t gpuEventSynchronize(gpuEvent_t event) {
return cudaEventSynchronize(event);
}
gpuError_t gpuGetLastError() {
return cudaGetLastError();
}
// Elapsed time between two recorded events, in milliseconds.
gpuError_t gpuEventElapsedTime(float* t, gpuEvent_t start, gpuEvent_t stop) {
return cudaEventElapsedTime(t, start, stop);
}
gpuError_t gpuFree(void* p) {
return cudaFree(p);
}
gpuBlasStatus_t gpuBlasCreate(gpuBlasHandle_t* phandle) {
return cublasCreate(phandle);
}
// Single-precision GEMM: C = alpha*op(a)*op(b) + beta*C, column-major layout.
gpuBlasStatus_t gpuBlasSgemm(gpuBlasHandle_t handle, cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const float* alpha, const float* a, int lda, const float* b, int ldb, const float* beta, float* c, int ldc) {
return cublasSgemm(handle, ta, tb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}
#elif defined(HIP)
// AMD backend: the same vendor-neutral gpu*/gpuBlas* names mapped onto the
// HIP runtime and hipBLAS. Mirrors the CUDA branch wrapper-for-wrapper.
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <hipblas/hipblas.h>
#include <hiprand/hiprand.h>
// "No transpose" operation code forwarded to gpuBlasSgemm.
#define GPUBLAS_OP_N HIPBLAS_OP_N
// Vendor-neutral aliases for the runtime error, event, stream, BLAS handle
// and BLAS status types.
typedef hipError_t gpuError_t;
typedef hipEvent_t gpuEvent_t;
typedef hipStream_t gpuStream_t;
typedef hipblasHandle_t gpuBlasHandle_t;
typedef hipblasStatus_t gpuBlasStatus_t;
// Success codes compared against by the error checks in main().
const gpuBlasStatus_t gpuBlasSuccess = HIPBLAS_STATUS_SUCCESS;
const gpuError_t gpuSuccess = hipSuccess;
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
// Thin pass-through wrappers: each forwards its arguments verbatim to the
// corresponding HIP / hipBLAS call and returns that call's status unchanged.
gpuError_t gpuMemcpy(void* dest, void* src, size_t size, hipMemcpyKind flags) {
return hipMemcpy(dest, src, size, flags);
}
// Allocates unified (managed) memory accessible from both host and device.
gpuError_t gpuMallocManaged(void** ret, size_t size) {
return hipMallocManaged(ret, size);
}
gpuError_t gpuEventCreate(gpuEvent_t* pevent) {
return hipEventCreate(pevent);
}
gpuError_t gpuEventRecord(gpuEvent_t event, hipStream_t stream) {
return hipEventRecord(event, stream);
}
gpuError_t gpuEventSynchronize(gpuEvent_t event) {
return hipEventSynchronize(event);
}
gpuError_t gpuGetLastError() {
return hipGetLastError();
}
// Elapsed time between two recorded events, in milliseconds.
gpuError_t gpuEventElapsedTime(float* t, gpuEvent_t start, gpuEvent_t stop) {
return hipEventElapsedTime(t, start, stop);
}
gpuError_t gpuFree(void* p) {
return hipFree(p);
}
gpuBlasStatus_t gpuBlasCreate(gpuBlasHandle_t* phandle) {
return hipblasCreate(phandle);
}
// Single-precision GEMM: C = alpha*op(a)*op(b) + beta*C, column-major layout.
gpuBlasStatus_t gpuBlasSgemm(gpuBlasHandle_t handle, hipblasOperation_t ta, hipblasOperation_t tb, int m, int n, int k, const float* alpha, const float* a, int lda, const float* b, int ldb, const float* beta, float* c, int ldc) {
return hipblasSgemm(handle, ta, tb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}
#else
// Exactly one of -D CUDA or -D HIP must be supplied at compile time.
#error "Specify GPU type"
#endif
// Fill the rows*cols matrix at `p` with pseudo-random floats in [0, 1].
// Uses the C rand() generator, so the sequence is reproducible after srand().
void initRand(float* p, int rows, int cols) {
    const int range = 1;  // integer divisor applied to RAND_MAX (kept at 1)
    const int total = rows * cols;
    for (int idx = 0; idx < total; ++idx) {
        p[idx] = (float) rand() / (float) (RAND_MAX / range);
    }
}
// Benchmark driver: times square SGEMM (C = alpha*A*B + beta*C) for matrix
// sizes from `lower` to `upper`, doubling each step, averaged over `reps`
// repetitions. Options: -l lower, -u upper, -n num (parsed but unused),
// -r reps, -v (enable verbose output). Prints the average seconds per size.
int main(int argc, char* argv[]) {
    int status, lower, upper, num, reps, verbose;
    lower = 256;
    upper = 8192;
    num = 25000;  // accepted via -n for interface compatibility; not used below
    reps = 5;
    verbose = 0;
    while ((status = getopt(argc, argv, "l:u:n:r:v")) != -1) {
        switch (status) {
        case 'l':
            lower = strtoul(optarg, 0, 0);
            break;
        case 'u':
            upper = strtoul(optarg, 0, 0);
            break;
        case 'n':
            num = strtoul(optarg, 0, 0);
            break;
        case 'r':
            reps = strtoul(optarg, 0, 0);
            break;
        case 'v':
            // BUG FIX: 'v' is declared without ':' in the optstring, so optarg
            // is NULL here; the previous strtoul(optarg, ...) dereferenced NULL.
            verbose = 1;
            break;
        default:
            std::cerr << "invalid argument: " << status << std::endl;
            exit(1);
        }
    }
    if (verbose) {
        std::cout << "Running with" << " lower: " << lower << " upper: " << upper << " num: " << num << " reps: " << reps << std::endl;
    }
    if (verbose) {
        std::cout << "initializing inputs" << std::endl;
    }
    gpuBlasHandle_t handle;
    gpuBlasStatus_t blasStatus;
    blasStatus = gpuBlasCreate(&handle);
    if (blasStatus != gpuBlasSuccess) {
        std::cerr << "Could not create blas handle " << blasStatus << std::endl;
        exit(1);
    }
    // Use size_t arithmetic: upper * upper was previously computed in int and
    // overflows for upper > 46340.
    const size_t elems = (size_t) upper * (size_t) upper;
    const size_t bytes = elems * sizeof(float);
    float* A = (float*) calloc(elems, sizeof(float));
    float* B = (float*) calloc(elems, sizeof(float));
    float* C = (float*) calloc(elems, sizeof(float));
    if (!A || !B || !C) {
        std::cerr << "Could not allocate host memory; size: " << bytes << std::endl;
        exit(1);
    }
    initRand(A, upper, upper);
    initRand(B, upper, upper);
    initRand(C, upper, upper);
    float* dA, *dB, *dC;
    gpuError_t err;
    int lda, ldb, ldc, m, n, k;
    float alpha = 1.0f, beta = 0.0f;
    // Exit on allocation/copy failure: the old code only logged the error and
    // then went on to use the invalid device pointers.
    err = gpuMallocManaged((void**) &dA, bytes);
    if (err != gpuSuccess) {
        std::cerr << "Could not allocate GPU memory; size: " << bytes << std::endl;
        exit(1);
    }
    err = gpuMallocManaged((void**) &dB, bytes);
    if (err != gpuSuccess) {
        std::cerr << "Could not allocate GPU memory; size: " << bytes << std::endl;
        exit(1);
    }
    err = gpuMallocManaged((void**) &dC, bytes);
    if (err != gpuSuccess) {
        std::cerr << "Could not allocate GPU memory; size: " << bytes << std::endl;
        exit(1);
    }
    err = gpuMemcpy(dA, A, bytes, gpuMemcpyHostToDevice);
    if (err != gpuSuccess) {
        std::cerr << "Could not copy to GPU memory; size: " << bytes << std::endl;
        exit(1);
    }
    err = gpuMemcpy(dB, B, bytes, gpuMemcpyHostToDevice);
    if (err != gpuSuccess) {
        std::cerr << "Could not copy to GPU memory; size: " << bytes << std::endl;
        exit(1);
    }
    err = gpuMemcpy(dC, C, bytes, gpuMemcpyHostToDevice);
    if (err != gpuSuccess) {
        std::cerr << "Could not copy to GPU memory; size: " << bytes << std::endl;
        exit(1);
    }
    gpuEvent_t start, stop;
    gpuEventCreate(&start);
    gpuEventCreate(&stop);
    // Untimed warm-up GEMM so the first timed repetition does not include
    // one-time library initialization / kernel-load cost.
    blasStatus = gpuBlasSgemm(handle, GPUBLAS_OP_N, GPUBLAS_OP_N, lower, lower, lower, &alpha, dA, lower, dB, lower, &beta, dC, lower);
    if (blasStatus != gpuBlasSuccess) {
        std::cerr << "gpuBlasSgemm failed: " << blasStatus << std::endl;
        exit(1);
    }
    for (int s = lower; s <= upper; s = s * 2) {
        double sum = 0.0;  // accumulated elapsed seconds over all reps
        for (int r = 0; r < reps; r++) {
            gpuEventRecord(start, 0);
            m = n = k = s;
            lda = m; ldb = k; ldc = m;  // column-major, no transpose
            blasStatus = gpuBlasSgemm(handle, GPUBLAS_OP_N, GPUBLAS_OP_N, m, n, k, &alpha, dA, lda, dB, ldb, &beta, dC, ldc);
            gpuEventRecord(stop, 0);
            gpuEventSynchronize(stop);  // GEMM is async; wait before reading the timer
            if (blasStatus != gpuBlasSuccess) {
                std::cerr << "gpuBlasSgemm failed: " << blasStatus << std::endl;
                exit(1);
            }
            err = gpuGetLastError();
            if (err != gpuSuccess) {
                std::cerr << "gpu error: " << err << std::endl;
                exit(1);
            }
            float elapsed;
            gpuEventElapsedTime(&elapsed, start, stop);  // milliseconds
            elapsed /= 1000.0f;                          // -> seconds
            sum += elapsed;
        }
        std::cout << "size " << s << " average " << sum / reps << " s " << std::endl;
    }
    gpuFree(dA); gpuFree(dB); gpuFree(dC);
    free(A); free(B); free(C);
    // NOTE(review): the blas handle and the two events are never destroyed;
    // tolerable for a one-shot benchmark since the process exits here.
}