Code Monkey home page Code Monkey logo

Comments (3)

hughperkins avatar hughperkins commented on June 29, 2024

Added failing unit test for this in 53f3a3f

from clnn.

hughperkins avatar hughperkins commented on June 29, 2024

Seems like this might be a clBLAS issue?

#include <iostream>
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <clBLAS.h>
using namespace std;

//                        m  n  k  alpha    lda     ldb beta       ldc
//THCLBlas_Gemm('t', 'n', 1, 7 ,1, 1, ones, 1, bias, 1, 0, output_n, 1);
//THCLBlas_Gemm('n', 'n', 1, 7 ,3, 1, columns, 1, weight, 3, 1, output_n, 1);

//    SUBROUTINE DGEMM ( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA,
//                       B, LDB, BETA, C, LDC )

//          CHARACTER*1  TRANSA, TRANSB

//          INTEGER      M, N, K, LDA, LDB, LDC

//          DOUBLE       PRECISION ALPHA, BETA

//          DOUBLE       PRECISION A( LDA, * ), B( LDB, * ), C( LDC,
//                       * )
void clgemm(char transAchar, char transBchar, int M, int N, int K, float alpha, float *A, int lda,
     float *B, int ldb, float beta, float *C, int ldc, float *result) {
cout << "clgemm 1" << endl;
clblasTranspose transA = transAchar == 'n' ? clblasNoTrans : clblasTrans;
clblasTranspose transB = transBchar == 'n' ? clblasNoTrans : clblasTrans;

//size_t off  = 1;
//size_t offA = K + 1;   /* K + off */
//size_t offB = N + 1;   /* N + off */
//size_t offC = N + 1;   /* N + off */
size_t off = 0;
size_t offA = 0;
size_t offB = 0;
size_t offC = 0;

static const clblasOrder order = clblasColumnMajor;

  cl_int err;
  cl_platform_id platform = 0;
  cl_device_id device = 0;
  cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
  cl_context ctx = 0;
  cl_command_queue queue = 0;
  cl_mem bufA, bufB, bufC;
  cl_event event = NULL;
  int ret = 0;

cout << "clgemm 2" << endl;
  /* Setup OpenCL environment. */
  err = clGetPlatformIDs(1, &platform, NULL);
  if (err != CL_SUCCESS) {
      printf( "clGetPlatformIDs() failed with %d\n", err );
      return;
  }
  cout << "got platforms" << endl;

  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
  if (err != CL_SUCCESS) {
      printf( "clGetDeviceIDs() failed with %d\n", err );
      return;
  }

  props[1] = (cl_context_properties)platform;
  ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
  if (err != CL_SUCCESS) {
      printf( "clCreateContext() failed with %d\n", err );
      return;
  }

  queue = clCreateCommandQueue(ctx, device, 0, &err);
  if (err != CL_SUCCESS) {
      printf( "clCreateCommandQueue() failed with %d\n", err );
      clReleaseContext(ctx);
      return;
  }

  /* Setup clblas. */
  err = clblasSetup();
  if (err != CL_SUCCESS) {
      printf("clblasSetup() failed with %d\n", err);
      clReleaseCommandQueue(queue);
      clReleaseContext(ctx);
      return;
  }

  /* Prepare OpenCL memory objects and place matrices inside them. */
  bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
                        NULL, &err);
  bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
                        NULL, &err);
  bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
                        NULL, &err);

  err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
      M * K * sizeof(*A), A, 0, NULL, NULL);
  err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
      K * N * sizeof(*B), B, 0, NULL, NULL);
  err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
      M * N * sizeof(*C), C, 0, NULL, NULL);

  cout << "about to call blas" << endl;
  /* Call clblas extended function. Perform gemm for the lower right sub-matrices */
  err = clblasSgemm(order, transA, transB, M - off, N - off, K - off,
                       alpha, bufA, offA, lda,
                       bufB, offB, ldb, beta,
                       bufC, offC, ldc,
                       1, &queue, 0, NULL, &event);
//  float *result = new float[M * N];
  cout << "called blas" << endl;
  if (err != CL_SUCCESS) {
      printf("clblasSgemmEx() failed with %d\n", err);
      ret = 1;
  }
  else {
      /* Wait for calculations to be finished. */
      err = clWaitForEvents(1, &event);

      /* Fetch results of calculations from GPU memory. */
      err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
                                M * N * sizeof(*result),
                                result, 0, NULL, NULL);
      cout << "got result" << endl;
      /* At this point you will get the result of SGEMM placed in 'result' array. */
      puts("");
//      printResult("clblasSgemmEx result");
  }

//  for(int i = 0; i < M * N; i++ ) {
//    C[i] = result[i];
//  }

  /* Release OpenCL memory objects. */
  clReleaseMemObject(bufC);
  clReleaseMemObject(bufB);
  clReleaseMemObject(bufA);

  /* Finalize work with clblas. */
  clblasTeardown();

  /* Release OpenCL working objects. */
  clReleaseCommandQueue(queue);
  clReleaseContext(ctx);
}

/*
 * Reproduction case for the SGEMM discrepancy: multiplies a [1 x 1] matrix by
 * a [1 x 7] matrix with transA == 't' through clgemm(), printing the host
 * reference values ("Cours") first and the values returned by clgemm()
 * ("clout") afterwards so the two can be compared by eye.
 */
void test1() {
  const char transa = 't';
  const char transb = 'n';
  const int m = 1;
  const int n = 7;
  const int k = 1;
  const float alpha = 1;
  const int lda = 1;
  const int ldb = 1;
//  int ldb = k;
  const float beta = 0;
  const int ldc = 1;

  // [1 x 1] [1 x 7] => [1 x 7]

  float *A = new float[m * k];
  float *B = new float[k * n];
  float *C = new float[m * n];
  A[0] = 0.5;
  for(int i = 0; i < n; i++) {
    B[i] = 1.0f / (float)(1.0f + i) - 0.5f;
  }

  for(int i = 0; i < n; i++) {
    C[i] = 0.0f;
  }

  // Host reference. With m == k == 1, alpha == 1 and beta == 0 the product
  // reduces to scaling B by the single element of A.
  float *Cours = new float[m * n];
  for(int i = 0; i < n; i++) {
    Cours[i] = A[0] * B[i];
    cout << "Cours[" << i << "]=" << Cours[i] << endl;
  }

  // Zero-initialised: clgemm() returns without writing its output when an
  // OpenCL call fails, and the loop below must not read indeterminate values.
  float *clout = new float[m * n]();
  clgemm(transa, transb, m, n, k, alpha, A, lda,
     B, ldb, beta, C, ldc, clout);
  for(int i = 0; i < n; i++) {
    cout << "clout[" << i << "]=" << clout[i] << endl;
  }

  delete[] clout;
  delete[] Cours;
  delete[] C;
  delete[] B;
  delete[] A;
}

// Entry point: calls clewInit() and then runs the single reproduction case.
// The command-line arguments are not used.
int main(int argc, char *argv[]) {
  (void)argc;
  (void)argv;

  clewInit();
  test1();

  return 0;
}

output:

Cours[0]=0.25
Cours[1]=0
Cours[2]=-0.0833333
Cours[3]=-0.125
Cours[4]=-0.15
Cours[5]=-0.166667
Cours[6]=-0.178571
clgemm 1
clgemm 2
got platforms
about to call blas
called blas
got result

clout[0]=0.25
clout[1]=0
clout[2]=-0.0833333
clout[3]=-0.125
clout[4]=0
clout[5]=-0.166667
clout[6]=-0.15

from clnn.

hughperkins avatar hughperkins commented on June 29, 2024

It seems I can work around it by just changing 'transA' from 't' to 'n': since k is 1 anyway, the result is the same (or would be, if there weren't an issue when using transA == 't').

Addressed in a548012

from clnn.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.