Light

smistad / opencl-getting-started Goto Github PK

A small "getting started" tutorial for OpenCL. See http://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/ for more info

Home Page: http://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/

License: Other

C++ 22.99% C 74.73% Makefile 2.29%

opencl-getting-started's Introduction

OpenCL-Getting-Started

A small "getting started" tutorial for OpenCL. See http://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/ for more info.

For simplicity, the main.c in this directory does not check for errors. If it doesn't work for you, try the version in the 'errorchecking' subdirectory instead, which does (this was contributed by Christian Jaeger.)

opencl-getting-started's People

Contributors

Stargazers

Watchers

Forkers

dushyantgoyal sycdlcrain shaharbenda wildsoul yaredwondimu vicotredin pflanze bailei vida90 massinissalounis msosnick ptjoker95 nivertech sonyomega swederik edkeith jiadinggai configithub firebird00 asuresh1 linan7788626 chaicko zhangkom manasdas17 jameslinus f286 kuhar grevutiu-gabriel htt1994 saadmahboob furuame pengdonglin137 dailyactie brainabilgh aborserker sheeeng roninhack 42n4 arunsunil quancao ukidlucas sarthakrout sakridge iamdestruction hbaxmann yehudahs soietre nateh7 floedelmann trinhgiahuy nim- qrfaq theinthanhlan fangzhou-ye hossamfadeel cara0x0

opencl-getting-started's Issues

Efficiency seems off

Hi,
Whilst testing, I edited the program and added a function for the OpenCL path called "gpu" (I know OpenCL can run on the CPU too). I did this to see how much more efficient it is compared to serial programming.

I see an issue where the CPU is considerably faster using this code:
vector_add_kernel.cl

__kernel void vector_add(__global float *A, __global float *B, __global float *C) {

    // One work-item per element: the global id along dimension 0 selects
    // which element this invocation reads and writes.
    // NOTE(review): there is no bounds check, so the launch presumably must
    // use exactly as many work-items as the buffers have elements - confirm
    // against the host code.
    int idx = get_global_id(0);

    // Take the square root of each input, then store their product.
    float root_a = sqrt(A[idx]);
    float root_b = sqrt(B[idx]);
    C[idx] = root_a * root_b;
}

main.cpp

#include <math.h>

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>

#ifdef __APPLE__
    #include "OpenCL/opencl.h"
#else
    #include "CL/cl.h"
#endif

// Size in bytes (0x100000 = 1 MiB) of the buffer the kernel source file is
// read into; source text beyond this length is not read.
#define MAX_SOURCE_SIZE (0x100000)

using namespace std;

/**
 * Serial CPU reference for the benchmark.
 *
 * Fills A[i] = i and B[i] = size - i, then computes
 * C[i] = sqrt(A[i]) * sqrt(B[i]) for every element on the calling thread.
 * The result is discarded; the function exists only to be timed.
 *
 * @param size Number of elements in each vector. Values <= 0 do nothing.
 *
 * Terminates the process with exit(1) if the three working arrays cannot be
 * allocated, so that a caller timing this function never reports a figure
 * for work that did not happen.
 */
void cpu(const int size)
{
    if (size <= 0) {
        return;
    }

    const size_t count = static_cast<size_t>(size);
    // The arrays hold floats, so size them with sizeof(float), not sizeof(int).
    const size_t bytes = count * sizeof(float);

    float *A = static_cast<float*>(malloc(bytes));
    float *B = static_cast<float*>(malloc(bytes));
    float *C = static_cast<float*>(malloc(bytes));
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "cpu: failed to allocate 3 x %zu bytes.\n", bytes);
        free(A);
        free(B);
        free(C);
        exit(1);
    }

    // Create the two input vectors
    for (size_t i = 0; i < count; i++) {
        A[i] = static_cast<float>(i);
        B[i] = static_cast<float>(count - i);
    }

    // Calculate C on CPU. sqrtf keeps the arithmetic in single precision,
    // matching the float data.
    for (size_t i = 0; i < count; i++) {
        C[i] = sqrtf(A[i]) * sqrtf(B[i]);
    }

    // Read one result through a volatile so an optimizing compiler cannot
    // treat the whole computation as dead code and remove it.
    volatile float sink = C[count - 1];
    (void)sink;

    free(A);
    free(B);
    free(C);
}

void gpu(const int size) 
{
    // Create the two input vectors
    int i;
    float *A = (float*)malloc(sizeof(int)*size);
    float *B = (float*)malloc(sizeof(int)*size);
    for(i = 0; i < size; i++) {
        A[i] = i;
        B[i] = size - i;
    }

    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("vector_add_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;   
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1, 
            &device_id, &ret_num_devices);

    // Create an OpenCL context
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector 
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, 
            size * sizeof(int), NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            size * sizeof(int), NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
            size * sizeof(int), NULL, &ret);

    // Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
            size * sizeof(int), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, 
            size * sizeof(int), B, 0, NULL, NULL);

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1, 
            (const char **)&source_str, (const size_t *)&source_size, &ret);

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    
    // Execute the OpenCL kernel on the list
    size_t global_item_size = size; // Process the entire lists
    size_t local_item_size = 64; // Process in groups of 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
            &global_item_size, &local_item_size, 0, NULL, NULL);

    // Read the memory buffer C on the device to the local variable C
    float *C = (float*)malloc(sizeof(int)*size);
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, 
            size * sizeof(int), C, 0, NULL, NULL);

    // Display the result to the screen
    for(i = 0; i < size; i++) {
        //printf("sqrt(%f) * sqrt(%f) = %f\n", A[i], B[i], C[i]);
    }
    
    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
}

/**
 * Times cpu() and gpu() on the same element count and prints both wall-clock
 * durations in milliseconds.
 *
 * Each figure covers the entire call. For gpu() in this file that includes
 * loading and building the kernel and copying the vectors to and from the
 * device, so the GPU number is not kernel execution time alone.
 */
int main(void)
{
    // steady_clock is monotonic, which is what interval measurement needs.
    using Clock = std::chrono::steady_clock;
    using ms = std::chrono::milliseconds;

    // 2^30 elements: 4 GiB per float vector, three vectors per run.
    const int size = 1024 * 1024 * 1024;

    const auto cpu_start = Clock::now();
    cpu(size);
    const auto cpu_finish = Clock::now();
    const ms cpu_diff = std::chrono::duration_cast<ms>(cpu_finish - cpu_start);

    const auto gpu_start = Clock::now();
    gpu(size);
    const auto gpu_finish = Clock::now();
    const ms gpu_diff = std::chrono::duration_cast<ms>(gpu_finish - gpu_start);

    // printf needs plain integers: pass the tick counts, not the duration
    // objects, and use a format that matches the cast type.
    printf("CPU: %lld ms, GPU: %lld ms\n",
           static_cast<long long>(cpu_diff.count()),
           static_cast<long long>(gpu_diff.count()));

    return 0;
}

My results were CPU: 21883 ms, GPU: 50536 ms.

I can't tell what's wrong as I'm new to parallel programming, so I was hoping you could take a look.

Thank you,
Jamie

Recommend Projects

React

A declarative, efficient, and flexible JavaScript library for building user interfaces.
Vue.js

🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
Typescript

TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
TensorFlow

An Open Source Machine Learning Framework for Everyone
Django

The Web framework for perfectionists with deadlines.
Laravel

A PHP framework for web artisans
D3

Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

javascript

JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
web

Something interesting about the web. A new door to the world.
server

A server is a program made to process requests and deliver data to clients.
Machine learning

Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Visualization

Something interesting about visualization: using data as art.
Game

Something interesting about games: making everyone happy.

Recommend Org

Facebook

We are working to build community through open source technology. NB: members must have two-factor auth.
Microsoft

Open source projects and samples from Microsoft.
Google

Google ❤️ Open Source for everyone.
Alibaba

Alibaba Open Source for everyone
D3

Data-Driven Documents codes.
Tencent

China Tencent open source team.