import loopy as lp
import numpy as np
import pyopencl as cl
from loopy.kernel.data import temp_var_scope as scopes
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
out_map = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14], dtype=np.int32)
if_val = np.array([-1, 0, -1, -1, -1, -1, 0, -1, -1, 0, 0, 0, -1, 0, -1], dtype=np.int32)
vals = np.array([2, 3, 0, 1, 2, 4, 1, 2, 4, 1, 3, 6, 0, 1, 4, 7, 1, 9, 10, 3, 10, 11, 1, 14, 15, 1, 12, 13, 21, 22, 23, 21, 22, 23, 3, 24, 25, 26, 27, 28, 29, 1, 3, 6], dtype=np.int32)
num_vals = np.array([2, 4, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3], dtype=np.int32)
num_vals_offset = np.array(np.cumsum(num_vals) - num_vals, dtype=np.int32)
knl = lp.make_kernel(['{[i]: 0 <= i < 12}',
'{[j]: 0 <= j < 100}',
'{[a_count]: 0 <= a_count < a_end}',
'{[b_count]: 0 <= b_count < b_end}'],
"""
for j
for i
<>i_map = out_map[i]
#find P_sum
<> a_end = abs(if_val[i_map])
<> a_sum = 1.0d {id=ainit}
if if_val[i_map] > 0
<> a_val = 10.0d {id=aval_decl}
else
a_val = 0.1d {id=aval_decl1}
end
for a_count
a_sum = a_sum * a_val {id=a_accum, dep=ainit:aval_decl:aval_decl1}
end
#find b_sum
<>b_end = num_vals[i_map]
<>offset = num_vals_offset[i_map] {id=offset}
<>b_sum = 0 {id=b_init}
for b_count
<>val = vals[offset + b_count] {dep=offset}
if if_val[i_map] != 0
b_sum = b_sum + if_val[i_map] * B[j,val] {id=b_accum, dep=b_init}
end
end
b_sum = exp(b_sum) {id=b_final, dep=b_accum}
out[j,i] = a_sum * b_sum {dep=a_accum:b_final}
end
end
""",
[lp.TemporaryVariable('out_map', initializer=out_map, shape=out_map.shape, read_only=True, scope=scopes.PRIVATE),
lp.TemporaryVariable('if_val', initializer=if_val, shape=if_val.shape, read_only=True, scope=scopes.PRIVATE),
lp.TemporaryVariable('vals', initializer=vals, shape=vals.shape, read_only=True, scope=scopes.PRIVATE),
lp.TemporaryVariable('num_vals', initializer=num_vals, shape=num_vals.shape, read_only=True, scope=scopes.PRIVATE),
lp.TemporaryVariable('num_vals_offset', initializer=num_vals_offset, shape=num_vals_offset.shape, read_only=True, scope=scopes.PRIVATE),
lp.GlobalArg('B', shape=(100, 31), dtype=np.float64),
lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)])
knl = lp.prioritize_loops(knl, ['j', 'i'])
print(lp.generate_code(knl)[0])
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global double const *restrict B, __global double *restrict out)
{
int a_end;
double a_sum;
double a_val;
int b_end;
double b_sum;
int i_map;
int const if_val[15] = { -1, 0, -1, -1, -1, -1, 0, -1, -1, 0, 0, 0, -1, 0, -1 };
int const num_vals[15] = { 2, 4, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3 };
int const num_vals_offset[15] = { 0, 2, 6, 9, 12, 14, 16, 19, 22, 25, 28, 31, 34, 37, 41 };
int offset;
int const out_map[12] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14 };
int val;
int const vals[44] = { 2, 3, 0, 1, 2, 4, 1, 2, 4, 1, 3, 6, 0, 1, 4, 7, 1, 9, 10, 3, 10, 11, 1, 14, 15, 1, 12, 13, 21, 22, 23, 21, 22, 23, 3, 24, 25, 26, 27, 28, 29, 1, 3, 6 };
for (int j = 0; j <= 99; ++j)
for (int i = 0; i <= 11; ++i)
{
i_map = out_map[i];
offset = num_vals_offset[i_map];
b_end = num_vals[i_map];
a_end = abs(if_val[i_map]);
b_sum = 0.0;
if (!(if_val[i_map] > 0))
a_val = 0.1;
if (if_val[i_map] > 0)
a_val = 10.0;
a_sum = 1.0;
for (int b_count = 0; b_count <= -1 + b_end; ++b_count)
**if (-1 + a_end >= 0)**
{
val = vals[offset + b_count];
if (if_val[i_map] != 0)
b_sum = b_sum + if_val[i_map] * B[31 * j + val];
}
b_sum = exp(b_sum);
for (int a_count = 0; a_count <= -1 + a_end; ++a_count)
a_sum = a_sum * a_val;
out[12 * j + i] = a_sum * b_sum;
}
}
I'm not sure if I'm just being dense and missing something obvious here, but I can't figure out why the bolded (starred, oops bolding doesn't work in code blocks) if-statement has been applied to the B-loop.