anydsl / thorin Goto Github PK

The Higher-Order Intermediate Representation

License: GNU Lesser General Public License v3.0

CMake 1.54% C++ 97.38% NASL 0.12% C 0.95%

compiler compiler-optimization intermediate-representation higher-order-functions continuation-passing-style gpu cpu vectorization simd

thorin's Introduction

Thorin

The Higher-Order Intermediate Representation

Build Instructions

See Build Instructions.

Documentation

See our AnyDSL/Thorin Website.

thorin's People

Contributors

Stargazers

Watchers

Forkers

mastercassim longjohncoder sdleffler danielspaniol sailfish009 hugobros3 m-kurtenacker leissa skosukhin pearcoding f-fries mfkiwl neuralcoder3 dfop-hd

thorin's Issues

stick to coding conventions

thorin/runtime uses void *p instead of void* p against our coding convetions.
https://github.com/AnyDSL/anydsl/wiki/Coding-Style

ycomp output for graphs

reuse infrastructure for dumping trees and graphs:

domtree
looptree
CDG
CFG

Code generation problem for while loops with if statements in the body

This code fragment:

extern "device" {
    fn "llvm.nvvm.read.ptx.sreg.tid.x" nvvm_read_ptx_sreg_tid_x() -> i32;
    fn "llvm.nvvm.barrier0" nvvm_barrier0() -> ();
}

extern "thorin" {
    fn nvvm(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
    fn sizeof[T]() -> i32;
}

extern "C" {
    fn thorin_alloc(i32, i64) -> &i8;
}

struct Buffer {
    device: i32,
    data: &[i8]
}

fn alloc(dev: i32, size: i32) -> Buffer {
    Buffer {
        device: dev,
        data: thorin_alloc(dev, size as i64) as &[i8]
    }
}

fn thorin_device(platform: i32, device: i32) -> i32 { platform | (device << 4) }
fn alloc_cuda(dev: i32, size: i32) -> Buffer { alloc(thorin_device(1, dev), size) }

fn main() -> () {
  let buf = alloc_cuda(0, sizeof[i32]() * 57);
  let mut ptr = buf.data as &[i32];
  with nvvm(0, (2, 2, 2), (2, 2, 2)) {
    let mut id = 1;
    if nvvm_read_ptx_sreg_tid_x() == 0 {
      id = 5;
    }

    while (id < 57) {
      nvvm_barrier0();
      if nvvm_read_ptx_sreg_tid_x() == 0 {
        // here I update a global memory location that will ba later read by all other threads
        ptr(id) = 1;
        id += 2;
      }
      nvvm_barrier0();
    }
  }
}

Generates the following nvvm code:

; ModuleID = 'broken2'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-cuda"

define ptx_kernel void @lambda_crit_268([0 x i32]* %_271_389) {
lambda:
  %0 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %1 = icmp eq i32 %0, 0
  br i1 %1, label %next10.us, label %next10

next10.us:                                        ; preds = %lambda, %next10.us
  %id111.us = phi i32 [ %3, %next10.us ], [ 5, %lambda ]
  tail call ptx_device void @llvm.nvvm.barrier0()
  %2 = getelementptr inbounds [0 x i32]* %_271_389, i64 0, i32 %id111.us
  store i32 1, i32* %2
  %3 = add nsw i32 %id111.us, 2
  tail call ptx_device void @llvm.nvvm.barrier0()
  %4 = icmp slt i32 %3, 57
  br i1 %4, label %next10.us, label %next2

next2:                                            ; preds = %next10.us
  ret void

next10:                                           ; preds = %next10, %lambda
  tail call ptx_device void @llvm.nvvm.barrier0()
  tail call ptx_device void @llvm.nvvm.barrier0()
  br label %next10
}

declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*)

; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: nounwind
declare ptx_device void @llvm.nvvm.barrier0() #1

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}

!0 = metadata !{i32 1, i32 2}
!1 = metadata !{void ([0 x i32]*)* @lambda_crit_268, metadata !"kernel", i64 1}

The problem is that 2 versions of the while loop body is generated: 1 for thread 0 and one of the other threads. As a result, I get the expected result for thread 0 but all other threads produce wrong results.

In my case, thread 0 updates some device memory location which will be later read by other threads.

This happens only when the code is compiled with -O3 compiler option. If no optimization flag is passed the code is generated as expected:

; ModuleID = 'broken2'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-cuda"

define ptx_kernel void @lambda_crit_268([0 x i32]* %_271_389) {
lambda_crit_268_start:
  br label %lambda_crit_268

lambda_crit_268:                                  ; preds = %lambda_crit_268_start
  %0 = call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  br label %lambda

lambda:                                           ; preds = %lambda_crit_268
  %llvm.nvvm.read.ptx.sreg.tid.x = phi i32 [ %0, %lambda_crit_268 ]
  %1 = icmp eq i32 %llvm.nvvm.read.ptx.sreg.tid.x, 0
  br i1 %1, label %if_then, label %if_else

if_else:                                          ; preds = %lambda
  br label %next

if_then:                                          ; preds = %lambda
  br label %next

next:                                             ; preds = %if_then, %if_else
  %id = phi i32 [ 5, %if_then ], [ 1, %if_else ]
  br label %while_head

while_head:                                       ; preds = %next10, %next
  %id1 = phi i32 [ %id, %next ], [ %id9, %next10 ]
  %2 = icmp slt i32 %id1, 57
  br i1 %2, label %while_body, label %next2

next2:                                            ; preds = %while_head
  ret void

while_body:                                       ; preds = %while_head
  call ptx_device void @llvm.nvvm.barrier0()
  br label %while_body3

while_body3:                                      ; preds = %while_body
  %3 = call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  br label %while_body4

while_body4:                                      ; preds = %while_body3
  %llvm.nvvm.read.ptx.sreg.tid.x5 = phi i32 [ %3, %while_body3 ]
  %4 = icmp eq i32 %llvm.nvvm.read.ptx.sreg.tid.x5, 0
  br i1 %4, label %if_then7, label %if_else6

if_else6:                                         ; preds = %while_body4
  br label %next8

if_then7:                                         ; preds = %while_body4
  %5 = getelementptr inbounds [0 x i32]* %_271_389, i64 0, i32 %id1
  store i32 1, i32* %5
  %6 = add nsw i32 2, %id1
  br label %next8

next8:                                            ; preds = %if_then7, %if_else6
  %id9 = phi i32 [ %id1, %if_else6 ], [ %6, %if_then7 ]
  call ptx_device void @llvm.nvvm.barrier0()
  br label %next10

next10:                                           ; preds = %next8
  br label %while_head
}

declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*)

; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: nounwind
declare ptx_device void @llvm.nvvm.barrier0() #1

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}

!0 = metadata !{i32 1, i32 2}
!1 = metadata !{void ([0 x i32]*)* @lambda_crit_268, metadata !"kernel", i64 1}

Very simple code crashes during PE (new_master)

The following code emits the world().lambdas().find(lambda) != world().lambdas().end() assertion on new_master (run with -emit-llvm) :

fn max(a: f32, b: f32) -> f32 @{
    if a > b { a } else { b }
}

extern fn traverse_accel() -> () @{
    let t0_y : f32;
    let t0 = max(t0_y, t0_y);
}

Inlining the function max, removing the @ sign on the first line, or specifying a concrete value for t0_y removes the assertion.

Warn about expressions being used as statements

It would be really, really useful if impala could warn about pure expressions being used as statement, like

4;

This just bite me hard, when I had the following in my code in many places

return;

which of course did not return...

Array equality

Array equality is authorized in Impala, but not supported by Thorin. The following code breaks :

extern "C" {
    fn puts(&[u8]) -> ();
}

fn main() -> () {
    let a = [1, 2];
    let b = [3, 4];
    if a == b {
        puts("arrays are equal");    
    }
}

It generates the following assertion :

impala: /space/perard/sources/thorin/src/thorin/util/cast.h:29: L* thorin::scast(R*) [with L = thorin::VectorTypeNode; R = thorin::MagicCast<thorin::TypeNode>]: Assertion `(!r || dynamic_cast<L*>(r)) && "cast not possible"' failed.

Should I implement it ? Or should it be forbidden in the front-end ?

thorin asserts on type mismatch for operands of phi nodes

impala: llvm/IR/Instructions.h:2104: void llvm::PHINode::addIncoming(llvm::Value*, llvm::BasicBlock*): Assertion `getType() == V->getType() && "All operands to PHI node must be the same type as the PHI node!"' failed.

test case can be found here: AnyDSL/impala@80bab6a

Error "currently only pointers to arrays supported" only when partial evaluation is not used

The following code,

extern "thorin" {
    fn nvvm(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
}

extern "C" {
    fn thorin_alloc(i32, i64) -> &i8;
}

fn get_element_addr(mut data: &[i8], idx: i32) -> &[i8] { data }

fn call_me() -> () {
    let d_in_buf = thorin_alloc(1, 10i64) as &[i8];

    with nvvm(0, (1,1,1), (1,1,1)) {
        //@get_element_addr(d_in_buf, 0);
        get_element_addr(d_in_buf, 0);
    }
}

fn main() -> i32 {
    call_me();
    0
}

produces this compilation runtime error:

E:\anydsl\thorin\src\thorin\be\llvm\runtime.cpp: 90: currently only pointers to arrays supported as kernel argument at 'broken.impala:6 col 5 - 37'; argument has different type: qs8*

This happens only if get_element_addr is called without partial evaluation.

Memory monads

I got a function with only memory monads as arguments (3 of them). Hence the assertion

thorin/be/llvm/llvm.cpp:253 assert(n == num_args || n+1 == num_args);

has been triggered in the code generation phase (in my case n = 0 and num_args = 3). How is that even possible ? Shouldn't there be only one memory monad argument ?

I am not able to reproduce the bug with a simple example. Maybe this is linked to partial evaluation. I can post the whole raytracer code that generates this bug, if you are interested.

Incorrect transformation in lift_enters pass

Compilation of the following code produces an incorrect result:

static stack_sentinel = 0x76543210u32;

struct Stack {
    data: [i32 * 64],
    id: i32
}

fn empty_stack() -> Stack {
    let mut stack: Stack;
    stack.id = 0;
    stack.data(0) = stack_sentinel as i32;
    stack
}

fn pop(mut stack: &Stack) -> i32 {
    let ret = stack.data(stack.id);
    stack.id--;
    ret
}

fn main() -> i32 {
    let mut stack = empty_stack();
    let mut node_id = 0;
    while (node_id as u32) < stack_sentinel {
        node_id = pop(&stack);
    }
    node_id
}

The llvm output is:

%0 = type { [64 x i32], i32 }

define i32 @main_impala() {
main_impala_start:
  br label %main_impala

main_impala:                                      ; preds = %main_impala_start
  %stack_114 = alloca %0
  %0 = getelementptr inbounds %0* %stack_114, i32 0, i32 0
  store [64 x i32] [i32 1985229328, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef], [64 x i32]* %0
  %1 = getelementptr inbounds %0* %stack_114, i32 0, i32 1
  br label %while_head

while_head:                                       ; preds = %while_body, %main_impala
  %node_id = phi i32 [ 0, %main_impala ], [ %5, %while_body ]
  %2 = icmp ult i32 %node_id, 1985229328
  br i1 %2, label %while_body, label %next

while_body:                                       ; preds = %while_head
  %3 = load i32* %1
  %4 = getelementptr inbounds [64 x i32]* %0, i64 0, i32 %3
  %5 = load i32* %4
  %6 = load i32* %1
  %7 = sub nsw i32 %6, 1
  store i32 %7, i32* %1
  br label %while_head

next:                                             ; preds = %while_head
  ret i32 %node_id
}

One can clearly see that the id member of the stack is not initialized when it should be in the main_impala function. The Thorin IR is correct up to the lift_enters pass.

Before lift_enters:

main_impala_102(mem mem_103, fn(mem, qs32) return_104) extern 
    (mem, frame) _106 = enter mem_103
    frame _108 = extract _106, qu32 1
    Stack{[64 x qs32], qs32}* stack_114 = slot _108
    mem _107 = extract _106, qu32 0
    mem _115 = store _107, stack_114, (insert (insert bottom Stack{[64 x qs32], qs32}, qu32 1, qs32 0), qu32 0, (definite_array qs32 1985229328, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32))
    qs32* _242 = lea stack_114, qu32 1
    [64 x qs32]* _250 = lea stack_114, qu32 0
    while_head_121 _115, qs32 0

    while_head_121(mem mem_123, qs32 node_id_125)
        pu32 _126 = cast node_id_125
        bool _127 = lt _126, pu32 1985229328
        fn(mem) _134 = @ _127 select while_body_128, next_131
        _134 mem_123

    while_body_128(mem mem_130)
        (mem, frame) _236 = enter mem_130
        mem _237 = extract _236, qu32 0
        (mem, Stack{[64 x qs32], qs32}) _239 = load _237, stack_114
        mem _240 = extract _239, qu32 0
        (mem, qs32) _244 = load _240, _242
        mem _245 = extract _244, qu32 0
        (mem, Stack{[64 x qs32], qs32}) _247 = load _245, stack_114
        mem _248 = extract _247, qu32 0
        qs32 _254 = extract _244, qu32 1
        (mem, [64 x qs32]) _252 = load _248, _250
        qs32* _256 = lea _250, _254
        mem _253 = extract _252, qu32 0
        (mem, qs32) _258 = load _253, _256
        mem _259 = extract _258, qu32 0
        (mem, qs32) _261 = load _259, _242
        qs32 _263 = extract _261, qu32 1
        qs32 _264 = sub _263, qs32 1
        mem _262 = extract _261, qu32 0
        qs32 ret_266 = extract _258, qu32 1
        mem _265 = store _262, _242, _264
        while_head_121 _265, ret_266

    next_131(mem mem_133)
        return_104 mem_133, node_id_125

After lift_enters:

main_impala_102(mem mem_103, fn(mem, qs32) return_104) extern 
    (mem, frame) _106 = enter mem_103
    frame _108 = extract _106, qu32 1
    Stack{[64 x qs32], qs32}* stack_114 = slot _108
    [64 x qs32]* _250 = lea stack_114, qu32 0
    mem _107 = extract _106, qu32 0
    mem _282 = store _107, _250, (definite_array qs32 1985229328, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32, bottom qs32)
    qs32* _242 = lea stack_114, qu32 1
    while_head_121 _282, qs32 0

    while_head_121(mem mem_123, qs32 node_id_125)
        pu32 _126 = cast node_id_125
        bool _127 = lt _126, pu32 1985229328
        fn(mem) _134 = @ _127 select while_body_128, next_131
        _134 mem_123

    while_body_128(mem mem_130)
        (mem, Stack{[64 x qs32], qs32}) _291 = load mem_130, stack_114
        mem _292 = extract _291, qu32 0
        (mem, qs32) _294 = load _292, _242
        mem _295 = extract _294, qu32 0
        (mem, Stack{[64 x qs32], qs32}) _297 = load _295, stack_114
        mem _298 = extract _297, qu32 0
        qs32 _302 = extract _294, qu32 1
        (mem, [64 x qs32]) _300 = load _298, _250
        qs32* _304 = lea _250, _302
        mem _301 = extract _300, qu32 0
        (mem, qs32) _306 = load _301, _304
        mem _307 = extract _306, qu32 0
        (mem, qs32) _309 = load _307, _242
        qs32 _311 = extract _309, qu32 1
        qs32 _312 = sub _311, qs32 1
        mem _310 = extract _309, qu32 0
        qs32 ret_314 = extract _306, qu32 1
        mem _313 = store _310, _242, _312
        while_head_121 _313, ret_314

    next_131(mem mem_133)
        return_104 mem_133, node_id_125

Thorin asserts when structs are used that is a param

anydsl/thorin/src/thorin/analyses/domtree.cpp:53: const thorin::CFNode* thorin::DomTreeBase<<anonymous> >::lca(const thorin::CFNode*, const thorin::CFNode*) const [with bool forward = true]: Assertion `i && j' failed.

Test case: AnyDSL/impala@d863c02

reserve_shared triggers assertion failure

This code fragment:

extern "thorin" {
    fn cuda(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); 
    fn reserve_shared[T](i32) -> &[3][T];
}
fn launch(size: i32) -> () {
    with cuda(0, (size, 1, 1), (size, 1, 1)) {
        let shrd = reserve_shared[f32](size);
    }   
}

fn main() -> i32 {
    @launch(128);
    @launch(256);
    0   
}

results in an assertion failure:

Expression: (!r || dynamic_cast<L*>(r)) && "cast not possible"

at:

File: W:/thesis/anydsl/thorin/src/thorin/util/cast.h, Line 29

The reserve_shared call triggers the problem. Also the problem is only triggered when more than launch calls are present.

amdgpu support

create a bottom aggregate as an aggregate of bottoms

Currently we create a single bottom value of aggregate type. An aggregate of bottoms enables nicer optimizations.

Partial evaluator bug.

Here is a simple code that generates an assertion `pred->direct_succs().size() == 1 && "critical edge"' failure. The strange thing is that if you remove the @ on the if branch, or if you leave it but replace the push_node calls by the equivalent inlined code, it works fine. It even works when you add @ signs on each push_node call and remove the @ on the branch.

struct Node {
    child_first: int,
    prim_count: int,
}

extern fn test_recursion(mut nodes: ~[Node]) -> () {
    let mut stack: [int * 32];
    let mut stack_idx = 1;

    fn push_node(node: int) -> () {
        stack(stack_idx) = node;
        stack_idx += 1;
    }

    fn pop_node() -> int {
        stack_idx -= 1;
        stack(stack_idx)
    }

    stack(0) = 0;
    while stack_idx > 0 @{
        let node_id = pop_node();
        let cur_node = &nodes(node_id);

        if cur_node.prim_count == 0 @{ // This is the @ sign that causes the assertion failure 
            // The following manually inlined code works fine :
            /*stack(stack_idx) = cur_node.child_first;
            stack_idx += 1;
            stack(stack_idx) = cur_node.child_first + 1;
            stack_idx += 1*/

            // Adding @ signs each call here and removing the one on the branch works too
            push_node(cur_node.child_first);
            push_node(cur_node.child_first + 1)
        }
    }
}

C back end: incorrect code for calls to same function in different basic blocks

We generate incorrect code for the following example:

extern "thorin" {
    fn cuda(int, (int, int, int), (int, int, int), fn() -> ()) -> ();
    fn bitcast[D, S](S) -> D;
}

fn nan() -> f32 { bitcast[f32](0x7fffffffi32) }

fn main(out: &[f32], idx: i32) -> () {
    cuda(0, (1, 1, 1), (1, 1, 1), || {
        if idx < 1 {
            out(idx) = nan();
            return()
        }
        out(idx) = nan();
    });
}

Generated CUDA code:

__global__ void lambda_crit_520(float* _523_559, int _524_560) {
    bool _562;
    _562 = _524_560 < 1;
    float* idx_564;
    idx_564 = _523_559 + _524_560;
    if (_562) goto l563; else goto l568;
    l563: ;
        float _566;
        union { float dst; int src; } u_566;
        u_566.src = 2147483647;
        _566 = u_566.dst;
        *idx_564 = _566;
        return ;
    l568: ;
        *idx_564 = _566;
        return ;
}

Problem: _566 is not set when entering l568.

Runtime incompatibilities with Windows

I tried to compile an imbatracer on Windows, and I found the following issues in the runtime :

The allocation/deallocation routines are using posix_memalign, which is not available under Windows (and not part of the standard).
The timing routines are using clock_gettime : same story here.

I would suggest using the following alternatives :

Using hand-coded aligned allocation functions for thorin_alloc/free or using C11 aligned_alloc (but that requires the file to be saved as a C file and compiled with the -std=c11 option - I would not go for that).
Using the standard c++11 library std::chrono for timing

With those minor changes the code seems to work, at least on the CPU.

Infinite loop in lower2cff

Calling impala -emit-llvm on AnyDSL/stincilla/sorting_networks/bitonic.impala hangs in lower2cff:

impala bitonic.impala -emit-llvm

Invalid casts can result in assertion failure

Invalid casts can result in an assertion failure in thorin. That's highlight frustrating, as it quickly happens and it's fairly hard to figure out which part of the source triggers the problem.
See the last commit in https://github.com/AnyDSL/imbatracer/commits/impala-cast for an example.

Partial evaluation/hashing bug

There seems to be some sort of bug in the hashing/partial evailuation:

impala: /space/perard/sources/anydsl/thorin/src/thorin/util/hash.h:113: void thorin::HashTable<Key, T, H>::iterator_base<is_const>::verify() const [with bool is_const = true; Key = thorin::Use; T = void; H = thorin::UseHash]: Assertion `table_->id_ == id_' failed.

This bug is triggered by the following piece of code, with -emit-llvm. Interestingly, if you replace let iterate = |begin, end, body| { vectorize(4, begin, end, body); } by let iterate = range; (in the function traverse_single), then the code compiles perfectly. Moving the partial evaluation symbols also triggers another assertion.

impala: /space/perard/sources/anydsl/thorin/src/thorin/transform/partial_evaluation.cpp:135: void thorin::PartialEvaluator::eval(thorin::Continuation*, thorin::Continuation*): Assertion `ncur != nullptr' failed.

This is the culprit:

// Iteration function
type IterateFn = fn (i32, i32, fn(i32) -> ()) -> ();

extern "thorin" {
    fn vectorize(i32, i32, i32, fn(i32) -> ()) -> ();
    fn bitcast[D, S](S) -> D;
    fn select[A, B](A, B, B) -> B;
}

static flt_max = 1.0e+37f;

fn range(a: i32, b: i32, body: fn(i32) -> ()) -> () {
    if a < b {
        body(a);
        range(a + 1, b, body, return)
    }
}

fn unroll(a: i32, b: i32, body: fn(i32) -> ()) -> () @{
    if a < b @{
        body(a);
        unroll(a + 1, b, body, return)
    }
}

// Vector of size 3
struct Vec3 {
    x: f32, y: f32, z: f32
}

// Node for a 4-ary BVH
struct Node {
    // Minimum bounding box coord. for 4 children
    min_x: [f32 * 4],
    min_y: [f32 * 4],
    min_z: [f32 * 4],

    // Maximum bounding box coord. for 4 children
    max_x: [f32 * 4],
    max_y: [f32 * 4],
    max_z: [f32 * 4],

    // Child index (>0: inner node, 0: disabled, <0: leaf)
    child: [i32 * 4]
}

// Flattened triangle
struct FlatTri {
    // Packed x coords. for v0, e1, e2 and normal
    v0_x: [f32 * 4],
    e1_x: [f32 * 4],
    e2_x: [f32 * 4],
     n_x: [f32 * 4],

    // Packed y coords. for v0, e1, e2 and normal
    v0_y: [f32 * 4],
    e1_y: [f32 * 4],
    e2_y: [f32 * 4],
     n_y: [f32 * 4],

    // Packed z coords. for v0, e1, e2 and normal
    v0_z: [f32 * 4],
    e1_z: [f32 * 4],
    e2_z: [f32 * 4],
     n_z: [f32 * 4],

    // Index of the triangle (<0: sentinel, >=0: valid triangle index)
    id:   [i32 * 4]
}

// Manual stack for the traversal
struct Stack {
    swap: fn(i32) -> (),
    push: fn(i32) -> (),
    pop:  fn() -> i32,
    is_empty: fn() -> bool
}

// Structure that holds the hit information
struct Hit {
    t:  f32,
    u:  f32,
    v:  f32,
    id: i32
}

// Structure that holds a ray
struct Ray {
    org:   Vec3,
    dir:   Vec3,
    oidir: Vec3,
    idir:  Vec3,
    tmin:  f32,
    tmax:  f32
}

fn vec3(x: f32, y: f32, z: f32) -> Vec3 {
    Vec3 { x: x, y: y, z: z }
}

fn vec3_sub(a: Vec3, b: Vec3) -> Vec3 {
    Vec3 {
        x: a.x - b.x,
        y: a.y - b.y,
        z: a.z - b.z
    }
}

fn vec3_mul(a: Vec3, b: Vec3) -> Vec3 {
    Vec3 {
        x: a.x * b.x,
        y: a.y * b.y,
        z: a.z * b.z
    }
}

fn vec3_cross(a: Vec3, b: Vec3) -> Vec3 {
    Vec3 {
        x: a.y * b.z - a.z * b.y,
        y: a.z * b.x - a.x * b.z,
        z: a.x * b.y - a.y * b.x
    }
}

fn vec3_dot(a: Vec3, b: Vec3) -> f32 {
    a.x * b.x + a.y * b.y + a.z * b.z
}

fn ray(org: Vec3, dir: Vec3, tmin: f32, tmax: f32) -> Ray {
    let idir = vec3(1.0f / dir.x, 1.0f / dir.y, 1.0f / dir.z);
    let oidir = vec3_mul(org, idir);
    Ray {
        org: org,
        dir: dir,
        idir: idir,
        oidir: oidir,
        tmin: tmin,
        tmax: tmax
    }
}

fn alloc_stack() -> Stack {
    let sentinel = 0x7FFFFFFF;
    let mut top : i32 = sentinel;
    let mut items : [i32 * 64];
    let mut ptr = 0;
    Stack {
        swap: |node| { items(ptr++) = node; },
        push: |node| { items(ptr++) = top; top = node; },
        pop:  || { let old = top; top = items(--ptr); old },
        is_empty: || { top == sentinel }
    }
}

fn is_leaf(node: i32) -> bool { node < 0 }

fn iminf(a: f32, b: f32) -> f32 {
    // Use integer comparison
    /*let (a_, b_) = (bitcast[i32](a), bitcast[i32](b));
    bitcast[f32](select(a_ < b_, a_, b_))*/
    select(a < b, a, b)
}
fn imaxf(a: f32, b: f32) -> f32 {
    // Use integer comparison
    /*let (a_, b_) = (bitcast[i32](a), bitcast[i32](b));
    bitcast[f32](select(a_ > b_, a_, b_))*/
    select(a > b, a, b)
}
fn iminminf(a: f32, b: f32, c: f32) -> f32 { iminf(iminf(a, b), c) }
fn iminmaxf(a: f32, b: f32, c: f32) -> f32 { imaxf(iminf(a, b), c) }
fn imaxminf(a: f32, b: f32, c: f32) -> f32 { iminf(imaxf(a, b), c) }
fn imaxmaxf(a: f32, b: f32, c: f32) -> f32 { imaxf(imaxf(a, b), c) }

fn fabsf(x: f32) -> f32 { if x < 0.0f { -x } else { x } }

fn prodsign(x: f32, y: f32) -> f32 {
    bitcast[f32](bitcast[i32](x) ^ (bitcast[i32](y) & bitcast[i32](0x80000000u)))
}

fn intersect_ray_box(bmin: Vec3, bmax: Vec3, oidir: Vec3, idir: Vec3, tmin: f32, tmax: f32, intr: fn(f32, f32) -> ()) -> () @{
    fn span_begin(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32) -> f32 {
        imaxmaxf(iminf(a, b), iminf(c, d), iminmaxf(e, f, g))
    }

    fn span_end(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32) -> f32 {
        iminminf(imaxf(a, b), imaxf(c, d), imaxminf(e, f, g))
    }

    let t0_x = bmin.x * idir.x - oidir.x;
    let t1_x = bmax.x * idir.x - oidir.x;
    let t0_y = bmin.y * idir.y - oidir.y;
    let t1_y = bmax.y * idir.y - oidir.y;
    let t0_z = bmin.z * idir.z - oidir.z;
    let t1_z = bmax.z * idir.z - oidir.z;

    let t0 = span_begin(t0_x, t1_x, t0_y, t1_y, t0_z, t1_z, tmin);
    let t1 = span_end  (t0_x, t1_x, t0_y, t1_y, t0_z, t1_z, tmax);

    if (t0 <= t1) { intr(t0, t1) }
}

fn intersect_ray_tri(v0: Vec3, e1: Vec3, e2: Vec3, n: Vec3, org: Vec3, dir: Vec3, tmin: f32, tmax: f32, intr: fn(f32, f32, f32) -> ()) -> () @{
    let c = vec3_sub(v0, org);
    let r = vec3_cross(dir, c);
    let det = vec3_dot(n, dir);
    let abs_det = fabsf(det);

    let u = prodsign(vec3_dot(r, e2), det);
    let v = prodsign(vec3_dot(r, e1), det);
    let w = abs_det - u - v;

    if u >= 0.0f & v >= 0.0f & w >= 0.0f {
        let t = prodsign(vec3_dot(n, c), det);
        if t >= abs_det * tmin & abs_det * tmax >= t {
            let inv_det = 1.0f / abs_det;
            intr(t * inv_det, u * inv_det, v * inv_det);
        }
    }
}

fn traverse(iterate: IterateFn, stack: Stack, nodes: &[Node * 1], tris: &[FlatTri * 1], ray: Ray) -> Hit @{
    let mut t_hit = ray.tmax;
    let mut u_hit = 0.0f;
    let mut v_hit = 0.0f;
    let mut id_hit = -1;

    while !stack.is_empty() {
        let mut node_id = stack.pop();
        let node = nodes(node_id);

        // Intersect each child
        let mut found = [false, false, false, false];
        let mut entry : [f32 * 4]; 
        for i in iterate(0, 4) @{
            let min = vec3(node.min_x(i), node.min_y(i), node.min_z(i));
            let max = vec3(node.max_x(i), node.max_y(i), node.max_z(i));
            with t0, t1 in intersect_ray_box(min, max, ray.oidir, ray.idir, ray.tmin, t_hit) @{
                found(i) = node.child(i) != 0;
                entry(i) = t0;
            }
        }

        // "Sort" them
        let mut tmin = flt_max;
        for i in @unroll(0, 4) {
            if found(i) {
                if tmin > entry(i) @{
                    stack.push(node.child(i));
                    tmin = entry(i)
                } else @{
                    stack.swap(node.child(i))
                }
            }
        }

        while is_leaf(node_id) @{
            let mut tri_id = !node_id;
            while true {
                let cur = tris(tri_id);

                for i in iterate(0, 4) @{
                    /*let v0 = vec3(cur.v0_x(i), cur.v0_y(i), cur.v0_z(i));
                    let e1 = vec3(cur.e1_x(i), cur.e1_y(i), cur.e1_z(i));
                    let e2 = vec3(cur.e2_x(i), cur.e2_y(i), cur.e2_z(i));
                    let n  = vec3(cur. n_x(i), cur. n_y(i), cur. n_z(i));

                    with t, u, v in intersect_ray_tri(v0, e1, e2, n, ray.org, ray.dir, ray.tmin, t_hit) @{
                        t_hit = t;
                        u_hit = u;
                        v_hit = v;
                        id_hit = cur.id(i);
                    }*/
                }

                if cur.id(3) < 0 { break() }
                tri_id++;
            }

            node_id = stack.pop()
        }
    }

    Hit { t: t_hit, u: u_hit, v: v_hit, id: id_hit }
}

fn traverse_single(nodes: &[Node * 1], tris: &[FlatTri * 1], ray: Ray) -> Hit @{
    let iterate = |begin, end, body| { vectorize(4, begin, end, body) };
    let stack = alloc_stack();
    stack.push(0);
    traverse(iterate, stack, nodes, tris, ray)
}

extern fn from_c_traverse_single(nodes: &[Node * 1], tris: &[FlatTri * 1], org: &Vec3, dir: &Vec3, tmin: f32, tmax: f32, hit: &Hit) -> () @{
    *hit = traverse_single(nodes, tris, ray(*org, *dir, tmin, tmax));
}

'May be undefined' message

I got the 'May be undefined' message after compiling the following piece of code:

fn iterate_children(mut node_id: &i32) -> () {
    for call_me() {
        if 42 < *node_id {
            // Nothing to see here
        }
    }
    *node_id = 42;
}

fn call_me(body: fn() -> ()) -> () {
    body();
}

extern fn traverse_accel() -> () {
    let mut node_id = 42;
    while node_id < 42 {
        @iterate_children(&node_id);
    }
}

Runtime error passing mutable variable to a kernel function

The following code:

extern "thorin" {
    fn nvvm(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
}

fn pass_dummy(mut d: i32) -> () { }

fn main() -> i32 {
    let mut d = 1;
    with nvvm(0, (1, 1, 1), (1, 1, 1)) {
        pass_dummy(d);
    }
    0
}

causes the runtime error:

Found return instr that returns non-void in Function of void return type!
ret void
voidBroken module found, compilation aborted!

This only happens if d is mut. If it is immutable it compiles fine.

Sometimes, the same thing causes another runtime error:

E:W:\thesis\anydsl\thorin\src\thorin\be\llvm\runtime.cpp: 91: currently only pointers to arrays supported as kernel argument at 'W:/thesis/anydsl/libpp/src/cuda/device/dispatch/dispatch_scan.impala:58 col 13 - 45'; argument has different type:

However, I couldn't reproduce this same issue in a stand alone example outside my code. If it is of any help, I can try to find out when either of the errors happen exactly.

use cmake instead of plain makefiles

thorin/runtime still uses old-school makefiles. we should switch to cmake.

Vector comparisons

Vector comparisons should be forbidden. Or return simd[bool * n].

merge master and new_master

Assertion `phis_.find(param) != phis_.end()' failed

The aforementioned assertion can be triggered by having lots of $ in the code. See the branch impala-3 of imbatracer (https://github.com/AnyDSL/imbatracer/commits/impala-fail3). The last commit mechanically replaces all "@" by "$", this introduces the breakage.

Memory usage explodes

Some seemingly harmless changes can make the impala memory usage explode. See the branch https://github.com/AnyDSL/imbatracer/commits/memleak of imbatracer. The last commit adds a simple (10-line) file that's not even used, which triggers the problem.
Another example triggering the problem is adding an external function call as in AnyDSL/imbatracer@b2eaa4a.

C backend is amenable for the swap problem

When translating the following program

fn main(c: bool, mut a: int, mut b: int) -> int {
    while c {
        let tmp = a;
        a = b;
        b = tmp;
    }

    a + b
}

to C:

int main_impala(bool c_29, int a_30, int b_31) {
    int tmp_55;
    int b_57;
    tmp_55 = a_30;
    b_57 = b_31;
    goto l44;
    l44: ;
        if (c_29) {
            goto l47;
        } else {
            goto l50;
        }
    l47: ;
        tmp_55 = b_57;
        b_57 = tmp_55;
        goto l44;
    l50: ;
        int _64;
        _64 = tmp_55 + b_57;
        return _64;
}

The code is incorrect. The swap is lost.

SegFault in -emit-thorin

Calling impala -emit-thorin on AnyDSL/stincilla/sorting_networks/bitonic.impala segfaults:

impala bitonic.impala -emit-thorin
module 'bitonic'

thorin/analyses/domtree.cpp:53: const thorin::CFNode* thorin::DomTreeBase<<anonymous> >::lca(const thorin::CFNode*, const thorin::CFNode*) const [with bool forward = true]: Assertion `i && j' failed.
Aborted (core dumped)

No README

Thorin segfault on unreachable code when emitting LLVM

The following code segfaults when generating LLVM code due to unreachable code (llvm.cpp:321):

extern "C" {
  fn sqrtf(f32) -> f32;
}
extern
fn thorin_test() -> () {
    let check: [float * 1]; 
    let X = ~[12:float];
    let sval = sqrtf(X(0));
    if check(0) > 0f { }   
}

`type->is_closed()' assertion for polymorphic functions

impala: thorin/src/thorin/world.cpp:814: const thorin::TypeNode* thorin::World::unify_base(const thorin::TypeNode*): Assertion `type->is_closed()' failed.

A test case can be found here: AnyDSL/impala@cfdee49

sizeof

Es wäre echt praktisch, wenn wir einen sizeof-Operator bekommen könnten. Momentan berechnen wir die Größen diverser Structs (inklusive Padding) von Hand, das ist durchaus ziemlich fehleranfällig.

Parameter elimination broken

Parameter elimination assumes that every use of a lambda is itself a lambda. While loops generate select nodes which take lambda as parameters. Hence, parameter elimination breaks on these nodes. Here is an example taken from the traversal code:

while_body_8328(mem mem_8329)
    pop_node_4421 mem_8329, stack_8289, while_body_8332

while_body_8332(mem mem_8333, qs32 id_8334)
    is_empty_4402 mem_8333, stack_8289, while_head_8339

while_head_8339(mem mem_8340, bool is_empty_8341)
    fn(mem) _8342 = @ is_empty_8341 select next_8326, while_body_8328
    _8342 mem_8340

The lambda while_body_8328 is used as an operand for the select node. Then the following part of parameter elimination breaks (thorin/transform/cleanup_world.cpp:61):

for (auto use : olambda->uses()) {
    auto ulambda = use->as_lambda();
    assert(use.index() == 0 && "deleted param of lambda used as argument");
    ulambda->jump(nlambda, ulambda->args().cut(proxy_idx));
}

If I replace it by this it seems to work:

for (auto use : olambda->uses()) {
    if (auto ulambda = use->isa_lambda()) {
        assert(use.index() == 0 && "deleted param of lambda used as argument");
        ulambda->jump(nlambda, ulambda->args().cut(proxy_idx));
    }
}

Shall I push this patch ?

Missing functionality for sizeof

The sizeof PrimOp has no argument, one type argument (the type for which we want to get the size), and one return type (i32). This is annoying, because the import transformation requires the vrebuild function to be of the form:

const Def* SizeOf ::vrebuild(World& to, Defs, const Type* t)

Here, t is the return type (i.e. i32), and we cannot rebuild size_of->of() by calling vrebuild on its type, because it is private.

The correct fix for this is to add a Type2Type argument to vrebuild for PrimOps as well. This is a bit of a change though, and I would like to get your approval first.

Note: This fixes the SEGFAULT issue in Stincilla for the infer branch. The last remaining issue will be the reserve_shared test.

build thorin as shared lib

it should be possible to build thorin as shared library.

thorin::Node_TypeParam not handled in CodeGen::convert(Type type)

Thorin segfaults since thorin::Node_TypeParam is nod handled in CodeGen::convert(Type type), see commit AnyDSL/impala@93237d5

remove SPIR support

Now that we're on LLVM 3.8 we should remove all traces of SPIR inside of thorin and impala. It's just a chunk of code which doesn't work anyway.

PE skips code on new_master

File: test/stencil_int.impala in stincilla
Using new_master emits 3 allocas (3x warning slow), no allocas (0x warning slow) on master.
cd build && make stencil_int

decouple SSA-construction from Continuation

This causes some ugliness in the code which I want to get rid off - in particular before we try to merge some of the CoC stuff.

Problem during scope analysis

Scope analysis gave me the following message: Assertion `n == 0' failed. To trigger the bug, compile traversal/scope_bug with BACKEND=cpu and VECTORIZE=1. The code has been stripped to the minimum in this configuration.

PE does not terminate for dynamic higher-order functions

A test case for this can be found in AnyDSL/impala@e2c17b8.

Seg fault for simple example

This simple example creates a seg fault when compiling.

extern fn traverse_accel() -> () {   
    let mut k = 0;
    if k == 0 {return()}
    k=1;
}

handle any left-over pe_info in partial_evaluation.cpp

we should have a post-processing phase after partial evaluation which removes any pe_infos with an appropriate message. This should make 70144b1 superfluous.

PE skips code when logical operators are used

impala -emit-llvm logic_operator.impala
W:anydsl/thorin/src/thorin/be/llvm/llvm.cpp: 742: slow: alloca and loads/stores needed for aggregate '_17' at 'logic_operator.impala:2 col 5 - 8'
W:anydsl/thorin/src/thorin/be/llvm/llvm.cpp: 742: slow: alloca and loads/stores needed for aggregate '_20' at 'logic_operator.impala:2 col 16 - 19'

if @fun() { } works, but if (@fun() || @fun()) does not partially evaluate all the code.
Test case: AnyDSL/impala@fdb984f

Build is possible, but overly difficult, on Windows

I think this qualifies as a legitimate issue, but feel free to close if you don't wish to support Windows as a target. I managed to build Thorin on Windows 7, 64-bit, using VS Community 2015 (version 14 compilers, 64-bit). I don't know how to test it yet, and it may not work as-intended, but it does compile. However, it required multiple installations of unrelated software packages, mostly available through Chocolatey/NuGet/OneGet (on Windows 7, I use Chocolatey to manage packages for C/C++-built tools), but not universally (xz needed extra work). I'm not sure if half actually needs to be built, since it's header-only, so I am unsure why it includes a (rather out-of-date) Visual Studio solution. Some extra work was needed to build half because I didn't have doxygen, and without it the whole build mysteriously failed, giving the error "cmd.exe" exited with code 3. After updating the version-11 VC++ project files to version 14, hand-editing the one relevant vcxproj file to omit the custom build step that invoked doxygen, and then re-opening in VS and building, half compiled (though I suspect I didn't need to do any of this).

Then came the significantly more-frustrating task of navigating cmake for thorin. I was able to point it to the include directory of half without issue, but setting LLVM_DIR was completely ignored, and this consistently repeated no matter how explicitly I specified that variable. The solution was to put the bin folder of the already-built LLVM installation on my PATH env var (presumably this allowed llvm-config to be run), and give the llvm_install directory as LLVM_DIR (though this may have been overwritten anyway). I had built LLVM using the AnyDSL/anydsl scripts, which needed the manual download of xz to extract LLVM and the package installation of svn to acquire half (as well as some fiddling to get svn to actually download any files), and then some changes to how LLVM built so it used the correct settings for msbuild to successfully compile.

The closest thing to an actual bug here, as opposed to a mind-melting layer-cake of frustrations that are, still, solvable given enough patience, is that the advice cmake gives to set LLVM_DIR is incorrect. So is the description of LLVM_DIR as the directory that contains the cmake files for LLVM (setting it to that directory doesn't seem to do anything differently). What would be nice is if some of this build was done in a more unified way, like forking half so it's on GitHub instead of hoping that SourceForge won't be as unreliable today as it has been recently, and including half, Thorin, and maybe Impala as git submodules of anydsl.

Other than the work needed to get the code operational, the science that is the backbone of Thorin looks really excellent. I've linked the Thorin paper to a friend who is working independently on a graph-related programming language/platform, and he's impressed, as am I. I found this project through the Bangra IR project, which doesn't seem to use your code but is based off the paper, and apparently Leonard Ritter found very significant reductions in code size/effort when switching Bangra's closures to the method used in Thorin, as your team had found in the paper. I'm hoping to at some point implement a hybrid imperative/functional/array language using Thorin or, once it's more-ready, Bangra, but Impala looks like an excellent idea for easing DSL implementations.

So, sorry for venting a bit, but I hope there's a way to ease the Windows build process for other potential users. Thanks.

Assertion `!pred->arg(index) && "already set"'

The following code triggers the assertion (using impala and -emit-thorin):

struct Blob {
    done: fn () -> bool
}

fn test(next_blob: Blob) -> () {
    let next = |exit| {
        next_blob
    };

    let mut blob : Blob;
    while !blob.done() {
        blob = next(continue);
    }
}