paddlepaddle / cinn Goto Github PK
View Code? Open in Web Editor NEW — Compiler Infrastructure for Neural Networks
Home Page: https://paddlepaddle.github.io/CINN/
License: Apache License 2.0
Compiler Infrastructure for Neural Networks
Home Page: https://paddlepaddle.github.io/CINN/
License: Apache License 2.0
生成的代码有2个sync
?
Originally posted by @Xreki in #785 (comment)
我在开发argmin算子时。参考了min等同类算子的实现方法后,发现这些全部都涉及到了更底层的开发,
需要改动的地方非常多,似乎不是本次任务所希望的实现方法。
开发中遇到的问题核心就是不知道在Compute中如何正确的使用与具体尺寸相关的循环,
若直接使用普通的for循环,对shape
取值得到的是Expr(n)
而使用for(Expr i = Expr(0); i < shape[axis]; i++)
不能正确编译程序,
于是考虑使用 ir::For,但是对其的原理不理解,不知道其 Expr类型的返回值的含义,编写如下代码仍然无法获得理想的结果。
我在对已经实现的所有compute中均未发现类似的可以参考的用法,多数都采用Reduce方法,但是Reduce只支持max/min/sum/mul等,如果要扩展需要修改较多底层的实现,似乎不是本次任务所希望的实现方法,
因此希望可以得到一个在compute中使用与具体尺寸相关的循环的例子参考,或者有一些其他方向上的指导。
// NOTE(review): code excerpt from an issue report. It tries to emulate an
// argmax/argmin-style reduction inside a Compute() lambda by hand-building
// an ir::For node. `temp_tensor` is used as scratch storage: slot 0 holds
// the running argument index and slot k0 holds the compared value.
// The initializer is a FLT_MAX-like sentinel (3.402823e+38f) — presumably
// intended for a min-reduction; for the GT-based update below a -FLT_MAX
// sentinel would be expected. TODO confirm intent.
auto temp_tensor = Compute(
{shape[real_axis] + 1},
[=](const std::vector<Expr> &indices) -> Expr { return lang::Identity(Expr(3.402823e+38f)); },
output_name + "_temp");
auto compute = [=](const std::vector<Expr> &indices) -> Expr {
// Copy the output indices; re-insert the reduced axis when it was squeezed
// out by keep_dims == false, so indexing into the input tensor lines up.
std::vector<Expr> cur_indices(indices);
if (!keep_dims) {
cur_indices.insert(cur_indices.begin() + real_axis, Expr(0));
}
CHECK_EQ(cur_indices.size(), ndim);
// Loop variable that is meant to iterate over the reduced axis.
Var loop_var("k0", Int(32));
cur_indices[real_axis] = Expr(loop_var);
auto value = in_tensor(cur_indices);
auto last_value = temp_tensor(Expr(loop_var) - 1);
// update == true when the current element beats the previous best.
auto update = ir::GT::Make(value, last_value);
auto c_v = ir::Select::Make(update, value, last_value);
auto c_i = ir::Select::Make(update, ir::Cast::Make(Float(32), Expr(loop_var)), temp_tensor({Expr(0)}));
auto body1 = ir::Store::Make(temp_tensor, c_v, {Expr(loop_var)});
auto body2 = ir::Store::Make(temp_tensor, c_i, {Expr(0)});
auto body = ir::Block::Make({body1, body2});
// BUG (as reported in the issue): `forloop` is constructed but never used —
// it is neither returned nor attached to the expression below, so the loop
// body is dead code and the Compute result never iterates the axis.
auto forloop = ir::For::Make(
loop_var, common::make_const(1), shape[real_axis], ir::ForType::Serial, ir::DeviceAPI::Host, body);
return ir::Cast::Make(Int(32), temp_tensor({Expr(0)}));
};
// Generated CUDA kernel for a conv2d (NCHW layout). It runs two guarded
// passes over the same output region: pass 1 zero-initializes the output,
// pass 2 accumulates the convolution sum. Grid/thread guards limit work to
// blockIdx.x < 2 and threadIdx.x < 480.
__global__
void fn_conv2d_1_kernel(const float* __restrict__ var_1, const float* __restrict__ conv2d_0__w_0, float* __restrict__ Conv2d_nchw_out)
{
// Alias of the output buffer used only by the zero-init pass.
float* Conv2d_nchw_out__reduce_init = Conv2d_nchw_out;
// Pass 1: zero-initialize every output element this thread owns.
if ((blockIdx.x < 2)) {
{
if ((threadIdx.x < 480)) {
{
for (int32_t j_inner = 0; j_inner < 2; j_inner += 1) {
for (int32_t k = 0; k < 7; k += 1) {
for (int32_t a = 0; a < 7; a += 1) {
Conv2d_nchw_out__reduce_init[((47040 * blockIdx.x) + ((49 * j_inner) + ((7 * k) + ((98 * threadIdx.x) + a))))] = 0;
};
};
};
}
};
}
};
// Pass 2: accumulate over the reduction axes (fc = input channel, fy =
// filter row; the select expressions guard padding/out-of-range reads).
if ((blockIdx.x < 2)) {
{
if ((threadIdx.x < 480)) {
{
for (int32_t j_inner = 0; j_inner < 2; j_inner += 1) {
for (int32_t k = 0; k < 7; k += 1) {
for (int32_t a = 0; a < 7; a += 1) {
for (int32_t fc = 0; fc < 160; fc += 1) {
for (int32_t fy = 0; fy < 1; fy += 1) {
Conv2d_nchw_out[((47040 * blockIdx.x) + ((49 * j_inner) + ((7 * k) + ((98 * threadIdx.x) + a))))] = (Conv2d_nchw_out[((47040 * blockIdx.x) + ((49 * j_inner) + ((7 * k) + ((98 * threadIdx.x) + a))))] + (((((((((k * 1) + fy) >= 0) && ((((k * 1) + fy) - 0) < 7)) && (((a * 1) + 0) >= 0)) && ((((a * 1) + 0) - 0) < 7))) ? var_1[((49 * ((((2 * threadIdx.x) + j_inner) / 960) * 160)) + ((7840 * blockIdx.x) + ((49 * fc) + ((7 * fy) + ((7 * k) + a)))))] : 0) * ((((0 == 0) && ((fy % 1) == 0))) ? conv2d_0__w_0[((fy / 1) + ((160 * j_inner) + ((320 * threadIdx.x) + fc)))] : 0)));
};
};
};
};
};
}
};
}
};
}
poly_for (po1, 0, (po1 <= 9), 1)
{
{
if (((((po0 >= 0) and (po0 <= 9)) and (po1 >= 0)) and (po1 <= 9))) {
cache[po0, po1] = A[po0, po1]
}
C[po0, po1] = (cache[po0, po1] + B[po0, po1])
}
}
The if(...)
in the code above is redundant, better to simplify it in the poly schedule step.
57a24429f3a2f06414131b676c5f0d32a33619d1
避免重复工作,这里登记任务情况,如有进行开发中的同学,请回复你要开发的任务,这里定期更新情况。
参考文档 mentor by @thisjiang
序号 | 任务名称 | 难度 | 状态 | 作者 | PR |
---|---|---|---|---|---|
1 | reciprocal(取倒数) | 简单 | 已完成✅ | enkilee | #1069 |
2 | cbrt(立方根) | 简单 | 已完成 ✅ | huangjiyi | #1073 |
3 | logical_right_shift(逻辑右移) | 简单 | 已完成✅ | ccsuzzh | #1083 |
4 | clz | 简单 | 已完成✅ | zzk0 | #1059 |
5 | popc | 简单 | 已完成✅ | FisherWY | #1064 |
6 | atan2 | 中等 | 已完成✅ | zrr1999 | #1058 |
7 | cholesky | 困难 | 已完成✅ | @FisherWY | #1133 |
参考文档 mentor by @SunNy820828449
序号 | 任务名称 | 难度 | 状态 | 作者 | PR |
---|---|---|---|---|---|
1 | CSE(公共子表达式消除) | 难 | 已完成 ✅(2023/02/16) | @zrr1999 | #1116 #1166 |
序号 | 任务名称 | 难度 | 状态 | 作者 | PR |
---|---|---|---|---|---|
1 | Unannotate | 简单 | 已完成(2023/01/03)✅ | @AndPuQing | #1126 |
2 | GetChildBlocks | 简单 | 已完成(2023/02/02)✅ | @ccsuzzh | #1157 |
3 | SampleCategorical | 简单 | 已完成(2023/03/31)✅ | @enkilee | #1169 |
4 | SamplePerfectTile | 中等 | 已完成(2023/01/17)✅ | @AndPuQing | #1142 |
提示如下
CMake Warning at cmake/external/pybind11.cmake:19 (find_package):
By not providing "FindPython.cmake" in CMAKE_MODULE_PATH this project has
asked CMake to find a package configuration file provided by "Python", but
CMake did not find one.
Could not find a package configuration file provided by "Python" with any
of the following names:
PythonConfig.cmake
python-config.cmake
Add the installation prefix of "Python" to CMAKE_PREFIX_PATH or set
"Python_DIR" to a directory containing one of the above files. If "Python"
provides a separate development package or SDK, be sure it has been
installed.
Call Stack (most recent call first):
CMakeLists.txt:52 (include)
-- pybind path: /jiangjiajun/CINN/build/thirds/pybind/src/extern_pybind/include
-- third: /jiangjiajun/CINN/build/thirds
-- set LLVM_DIR: /lib/cmake/llvm
-- set MLIR_DIR: /lib/cmake/mlir
CMake Error at cmake/llvm.cmake:8 (find_package):
Could not find a package configuration file provided by "MLIR" with any of
the following names:
MLIRConfig.cmake
mlir-config.cmake
Add the installation prefix of "MLIR" to CMAKE_PREFIX_PATH or set
"MLIR_DIR" to a directory containing one of the above files. If "MLIR"
provides a separate development package or SDK, be sure it has been
installed.
Call Stack (most recent call first):
CMakeLists.txt:57 (include)
make all the temporary variables with a temp buffer
The graph from CreateCompGraph
按照 README 中 Installation 所说的,使用了 Docker 镜像 paddlepaddle/paddle:latest-dev-cuda11.2-cudnn8-gcc82,执行 cmake 的过程中发生错误。
执行的命令是在 CINN/build
目录下 cmake .. -DPY_VERSION=3.9
(我使用的 Python 虚拟环境对应的 python 版本是 3.9)
按照报错的 CMakeLists.txt 查看,link 的是 python 动态库。
find_package(PythonInterp ${PY_VERSION} REQUIRED)
find_package(PythonLibs ${PY_VERSION} REQUIRED)
cc_test(test_cost_model SRCS cost_model_test.cc cost_model.cc DEPS pybind gtest_main)
target_link_libraries(test_cost_model ${PYTHON_LIBRARIES})
这个 so 是能被找到的。但是仍然出现上述的报错。
请问这该如何解决呢?
详细的 CMake 输出文件如下:
Enhance the intrinsic operations used in CodeGens.
Add signature and type verification.
The remaining operations to refactor:
After the refactor, the intrinsic.h
file will be removed.
The benchmark on op
Devices:
Compare with TVM0.7
We might need to consider building a benchmark workflow in CI
An extra null pointer check is not needed in functions like the following.
我在开发gather算子时遇到了一个bug,经过排查发现很有可能是因为SetRandData 的功能跟我认为的功能不太一样。
源码中将随机到的整数转为浮点数再对浮点张量进行赋值
std::vector<float> random_data(num_ele);
for (size_t i = 0; i < num_ele; i++) {
random_data[i] = static_cast<float>(dist(engine)); // All random data
}
在其他类似工具生成随机数的时候通常是用Randint,randn,rand等函数名确定实际生成的随机数分布,数据类型通过额外指定,所以SetRandData函数是否也应该采用类似的写法,比如SetRandint,SetRandn这种,T指定了针对的张量数据类型。
我感觉这样至少有两点好处
相关 pr:#897
Add something like
ConditionedSimplify
to simplify Load/Store indices if ((blockIdx.x < 40)) {
{
if ((threadIdx.x < 40)) {
{
if (((((blockIdx.x >= 0) && (blockIdx.x <= 39)) && (threadIdx.x >= 0)) && (threadIdx.x <= 39))) {
C[0] = A[((40 * blockIdx.x) + threadIdx.x)];
};
C_cache_write_out_3[((40 * blockIdx.x) + threadIdx.x)] = C[0];
}
};
}
};
#417
computeAt lowered ir is different with computeAt2 lowered ir
#include <cinn_runtime.h>
#include <stdio.h>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Predefined utilities in CINN BEGIN(
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#include <immintrin.h>
#include <stdint.h>
#include <vector>
#ifndef _CINN_X86_BUILTIN_SOURCE_
#define _CINN_X86_BUILTIN_SOURCE_
//! Fixed-size vector stored on the stack; only for use in generated .cc files.
//! \tparam T   element type
//! \tparam Num number of lanes
template <typename T, size_t Num>
struct StackVec {
  typedef T value_type;
  typedef StackVec<T, Num> self_type;

  //! Copy the element payload from \p src (self-assignment safe).
  self_type& operator=(const StackVec& src) {
    if (this != &src) {
      memcpy(data_, src.data_, num_bytes());
    }
    return *this;
  }

  //! Zero-initialize all lanes.
  StackVec() { memset(data_, 0, num_bytes()); }

  //! Wrap an external read-only buffer without copying it.
  //! FIX: external_data_ is const-qualified below so this ctor compiles.
  explicit StackVec(const T* externl) : external_data_(externl) {}

  //! Create a vector with every lane set to \p v.
  static self_type Broadcast(const value_type& v) {
    self_type res;
    for (size_t i = 0; i < Num; i++) res.data_[i] = v;
    return res;
  }

  //! Create the vector {base, base+stride, base+2*stride, ...}.
  static self_type Ramp(const value_type& base, const value_type& stride) {
    self_type res;
    for (size_t i = 0; i < Num; i++) {
      res.data_[i] = base + stride * i;
    }
    return res;  // FIX: missing return — returned garbage (UB) before.
  }

  //! Load Num contiguous elements starting at base[offset].
  static self_type Load(const void* base, int32_t offset) {
    self_type res;
    memcpy(&res.data_[0], (const value_type*)base + offset, num_bytes());
    return res;  // FIX: missing return (UB) before.
  }

  //! Gather load: res[i] = base[offset[i]].
  static self_type Load(const void* base, const StackVec<int32_t, Num>& offset) {
    self_type res;
    for (size_t i = 0; i < Num; i++) {
      res.data_[i] = ((const value_type*)base)[offset[i]];
    }
    return res;  // FIX: missing return (UB) before.
  }

  //! Store Num contiguous elements to base[offset].
  void Store(void* base, int32_t offset) const {
    // FIX: was mempcpy — a non-standard GNU extension; memcpy is equivalent
    // here since the return value was discarded.
    memcpy((value_type*)base + offset, &data_[0], num_bytes());
  }

  inline value_type& operator[](size_t i) { return data_[i]; }
  inline value_type operator[](size_t i) const { return data_[i]; }

  // element-wise binary operator between two vectors
  // @{
#define __(op__)                                                            \
  friend self_type operator op__(const self_type& a, const self_type& b) { \
    self_type res;                                                          \
    for (size_t i = 0; i < Num; i++) {                                      \
      res.data_[i] = a[i] op__ b[i];                                        \
    }                                                                       \
    return res;                                                             \
  }
  __(+)
  __(-)
  __(*)
  __(/)
  __(%)
  // @}
#undef __

  // binary operator between a vector and a scalar (scalar on the rhs)
  // @{
#define __(op__)                                                             \
  friend self_type operator op__(const self_type& a, const value_type& b) { \
    self_type res;                                                           \
    for (size_t i = 0; i < Num; i++) {                                       \
      res.data_[i] = a[i] op__ b;                                            \
    }                                                                        \
    return res;                                                              \
  }
  __(+)
  __(-)
  __(*)
  __(/)
  __(%)
#undef __
  // @}

  static constexpr size_t num_bytes() { return sizeof(data_); }

 private:
  T data_[Num];
  // FIX: was `T*`, which cannot be initialized from the ctor's `const T*`.
  const T* external_data_{nullptr};
};
/**
 * A fixed-size vector view over caller-owned memory.
 * It never owns the buffer; assignment copies the viewed payload.
 */
template <typename T, size_t Num>
struct ExternalVec {
  typedef T value_type;
  typedef ExternalVec<T, Num> self_type;

  //! Wrap an existing buffer of at least Num elements.
  explicit ExternalVec(T* data) : data_(data) {}

  //! Copy the Num-element payload from \p src; the pointer is not rebound.
  self_type& operator=(const self_type& src) {
    if (data_ != src.data_) {
      memcpy(data_, src.data_, num_bytes());
    }
    return *this;
  }

  //! Build a view starting `offset` elements past `base`.
  static self_type Load(const void* base, int32_t offset) {
    return self_type((T*)base + offset);  // NOLINT
  }

  static constexpr size_t num_bytes() { return sizeof(value_type) * Num; }

 private:
  T* data_{nullptr};
};
// AVX256 load (NOTE: `dst` is actually the source pointer; _mm256_load_*
// requires 32-byte alignment of the pointer)
//@{
inline __m256 cinn_avx256_load(const float* dst) { return _mm256_load_ps(dst); }
inline __m256d cinn_avx256_load(const double* dst) { return _mm256_load_pd(dst); }
//@}
// AVX512 load (_mm512_load_* requires 64-byte alignment)
//@{
inline __m512 cinn_avx512_load(const float* dst) { return _mm512_load_ps(dst); }
inline __m512d cinn_avx512_load(const double* dst) { return _mm512_load_pd(dst); }
//@}
// 8 x float32: element-wise vector op vector, result stored to dst
// @{
inline void cinn_avx256_add(float* dst, float* a, float* b) {
_mm256_store_ps(dst, _mm256_add_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_sub(float* dst, float* a, float* b) {
_mm256_store_ps(dst, _mm256_sub_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_mul(float* dst, float* a, float* b) {
_mm256_store_ps(dst, _mm256_mul_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_div(float* dst, float* a, float* b) {
_mm256_store_ps(dst, _mm256_div_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
}
// @}
// 8 x float32: vector op scalar (scalar broadcast on the rhs)
// @{
inline void cinn_avx256_add(float* dst, float* a, float b) {
_mm256_store_ps(dst, _mm256_add_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
}
inline void cinn_avx256_sub(float* dst, float* a, float b) {
_mm256_store_ps(dst, _mm256_sub_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
}
inline void cinn_avx256_mul(float* dst, float* a, float b) {
_mm256_store_ps(dst, _mm256_mul_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
}
inline void cinn_avx256_div(float* dst, float* a, float b) {
_mm256_store_ps(dst, _mm256_div_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
}
// @}
// 8 x float32: scalar op vector (scalar broadcast on the lhs)
// @{
inline void cinn_avx256_add(float* dst, float a, float* b) {
_mm256_store_ps(dst, _mm256_add_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_sub(float* dst, float a, float* b) {
_mm256_store_ps(dst, _mm256_sub_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_mul(float* dst, float a, float* b) {
_mm256_store_ps(dst, _mm256_mul_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_div(float* dst, float a, float* b) {
_mm256_store_ps(dst, _mm256_div_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
}
// @}
// 4 x float64: element-wise vector op vector
// @{
inline void cinn_avx256_add(double* dst, double* a, double* b) {
_mm256_store_pd(dst, _mm256_add_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_sub(double* dst, double* a, double* b) {
_mm256_store_pd(dst, _mm256_sub_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_mul(double* dst, double* a, double* b) {
_mm256_store_pd(dst, _mm256_mul_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_div(double* dst, double* a, double* b) {
_mm256_store_pd(dst, _mm256_div_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
}
// @}
// 4 x float64: vector op scalar (scalar broadcast on the rhs)
// @{
inline void cinn_avx256_add(double* dst, double* a, double b) {
_mm256_store_pd(dst, _mm256_add_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
}
inline void cinn_avx256_sub(double* dst, double* a, double b) {
_mm256_store_pd(dst, _mm256_sub_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
}
inline void cinn_avx256_mul(double* dst, double* a, double b) {
_mm256_store_pd(dst, _mm256_mul_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
}
inline void cinn_avx256_div(double* dst, double* a, double b) {
_mm256_store_pd(dst, _mm256_div_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
}
// @}
// 4 x float64: scalar op vector (scalar broadcast on the lhs)
// @{
inline void cinn_avx256_add(double* dst, double a, double* b) {
_mm256_store_pd(dst, _mm256_add_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_sub(double* dst, double a, double* b) {
_mm256_store_pd(dst, _mm256_sub_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_mul(double* dst, double a, double* b) {
_mm256_store_pd(dst, _mm256_mul_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_div(double* dst, double a, double* b) {
_mm256_store_pd(dst, _mm256_div_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
}
// @}
//! 16 x float32 operations (AVX512): element-wise vector op vector.
// @{
inline void cinn_avx512_add(float* dst, float* a, float* b) {
_mm512_store_ps(dst, _mm512_add_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
}
inline void cinn_avx512_sub(float* dst, float* a, float* b) {
_mm512_store_ps(dst, _mm512_sub_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
}
inline void cinn_avx512_mul(float* dst, float* a, float* b) {
_mm512_store_ps(dst, _mm512_mul_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
}
inline void cinn_avx512_div(float* dst, float* a, float* b) {
_mm512_store_ps(dst, _mm512_div_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
}
// @}
// 16 x float32: vector op scalar (scalar broadcast on the rhs).
// FIX: these float overloads previously called the double-precision (_pd)
// intrinsics on float* buffers, loading/storing the wrong element width
// (8 doubles instead of 16 floats) and computing in the wrong type.
// @{
inline void cinn_avx512_add(float* dst, float* a, float b) {
  _mm512_store_ps(dst, _mm512_add_ps(_mm512_load_ps(a), _mm512_set1_ps(b)));
}
inline void cinn_avx512_sub(float* dst, float* a, float b) {
  _mm512_store_ps(dst, _mm512_sub_ps(_mm512_load_ps(a), _mm512_set1_ps(b)));
}
inline void cinn_avx512_mul(float* dst, float* a, float b) {
  _mm512_store_ps(dst, _mm512_mul_ps(_mm512_load_ps(a), _mm512_set1_ps(b)));
}
inline void cinn_avx512_div(float* dst, float* a, float b) {
  _mm512_store_ps(dst, _mm512_div_ps(_mm512_load_ps(a), _mm512_set1_ps(b)));
}
// @}
// 16 x float32: scalar op vector (scalar broadcast on the lhs).
// FIX: same wrong-width _pd intrinsic bug as above.
// @{
inline void cinn_avx512_add(float* dst, float a, float* b) {
  _mm512_store_ps(dst, _mm512_add_ps(_mm512_set1_ps(a), _mm512_load_ps(b)));
}
inline void cinn_avx512_sub(float* dst, float a, float* b) {
  _mm512_store_ps(dst, _mm512_sub_ps(_mm512_set1_ps(a), _mm512_load_ps(b)));
}
inline void cinn_avx512_mul(float* dst, float a, float* b) {
  _mm512_store_ps(dst, _mm512_mul_ps(_mm512_set1_ps(a), _mm512_load_ps(b)));
}
inline void cinn_avx512_div(float* dst, float a, float* b) {
  _mm512_store_ps(dst, _mm512_div_ps(_mm512_set1_ps(a), _mm512_load_ps(b)));
}
// @}
//! 8 x float64 operations (AVX512): element-wise vector op vector.
// @{
inline void cinn_avx512_add(double* dst, double* a, double* b) {
_mm512_store_pd(dst, _mm512_add_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
}
inline void cinn_avx512_sub(double* dst, double* a, double* b) {
_mm512_store_pd(dst, _mm512_sub_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
}
inline void cinn_avx512_mul(double* dst, double* a, double* b) {
_mm512_store_pd(dst, _mm512_mul_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
}
inline void cinn_avx512_div(double* dst, double* a, double* b) {
_mm512_store_pd(dst, _mm512_div_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
}
// @}
// Forward declaration; the definition is in the `add` group below.
inline __m512 cinn_avx512_add(const __m512& a, const __m512& b);
// Type-suffixed add variants operating directly on register values.
inline __m256 cinn_avx256_add_float(const __m256& a, const __m256& b) { return _mm256_add_ps(a, b); }
inline __m256d cinn_avx256_add_double(const __m256d& a, const __m256d& b) { return _mm256_add_pd(a, b); }
inline __m512 cinn_avx512_add_float(const __m512& a, const __m512& b) { return _mm512_add_ps(a, b); }
inline __m512d cinn_avx512_add_double(const __m512d& a, const __m512d& b) { return _mm512_add_pd(a, b); }
//! set1: broadcast one scalar to every lane of a register
// @{
inline __m256 cinn_avx256_set1(float value) { return _mm256_set1_ps(value); }
inline __m256d cinn_avx256_set1(double value) { return _mm256_set1_pd(value); }
inline __m512 cinn_avx512_set1(float value) { return _mm512_set1_ps(value); }
inline __m512d cinn_avx512_set1(double value) { return _mm512_set1_pd(value); }
// @}
//! store: write a register to aligned memory
// @{
inline void cinn_avx512_store(float* dst, const __m512& x) { _mm512_store_ps(dst, x); }
inline void cinn_avx512_store(double* dst, const __m512d& x) { _mm512_store_pd(dst, x); }
inline void cinn_avx256_store(float* dst, const __m256& x) { _mm256_store_ps(dst, x); }
inline void cinn_avx256_store(double* dst, const __m256d& x) { _mm256_store_pd(dst, x); }
// @}
//! add: register-to-register element-wise addition
// @{
inline __m256 cinn_avx256_add(const __m256& a, const __m256& b) { return _mm256_add_ps(a, b); }
inline __m256d cinn_avx256_add(const __m256d& a, const __m256d& b) { return _mm256_add_pd(a, b); }
inline __m512 cinn_avx512_add(const __m512& a, const __m512& b) { return _mm512_add_ps(a, b); }
inline __m512d cinn_avx512_add(const __m512d& a, const __m512d& b) { return _mm512_add_pd(a, b); }
// @}
//! mul: register-to-register element-wise multiplication
// @{
inline __m256 cinn_avx256_mul(const __m256& a, const __m256& b) { return _mm256_mul_ps(a, b); }
inline __m256d cinn_avx256_mul(const __m256d& a, const __m256d& b) { return _mm256_mul_pd(a, b); }
inline __m512 cinn_avx512_mul(const __m512& a, const __m512& b) { return _mm512_mul_ps(a, b); }
inline __m512d cinn_avx512_mul(const __m512d& a, const __m512d& b) { return _mm512_mul_pd(a, b); }
// @}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// )END Predefined utilities in CINN
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#endif  // _CINN_X86_BUILTIN_SOURCE_
// Generated 1024x1024x1024 matmul with array packing: B is first repacked
// into `packedB` for contiguous access, then C = A * B is accumulated with
// AVX256 (8-lane) vector ops. Arguments arrive as a cinn_pod_value_t array:
// [0]=A (input), [1]=B (input), [2]=C (output), [3]=packedB (scratch).
// NOTE(review): `num_args` is not validated; presumably the caller always
// passes 4 — confirm against the CINN runtime calling convention.
void matmul_array_packing(void* _args, int32_t num_args)
{
const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
cinn_buffer_t* _packedB = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[3]));
cinn_buffer_malloc((void*)(0), _C);
cinn_buffer_malloc((void*)(0), _packedB);
const float* A = ((const float*)(_A->memory));
const float* B = ((const float*)(_B->memory));
float* C = ((float*)(_C->memory));
// Alias of C used only by the zero-initialization pass below.
float* C__reduce_init = ((float*)(_C->memory));
float* packedB = ((float*)(_packedB->memory));
// Pass 1: zero-initialize the 1024x1024 output.
for (int32_t i = 0; i < 1024; i += 1) {
for (int32_t j = 0; j < 1024; j += 1) {
C__reduce_init[((1024 * i) + j)] = 0;
};
};
// Pass 2: repack B into packedB in 8-float chunks for contiguous reads
// in the inner GEMM loop.
for (int32_t i = 0; i < 32; i += 1) {
for (int32_t j = 0; j < 1024; j += 1) {
for (int32_t k = 0; k < 4; k += 1) {
cinn_avx256_store(packedB + ((32768 * i) + ((32 * j) + (8 * k))), cinn_avx256_load(B + ((32 * i) + ((1024 * j) + (8 * k)))));
};
};
};
// Pass 3: tiled GEMM. Each A element is broadcast to an 8-lane register
// (tmp_1) and multiply-accumulated against packed B columns into C.
for (int32_t i_outer = 0; i_outer < 32; i_outer += 1) {
for (int32_t j_outer = 0; j_outer < 32; j_outer += 1) {
for (int32_t k0_outer = 0; k0_outer < 256; k0_outer += 1) {
for (int32_t i_inner = 0; i_inner < 32; i_inner += 1) {
for (int32_t k0_inner = 0; k0_inner < 4; k0_inner += 1) {
auto tmp_1 = cinn_avx256_set1(((float)(A[((1024 * i_inner) + ((32768 * i_outer) + ((4 * k0_outer) + k0_inner)))])));
for (int32_t j_inner = 0; j_inner < 4; j_inner += 1) {
cinn_avx256_store(C + ((1024 * i_inner) + ((32768 * i_outer) + ((8 * j_inner) + (32 * j_outer)))), cinn_avx256_add(cinn_avx256_load(C + ((1024 * i_inner) + ((32768 * i_outer) + ((8 * j_inner) + (32 * j_outer))))), cinn_avx256_mul(tmp_1, cinn_avx256_load(packedB + ((8 * j_inner) + ((32768 * j_outer) + ((32 * k0_inner) + (128 * k0_outer))))))));
};
};
};
};
};
};
// NOTE(review): _C (the output) is "freed" along with the scratch buffer;
// presumably cinn_buffer_free with this flag only releases runtime
// bookkeeping, not the caller-visible memory — confirm against the runtime.
cinn_buffer_free((void*)(0), _C);
cinn_buffer_free((void*)(0), _packedB);
}
add pe module and related test
Add operator module and build structure
Remove unneeded fields.
Build a framework for developers to conveniently compare CINN's op performance with TVM's op performance.
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.