Code Monkey home page Code Monkey logo

nvidia-merlin / hierarchicalkv Goto Github PK

View Code? Open in Web Editor NEW
102.0 19.0 22.0 6.26 MB

HierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements. The key capability of HierarchicalKV is to store key-value feature-embeddings on high-bandwidth memory (HBM) of GPUs and in host memory. It also can be used as a generic key-value storage.

License: Apache License 2.0

CMake 0.50% Cuda 90.83% C++ 0.47% Starlark 3.48% Python 2.33% Smarty 2.37% Shell 0.03%
cuda gpu hashtable recommender-system dynamic-embedding embedding-storage key-value-store

hierarchicalkv's Issues

[Bug] We got one test fail on memory pool, it looks like happen randomly.

Ready Events =
[ 0 ] 0x55ff7c5bf600
[ 1 ] 0x55ff7c5bf760
[ 2 ] 0x55ff7c5bd220
[ 3 ] 0x55ff7c5bd380
--------------------------------------------------------------------------------

/home/jenkins/agent/workspace/merlin-kv-ci/tests/memory_pool_test.cc.cu:372: Failure
Expected equality of these values:
  pool.num_pending()
    Which is: 1
  0
DeviceAllocator[type_name = 8SomeType]: 0x320000000 freed, stream = 0
DeviceAllocator[type_name = 8SomeType]: 0x320200000 freed, stream = 0
[  FAILED  ] MemoryPool.borrow_return_with_context (0 ms)
[----------] 5 tests from MemoryPool (279 ms total)

Unexpected insert behavior

I'm trying to insert 1024 keys into a table but get table size 976 instead, when init_capacity is 1024 and max_capacity is 2048. 48 keys are missing after insertion.

This issue can be reproduced by:

#include <stdio.h>
#include <memory>
#include <utility>
#include "merlin/types.cuh"
#include "merlin/utils.cuh"
#include "merlin_hashtable.cuh"

using TableOptions = nv::merlin::HashTableOptions;
const size_t dim = 8;
using Table_t = nv::merlin::HashTable<int64_t, float, size_t, dim>;

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  size_t len = 1024;
  int64_t* keys = (int64_t*) malloc(sizeof(int64_t) * len);
  float* values = (float*) malloc(sizeof(float) * len * dim);
  memset(values, 0, sizeof(float) * len * dim);
  for (size_t i = 0; i < len; i ++) {
    keys[i] = i;
  }

  int64_t* d_keys = nullptr;
  float* d_values = nullptr;
  CUDA_CHECK(cudaMallocAsync(&d_keys, sizeof(int64_t) * len, stream));
  CUDA_CHECK(cudaMallocAsync(&d_values, sizeof(float) * len * dim, stream));
  CUDA_CHECK(cudaMemsetAsync(d_keys, 0, sizeof(int64_t) * len, stream));
  CUDA_CHECK(cudaMemsetAsync(d_values, 0, sizeof(float) * len * dim, stream));

  CUDA_CHECK(cudaMemcpyAsync(d_keys, keys, sizeof(int64_t) * len, cudaMemcpyHostToDevice, stream));
  CUDA_CHECK(cudaMemcpyAsync(d_values, values, sizeof(float) * len * dim, cudaMemcpyHostToDevice, stream));

  std::unique_ptr<Table_t> table = std::make_unique<Table_t>();
  TableOptions options;
  options.init_capacity = 1024;
  options.max_capacity = 2048;
  options.max_hbm_for_vectors = \
      options.max_capacity * (sizeof(int64_t) + sizeof(float) * dim + sizeof(size_t));
  table->init(options);

  table->insert_or_assign(len, d_keys, d_values, nullptr, stream);
  printf("size: %llu\n", table->size(stream));  // expecting 1024, but get 976.
  printf("capacity: %llu\n", table->capacity());  // 1024
  CUDA_CHECK(cudaFreeAsync(d_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_values, stream));
  free(keys);
  free(values);
  CUDA_CHECK(cudaStreamDestroy(stream));

  return 0;
}

CUDA device

How do we ensure that the CUDA device is properly set for API calls?

export_batch_if get misaligned address error

I'm trying to use export_batch_if to scan the table and get valid keys for pred fn.
But get misaligned address by CUDA_CHECK.

Here is the snippet to reproduce the issue.

#include <gtest/gtest.h>
#include <stdio.h>
#include <array>
#include <map>
#include "merlin/types.cuh"
#include "merlin_hashtable.cuh"
#include "merlin_localfile.hpp"
#include "test_util.cuh"

#include "cuda_runtime.h"

constexpr size_t dim = 64;
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using Table = nv::merlin::HashTable<i64, f32, u64>;
using TableOptions = nv::merlin::HashTableOptions;

template <class K, class M>
__forceinline__ __device__ bool export_if_pred(const K& key, M& meta,
                                               const K& pattern,
                                               const M& threshold) {
  return meta > threshold;
}

template <class K, class M>
__device__ Table::Pred ExportIfPred = export_if_pred<K, M>;

void test_export_with_condition() {
  TableOptions opt;

  // table setting
  const size_t init_capacity = 1024;

  // numeric setting
  const size_t U = 2llu << 18;
  const size_t M = (U >> 1);
  const size_t N = (U >> 1) + 17;  // Add a prime to test the non-aligned case.

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = U * dim * sizeof(f32);
  opt.evict_strategy = nv::merlin::EvictStrategy::kCustomized;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // step1
  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> buffer;
  buffer.Reserve(M, dim, stream);
  buffer.ToRange(0, 1, stream);
  buffer.SetMeta((u64)1, stream);

  u64* h_metas = buffer.metas_ptr(false);
  for (size_t i = 0; i < M; i++) {
    h_metas[i] = static_cast<u64>(i);
  }
  buffer.SyncData(true, stream);
  table->insert_or_assign(M, buffer.keys_ptr(), buffer.values_ptr(), buffer.metas_ptr(), stream);

  i64 pattern = 0;
  u64 threshold = M / 2;
  size_t* d_counter = nullptr;
  CUDA_CHECK(cudaMallocAsync(&d_counter, sizeof(size_t), stream));
  CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_t), stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  table->export_batch_if(ExportIfPred<i64, u64>, pattern, threshold, table->capacity(), 0, d_counter,
                         buffer.keys_ptr(),  buffer.values_ptr(), buffer.metas_ptr(), stream);

  size_t h_counter = 0;
  buffer.Free(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaMemcpyAsync(&h_counter, d_counter, sizeof(size_t), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaFreeAsync(d_counter, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  printf("----> check h_counter: %llu\n", h_counter);
}

TEST(ExportWithCondition, test_export_with_condition) {
  test_export_with_condition();
}

[BUG] export_batch get not expected value when max_capacity is smaller than insert data length.

When max_capacity is smaller than data length, Insert data to table may got not expected value.
Export the keys and values in the table will show the error.

Here is a code snippet to reproduce the issue.

#include <gtest/gtest.h>
#include <stdio.h>
#include <array>
#include <map>
#include "merlin/types.cuh"
#include "merlin_hashtable.cuh"
#include "merlin_localfile.hpp"
#include "test_util.cuh"

constexpr size_t dim = 64;
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using Table = nv::merlin::HashTable<i64, f32, u64>;
using TableOptions = nv::merlin::HashTableOptions;

void test_reproduce() {
  size_t len = 10000llu;
  size_t max_capacity = 1 << 12;
  size_t init_capacity = 1 << 11;
  size_t offset = 0;
  size_t uplimit = 1 << 20;
  float load_factor_threshold = 0.98f;

  TableOptions opt;
  opt.max_capacity = max_capacity;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = uplimit * dim * sizeof(f32);
  opt.evict_strategy = nv::merlin::EvictStrategy::kLru;
  opt.dim = dim;

  using Vec_t = test_util::ValueArray<f32, dim>;
  std::map<i64, Vec_t> ref_map;
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> buffer;
  buffer.Reserve(len, dim, stream);
  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
  evict_buffer.Reserve(len, dim, stream);

  size_t total_len = 0;
  buffer.ToRange(offset, /*skip=1*/1, stream);
  size_t n_evicted = table->insert_and_evict(len, buffer.keys_ptr(),
      buffer.values_ptr(), nullptr, evict_buffer.keys_ptr(),
      evict_buffer.values_ptr(), nullptr, stream);
  prinf("Insert %llu keys and evict %llu\n", len, n_evicted);
  offset += len;
  total_len += len;
  evict_buffer.SyncData(/*h2d=*/false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  for (size_t i = 0; i < n_evicted; i++) {
    Vec_t* vec = reinterpret_cast<Vec_t*>(evict_buffer.values_ptr(false) + i * dim);
    ref_map[evict_buffer.keys_ptr(false)[i]] = *vec;
  }

  offset = 0;
  for (; offset < table->capacity(); offset += len) {
    size_t search_len = len;
    if (offset + search_len > table->capacity()) {
      search_len = table->capacity() - offset;
    }
    size_t n_exported = table->export_batch(search_len, offset, buffer.keys_ptr(),
                                            buffer.values_ptr(), /*metas=*/nullptr, stream);
    buffer.SyncData(/*h2d=*/false);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    for (size_t i = 0; i < n_exported; i++) {
      Vec_t* vec = reinterpret_cast<Vec_t*>(buffer.values_ptr(false) + i * dim);
      for (size_t j = 0; j < dim; j++) {
        ASSERT_EQ(buffer.keys_ptr(false)[i], vec->operator[](j));
      }
      ref_map[buffer.keys_ptr(false)[i]] = *vec;
    }
  }

  for (auto& it : ref_map) {
    for (size_t j = 0; j < dim; j++) {
      ASSERT_EQ(static_cast<f32>(it.first), it.second.data[j]);
    }
  }
}

Cmake bug in current master

In current master:

/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::insert_or_assign(unsigned long, unsigned long const*, float const*, unsigned long const*, CUstream_st*, bool)':
/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:326: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: /home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:326: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::find(unsigned long, unsigned long const*, float*, bool*, unsigned long*, CUstream_st*) const':
/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:925: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: /home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:925: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::find_or_insert(unsigned long, unsigned long const*, float*, unsigned long*, CUstream_st*, bool)':
/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:654: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o:/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:654: more undefined references to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par' follow
collect2: error: ld returned 1 exit status
make[2]: *** [CMakeFiles/merlin_hashtable_benchmark.dir/build.make:100: merlin_hashtable_benchmark] Error 1
make[1]: *** [CMakeFiles/Makefile2:152: CMakeFiles/merlin_hashtable_benchmark.dir/all] Error 2
make[1]: *** Waiting for unfinished jobs....

dead lock when using keys of int64 converted from int32

#include "random"
#include "hierarchical_kv/merlin_hashtable.cuh"

using K = uint64_t;
using S = uint64_t;
using V = float;
using MerlinHashTable = nv::merlin::HashTable<K, V, S>;
using TableOption = nv::merlin::HashTableOptions;

using namespace nv::merlin;
using namespace std;

#define CUCO_CUDA_TRY(...)                                               \
  GET_CUCO_CUDA_TRY_MACRO(__VA_ARGS__, CUCO_CUDA_TRY_2, CUCO_CUDA_TRY_1) \
  (__VA_ARGS__)
#define GET_CUCO_CUDA_TRY_MACRO(_1, _2, NAME, ...) NAME
#define CUCO_CUDA_TRY_2(_call, _exception_type)                         \
  do {                                                                  \
    cudaError_t const error = (_call);                                  \
    if (cudaSuccess != error) {                                         \
      cudaGetLastError();                                               \
      throw _exception_type{std::string{"ERROR at "} + __FILE__ + ":" + \
                            CUCO_STRINGIFY(__LINE__) + ": " +           \
                            cudaGetErrorName(error) + " " +             \
                            cudaGetErrorString(error)};                 \
    }                                                                   \
  } while (0);
#define CUCO_CUDA_TRY_1(_call) CUCO_CUDA_TRY_2(_call, cuco::cuda_error)


__device__ void select() {

}


void test_merlin(bool block = true) {
  TableOption options;
  options.init_capacity = 128 * 1024 * 1024UL;
  options.max_capacity = 128 * 1024 * 1024UL;
  options.dim = 8;
  options.max_hbm_for_vectors = nv::merlin::GB(30);
  options.evict_strategy = EvictStrategy::kLru;

//  MerlinHashTable table;
  std::shared_ptr<MerlinHashTable> table = std::make_shared<MerlinHashTable>();
  table->init(options);

  size_t total_key_length = 1ul << 27;
  size_t total_key_range = 1ul << 27;
  size_t total_key_per_op = 1ul << 20;
  int dim_size = 8;

  K* h_keys;
  V* h_values;
  K* d_keys;
  V* d_values;
  K* h_cold_keys;
  K* d_cold_keys;

  K* d_input_keys;
  V* d_output_values;
  bool* d_founds;

  K* d_evicted_keys;
  V* d_evicted_values;
  S* d_evicted_scores;

  h_keys = static_cast<K*>(std::malloc(total_key_length * sizeof(K)));
  h_values = static_cast<V*>(std::malloc(total_key_length * dim_size * sizeof(V)));
  h_cold_keys = static_cast<K*>(std::malloc(total_key_length * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_keys, total_key_length * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_values, total_key_length * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMalloc(&d_cold_keys, total_key_length * sizeof(K)));

  CUCO_CUDA_TRY(cudaMalloc(&d_input_keys, total_key_per_op * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_output_values, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMalloc(&d_founds, total_key_per_op * sizeof(bool)));
  CUCO_CUDA_TRY(cudaMemset(d_output_values, 0, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMemset(d_founds, 0, total_key_per_op * sizeof(bool)));

  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_keys, total_key_per_op * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_values, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_scores, total_key_per_op * sizeof(S)));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_keys, 0, total_key_per_op * sizeof(K)));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_values, 0, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_scores, 0, total_key_per_op * sizeof(S)));

  std::random_device dev;
  std::mt19937 mt(dev());
  std::uniform_int_distribution<uint64_t> dist(0, 1ul << 50);
  for (int i = 0; i < total_key_length; i++) {
    h_keys[i] = dist(mt) % total_key_range;
    h_cold_keys[i] = dist(mt);
    for (int j = 0; j < dim_size; j++) {
      h_values[i * dim_size + j] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);;
    }
  }

  cudaMemcpy(d_keys, h_keys, total_key_length * sizeof(K), cudaMemcpyHostToDevice);
  cudaMemcpy(d_cold_keys, h_cold_keys, total_key_length * sizeof(K), cudaMemcpyHostToDevice);
  cudaMemcpy(d_values, h_values, total_key_length * dim_size * sizeof(V), cudaMemcpyHostToDevice);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaStreamSynchronize(stream);

  cout << "start" << endl;
  int count = 0;
  for (;;) {
    size_t start_index = dist(mt) % (total_key_length - total_key_per_op);
    cudaMemcpyAsync(d_input_keys, d_keys + start_index, total_key_per_op * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    cudaMemcpyAsync(d_input_keys, d_cold_keys + start_index, total_key_per_op / 10 * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    table->find(total_key_per_op, d_input_keys, d_output_values, d_founds, nullptr, stream);

    start_index = dist(mt) % (total_key_length - total_key_per_op);
    cudaMemcpyAsync(d_input_keys, d_keys + start_index, total_key_per_op * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    cudaMemcpyAsync(d_input_keys, d_cold_keys + start_index, total_key_per_op / 10 * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    table->insert_and_evict(total_key_per_op, d_input_keys, d_values + start_index * dim_size,
                            nullptr, d_evicted_keys, d_evicted_values, d_evicted_scores, stream);

    cudaStreamSynchronize(stream);
    count++;
    if (count % 100 == 0) {
    }
    cout << "find and insert_and_evict: " << count << endl;
  }

}

[Bug] misaligned error for tlp_v2_upsert_and_evict_kernel_unique

Encounter this issue when running insert_and_evict_test on a 3090.

=========     Host Frame: [0x563ebe13a5ce]
=========                in /HierarchicalKV/build/./insert_and_evict_test
=========
========= Invalid __global__ write of size 16 bytes
=========     at 0x4ab0 in /usr/local/cuda/targets/x86_64-linux/include/sm_32_intrinsics.hpp:492:void nv::merlin::tlp_v2_upsert_and_evict_kernel_unique<long,float,unsigned long,uint4,unsigned int=128,unsigned int=16>(nv::merlin::Bucket<long,float,unsigned long>*,int*,unsigned long,unsigned int,unsigned int,nv::merlin::Bucket const *,uint4 const *,float const *,nv::merlin::Bucket*,nv::merlin::Bucket const **,float*,unsigned long,unsigned long*)
=========     by thread (76,0,0) in block (1132,0,0)
=========     Address 0xdc000000c5 is misaligned
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x7f1b1e0af40c]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x563ebe20c0cb]


Compilation error encountered on nvidia-docker with cuda version 12.0

When I compiled this project on nvidia-docker with cuda version 12.0, I got this error message:

====================================== Error log ======================================
......
[ 81%] Linking CXX executable merlin_hashtable_test
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::insert_or_assign(unsigned long, unsigned long const*, float const*, unsigned long const*, CUstream_st*, bool)':
......
/Workspace/HierarchicalKV/include/merlin_hashtable.cuh:1131: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
collect2: error: ld returned 1 exit status
......
====================================== Error log ======================================
Even though I replaced the header file related to thrust with absolute path, I got the same error.
But when I used nvidia-docker with cuda version 11.7.0 to compile this project, the error mentioned above disappeared.

[Bug] Value Different Error When using find_or_insert on hybrid mode.

The problem is :

Expected equality of these values:
  h_vectors[i * options.dim + j]
    Which is: 1.09825e+12
  static_cast<float>(h_keys[i] * 0.00001)
    Which is: 1.53826e+14

We can reproduce the problem by rewriting the unit test: test_basic_when_full of find_or_insert_test

void test_basic_when_full(size_t max_hbm_for_vectors) {
  constexpr uint64_t INIT_CAPACITY = 1024* 1024;//1 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = INIT_CAPACITY;//1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  options.evict_strategy = nv::merlin::EvictStrategy::kCustomized;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_def_val;
  V** d_vectors_ptr;
  bool* d_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));

  // CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_insert = table->size(stream);
    std::cout << total_size_after_insert << std::endl; 
    CUDA_CHECK(cudaStreamSynchronize(stream));

    ///////////////////////////////////////// my check
    {
      CUDA_CHECK(cudaMemset(d_def_val, 0, KEY_NUM * sizeof(V) * options.dim));
      table->find(KEY_NUM, d_keys, d_def_val, d_found, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      int found_num = 0;

      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_vectors, d_def_val,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found[i]) {
          found_num++;
          for (int j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      static_cast<float>(h_keys[i] * 0.00001));
          }
        }
      }
      ASSERT_EQ(total_size_after_insert, found_num);
    }
    ///////////////////////////////////////// my check

    table->erase(KEY_NUM, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, 0);

    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_reinsert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_def_val));
  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

TEST(FindOrInsertTest, test_basic_when_full) {
  // test_basic_when_full(16);
  test_basic_when_full(0);
}

[Feature request] Add a new contain API for cache senario

We use pure-device hkv as a device cache for feature ids which has high probability to be used in next few steps training/inference. It's critical to have a contain API for performance if there are multiple level of caches, in different throughput, interact with each other:

  void contain(const size_type n, const key_type* keys,  // (n)
               bool* founds,                             // (n)
               score_type* scores = nullptr,             // (n)
               cudaStream_t stream = 0) const;

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    ๐Ÿ–– Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. ๐Ÿ“Š๐Ÿ“ˆ๐ŸŽ‰

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google โค๏ธ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.