Code Monkey home page Code Monkey logo

nvidia-merlin / hierarchicalkv Goto Github PK

View Code? Open in Web Editor NEW
99.0 19.0 22.0 6.09 MB

HierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements. The key capability of HierarchicalKV is to store key-value feature-embeddings on high-bandwidth memory (HBM) of GPUs and in host memory. It also can be used as a generic key-value storage.

License: Apache License 2.0

CMake 0.49% Cuda 90.78% C++ 0.47% Starlark 3.50% Python 2.34% Smarty 2.39% Shell 0.03%
cuda gpu hashtable recommender-system dynamic-embedding embedding-storage key-value-store

hierarchicalkv's People

Contributors

bashimao avatar dakabang avatar emmaqiaoch avatar evanzhen avatar jiashuy avatar lifann avatar lingelin avatar mikemckiernan avatar mr-nineteen avatar neuzxy avatar ranjeet-nvidia avatar rhdong avatar zehuanw avatar zhangyafeikimi avatar

Stargazers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

Watchers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

hierarchicalkv's Issues

CUDA device

How do we ensure that the CUDA device is properly set for API calls?

[BUG] export_batch get not expected value when max_capacity is smaller than insert data length.

When max_capacity is smaller than data length, Insert data to table may got not expected value.
Export the keys and values in the table will show the error.

Here is a code snippet to reproduce the issue.

#include <gtest/gtest.h>
#include <stdio.h>
#include <array>
#include <map>
#include "merlin/types.cuh"
#include "merlin_hashtable.cuh"
#include "merlin_localfile.hpp"
#include "test_util.cuh"

constexpr size_t dim = 64;
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using Table = nv::merlin::HashTable<i64, f32, u64>;
using TableOptions = nv::merlin::HashTableOptions;

void test_reproduce() {
  size_t len = 10000llu;
  size_t max_capacity = 1 << 12;
  size_t init_capacity = 1 << 11;
  size_t offset = 0;
  size_t uplimit = 1 << 20;
  float load_factor_threshold = 0.98f;

  TableOptions opt;
  opt.max_capacity = max_capacity;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = uplimit * dim * sizeof(f32);
  opt.evict_strategy = nv::merlin::EvictStrategy::kLru;
  opt.dim = dim;

  using Vec_t = test_util::ValueArray<f32, dim>;
  std::map<i64, Vec_t> ref_map;
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> buffer;
  buffer.Reserve(len, dim, stream);
  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
  evict_buffer.Reserve(len, dim, stream);

  size_t total_len = 0;
  buffer.ToRange(offset, /*skip=1*/1, stream);
  size_t n_evicted = table->insert_and_evict(len, buffer.keys_ptr(),
      buffer.values_ptr(), nullptr, evict_buffer.keys_ptr(),
      evict_buffer.values_ptr(), nullptr, stream);
  prinf("Insert %llu keys and evict %llu\n", len, n_evicted);
  offset += len;
  total_len += len;
  evict_buffer.SyncData(/*h2d=*/false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  for (size_t i = 0; i < n_evicted; i++) {
    Vec_t* vec = reinterpret_cast<Vec_t*>(evict_buffer.values_ptr(false) + i * dim);
    ref_map[evict_buffer.keys_ptr(false)[i]] = *vec;
  }

  offset = 0;
  for (; offset < table->capacity(); offset += len) {
    size_t search_len = len;
    if (offset + search_len > table->capacity()) {
      search_len = table->capacity() - offset;
    }
    size_t n_exported = table->export_batch(search_len, offset, buffer.keys_ptr(),
                                            buffer.values_ptr(), /*metas=*/nullptr, stream);
    buffer.SyncData(/*h2d=*/false);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    for (size_t i = 0; i < n_exported; i++) {
      Vec_t* vec = reinterpret_cast<Vec_t*>(buffer.values_ptr(false) + i * dim);
      for (size_t j = 0; j < dim; j++) {
        ASSERT_EQ(buffer.keys_ptr(false)[i], vec->operator[](j));
      }
      ref_map[buffer.keys_ptr(false)[i]] = *vec;
    }
  }

  for (auto& it : ref_map) {
    for (size_t j = 0; j < dim; j++) {
      ASSERT_EQ(static_cast<f32>(it.first), it.second.data[j]);
    }
  }
}

Unexpected insert behavior

I'm trying to insert 1024 keys into a table but get table size 976 instead, when init_capacity is 1024 and max_capacity is 2048. 48 keys are missing after insertion.

This issue can be reproduced by:

#include <stdio.h>
#include <memory>
#include <utility>
#include "merlin/types.cuh"
#include "merlin/utils.cuh"
#include "merlin_hashtable.cuh"

using TableOptions = nv::merlin::HashTableOptions;
const size_t dim = 8;
using Table_t = nv::merlin::HashTable<int64_t, float, size_t, dim>;

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  size_t len = 1024;
  int64_t* keys = (int64_t*) malloc(sizeof(int64_t) * len);
  float* values = (float*) malloc(sizeof(float) * len * dim);
  memset(values, 0, sizeof(float) * len * dim);
  for (size_t i = 0; i < len; i ++) {
    keys[i] = i;
  }

  int64_t* d_keys = nullptr;
  float* d_values = nullptr;
  CUDA_CHECK(cudaMallocAsync(&d_keys, sizeof(int64_t) * len, stream));
  CUDA_CHECK(cudaMallocAsync(&d_values, sizeof(float) * len * dim, stream));
  CUDA_CHECK(cudaMemsetAsync(d_keys, 0, sizeof(int64_t) * len, stream));
  CUDA_CHECK(cudaMemsetAsync(d_values, 0, sizeof(float) * len * dim, stream));

  CUDA_CHECK(cudaMemcpyAsync(d_keys, keys, sizeof(int64_t) * len, cudaMemcpyHostToDevice, stream));
  CUDA_CHECK(cudaMemcpyAsync(d_values, values, sizeof(float) * len * dim, cudaMemcpyHostToDevice, stream));

  std::unique_ptr<Table_t> table = std::make_unique<Table_t>();
  TableOptions options;
  options.init_capacity = 1024;
  options.max_capacity = 2048;
  options.max_hbm_for_vectors = \
      options.max_capacity * (sizeof(int64_t) + sizeof(float) * dim + sizeof(size_t));
  table->init(options);

  table->insert_or_assign(len, d_keys, d_values, nullptr, stream);
  printf("size: %llu\n", table->size(stream));  // expecting 1024, but get 976.
  printf("capacity: %llu\n", table->capacity());  // 1024
  CUDA_CHECK(cudaFreeAsync(d_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_values, stream));
  free(keys);
  free(values);
  CUDA_CHECK(cudaStreamDestroy(stream));

  return 0;
}

dead lock when using keys of int64 converted from int32

#include "random"
#include "hierarchical_kv/merlin_hashtable.cuh"

using K = uint64_t;
using S = uint64_t;
using V = float;
using MerlinHashTable = nv::merlin::HashTable<K, V, S>;
using TableOption = nv::merlin::HashTableOptions;

using namespace nv::merlin;
using namespace std;

#define CUCO_CUDA_TRY(...)                                               \
  GET_CUCO_CUDA_TRY_MACRO(__VA_ARGS__, CUCO_CUDA_TRY_2, CUCO_CUDA_TRY_1) \
  (__VA_ARGS__)
#define GET_CUCO_CUDA_TRY_MACRO(_1, _2, NAME, ...) NAME
#define CUCO_CUDA_TRY_2(_call, _exception_type)                         \
  do {                                                                  \
    cudaError_t const error = (_call);                                  \
    if (cudaSuccess != error) {                                         \
      cudaGetLastError();                                               \
      throw _exception_type{std::string{"ERROR at "} + __FILE__ + ":" + \
                            CUCO_STRINGIFY(__LINE__) + ": " +           \
                            cudaGetErrorName(error) + " " +             \
                            cudaGetErrorString(error)};                 \
    }                                                                   \
  } while (0);
#define CUCO_CUDA_TRY_1(_call) CUCO_CUDA_TRY_2(_call, cuco::cuda_error)


__device__ void select() {

}


void test_merlin(bool block = true) {
  TableOption options;
  options.init_capacity = 128 * 1024 * 1024UL;
  options.max_capacity = 128 * 1024 * 1024UL;
  options.dim = 8;
  options.max_hbm_for_vectors = nv::merlin::GB(30);
  options.evict_strategy = EvictStrategy::kLru;

//  MerlinHashTable table;
  std::shared_ptr<MerlinHashTable> table = std::make_shared<MerlinHashTable>();
  table->init(options);

  size_t total_key_length = 1ul << 27;
  size_t total_key_range = 1ul << 27;
  size_t total_key_per_op = 1ul << 20;
  int dim_size = 8;

  K* h_keys;
  V* h_values;
  K* d_keys;
  V* d_values;
  K* h_cold_keys;
  K* d_cold_keys;

  K* d_input_keys;
  V* d_output_values;
  bool* d_founds;

  K* d_evicted_keys;
  V* d_evicted_values;
  S* d_evicted_scores;

  h_keys = static_cast<K*>(std::malloc(total_key_length * sizeof(K)));
  h_values = static_cast<V*>(std::malloc(total_key_length * dim_size * sizeof(V)));
  h_cold_keys = static_cast<K*>(std::malloc(total_key_length * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_keys, total_key_length * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_values, total_key_length * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMalloc(&d_cold_keys, total_key_length * sizeof(K)));

  CUCO_CUDA_TRY(cudaMalloc(&d_input_keys, total_key_per_op * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_output_values, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMalloc(&d_founds, total_key_per_op * sizeof(bool)));
  CUCO_CUDA_TRY(cudaMemset(d_output_values, 0, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMemset(d_founds, 0, total_key_per_op * sizeof(bool)));

  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_keys, total_key_per_op * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_values, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_scores, total_key_per_op * sizeof(S)));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_keys, 0, total_key_per_op * sizeof(K)));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_values, 0, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_scores, 0, total_key_per_op * sizeof(S)));

  std::random_device dev;
  std::mt19937 mt(dev());
  std::uniform_int_distribution<uint64_t> dist(0, 1ul << 50);
  for (int i = 0; i < total_key_length; i++) {
    h_keys[i] = dist(mt) % total_key_range;
    h_cold_keys[i] = dist(mt);
    for (int j = 0; j < dim_size; j++) {
      h_values[i * dim_size + j] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);;
    }
  }

  cudaMemcpy(d_keys, h_keys, total_key_length * sizeof(K), cudaMemcpyHostToDevice);
  cudaMemcpy(d_cold_keys, h_cold_keys, total_key_length * sizeof(K), cudaMemcpyHostToDevice);
  cudaMemcpy(d_values, h_values, total_key_length * dim_size * sizeof(V), cudaMemcpyHostToDevice);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaStreamSynchronize(stream);

  cout << "start" << endl;
  int count = 0;
  for (;;) {
    size_t start_index = dist(mt) % (total_key_length - total_key_per_op);
    cudaMemcpyAsync(d_input_keys, d_keys + start_index, total_key_per_op * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    cudaMemcpyAsync(d_input_keys, d_cold_keys + start_index, total_key_per_op / 10 * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    table->find(total_key_per_op, d_input_keys, d_output_values, d_founds, nullptr, stream);

    start_index = dist(mt) % (total_key_length - total_key_per_op);
    cudaMemcpyAsync(d_input_keys, d_keys + start_index, total_key_per_op * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    cudaMemcpyAsync(d_input_keys, d_cold_keys + start_index, total_key_per_op / 10 * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    table->insert_and_evict(total_key_per_op, d_input_keys, d_values + start_index * dim_size,
                            nullptr, d_evicted_keys, d_evicted_values, d_evicted_scores, stream);

    cudaStreamSynchronize(stream);
    count++;
    if (count % 100 == 0) {
    }
    cout << "find and insert_and_evict: " << count << endl;
  }

}

[Feature request] Add a new contain API for cache senario

We use pure-device hkv as a device cache for feature ids which has high probability to be used in next few steps training/inference. It's critical to have a contain API for performance if there are multiple level of caches, in different throughput, interact with each other:

  void contain(const size_type n, const key_type* keys,  // (n)
               bool* founds,                             // (n)
               score_type* scores = nullptr,             // (n)
               cudaStream_t stream = 0) const;

[Bug] We got one test fail on memory pool, it looks like happen randomly.

Ready Events =
[ 0 ] 0x55ff7c5bf600
[ 1 ] 0x55ff7c5bf760
[ 2 ] 0x55ff7c5bd220
[ 3 ] 0x55ff7c5bd380
--------------------------------------------------------------------------------

/home/jenkins/agent/workspace/merlin-kv-ci/tests/memory_pool_test.cc.cu:372: Failure
Expected equality of these values:
  pool.num_pending()
    Which is: 1
  0
DeviceAllocator[type_name = 8SomeType]: 0x320000000 freed, stream = 0
DeviceAllocator[type_name = 8SomeType]: 0x320200000 freed, stream = 0
[  FAILED  ] MemoryPool.borrow_return_with_context (0 ms)
[----------] 5 tests from MemoryPool (279 ms total)

[Bug] misaligned error for tlp_v2_upsert_and_evict_kernel_unique

Encounter this issue when running insert_and_evict_test on a 3090.

=========     Host Frame: [0x563ebe13a5ce]
=========                in /HierarchicalKV/build/./insert_and_evict_test
=========
========= Invalid __global__ write of size 16 bytes
=========     at 0x4ab0 in /usr/local/cuda/targets/x86_64-linux/include/sm_32_intrinsics.hpp:492:void nv::merlin::tlp_v2_upsert_and_evict_kernel_unique<long,float,unsigned long,uint4,unsigned int=128,unsigned int=16>(nv::merlin::Bucket<long,float,unsigned long>*,int*,unsigned long,unsigned int,unsigned int,nv::merlin::Bucket const *,uint4 const *,float const *,nv::merlin::Bucket*,nv::merlin::Bucket const **,float*,unsigned long,unsigned long*)
=========     by thread (76,0,0) in block (1132,0,0)
=========     Address 0xdc000000c5 is misaligned
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x7f1b1e0af40c]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x563ebe20c0cb]


export_batch_if get misaligned address error

I'm trying to use export_batch_if to scan the table and get valid keys for pred fn.
But get misaligned address by CUDA_CHECK.

Here is the snippet to reproduce the issue.

#include <gtest/gtest.h>
#include <stdio.h>
#include <array>
#include <map>
#include "merlin/types.cuh"
#include "merlin_hashtable.cuh"
#include "merlin_localfile.hpp"
#include "test_util.cuh"

#include "cuda_runtime.h"

constexpr size_t dim = 64;
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using Table = nv::merlin::HashTable<i64, f32, u64>;
using TableOptions = nv::merlin::HashTableOptions;

template <class K, class M>
__forceinline__ __device__ bool export_if_pred(const K& key, M& meta,
                                               const K& pattern,
                                               const M& threshold) {
  return meta > threshold;
}

template <class K, class M>
__device__ Table::Pred ExportIfPred = export_if_pred<K, M>;

void test_export_with_condition() {
  TableOptions opt;

  // table setting
  const size_t init_capacity = 1024;

  // numeric setting
  const size_t U = 2llu << 18;
  const size_t M = (U >> 1);
  const size_t N = (U >> 1) + 17;  // Add a prime to test the non-aligned case.

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = U * dim * sizeof(f32);
  opt.evict_strategy = nv::merlin::EvictStrategy::kCustomized;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // step1
  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> buffer;
  buffer.Reserve(M, dim, stream);
  buffer.ToRange(0, 1, stream);
  buffer.SetMeta((u64)1, stream);

  u64* h_metas = buffer.metas_ptr(false);
  for (size_t i = 0; i < M; i++) {
    h_metas[i] = static_cast<u64>(i);
  }
  buffer.SyncData(true, stream);
  table->insert_or_assign(M, buffer.keys_ptr(), buffer.values_ptr(), buffer.metas_ptr(), stream);

  i64 pattern = 0;
  u64 threshold = M / 2;
  size_t* d_counter = nullptr;
  CUDA_CHECK(cudaMallocAsync(&d_counter, sizeof(size_t), stream));
  CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_t), stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  table->export_batch_if(ExportIfPred<i64, u64>, pattern, threshold, table->capacity(), 0, d_counter,
                         buffer.keys_ptr(),  buffer.values_ptr(), buffer.metas_ptr(), stream);

  size_t h_counter = 0;
  buffer.Free(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaMemcpyAsync(&h_counter, d_counter, sizeof(size_t), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaFreeAsync(d_counter, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  printf("----> check h_counter: %llu\n", h_counter);
}

TEST(ExportWithCondition, test_export_with_condition) {
  test_export_with_condition();
}

Compilation error encountered on nvidia-docker with cuda version 12.0

When I compiled this project on nvidia-docker with cuda version 12.0, I got this error message:

====================================== Error log ======================================
......
[ 81%] Linking CXX executable merlin_hashtable_test
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::insert_or_assign(unsigned long, unsigned long const*, float const*, unsigned long const*, CUstream_st*, bool)':
......
/Workspace/HierarchicalKV/include/merlin_hashtable.cuh:1131: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
collect2: error: ld returned 1 exit status
......
====================================== Error log ======================================
Even though I replaced the header file related to thrust with absolute path, I got the same error.
But when I used nvidia-docker with cuda version 11.7.0 to compile this project, the error mentioned above disappeared.

Cmake bug in current master

In current master:

/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::insert_or_assign(unsigned long, unsigned long const*, float const*, unsigned long const*, CUstream_st*, bool)':
/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:326: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: /home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:326: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::find(unsigned long, unsigned long const*, float*, bool*, unsigned long*, CUstream_st*) const':
/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:925: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: /home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:925: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::find_or_insert(unsigned long, unsigned long const*, float*, unsigned long*, CUstream_st*, bool)':
/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:654: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o:/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:654: more undefined references to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par' follow
collect2: error: ld returned 1 exit status
make[2]: *** [CMakeFiles/merlin_hashtable_benchmark.dir/build.make:100: merlin_hashtable_benchmark] Error 1
make[1]: *** [CMakeFiles/Makefile2:152: CMakeFiles/merlin_hashtable_benchmark.dir/all] Error 2
make[1]: *** Waiting for unfinished jobs....

[Bug] Value Different Error When using find_or_insert on hybrid mode.

The problem is :

Expected equality of these values:
  h_vectors[i * options.dim + j]
    Which is: 1.09825e+12
  static_cast<float>(h_keys[i] * 0.00001)
    Which is: 1.53826e+14

We can reproduce the problem by rewriting the unit test: test_basic_when_full of find_or_insert_test

void test_basic_when_full(size_t max_hbm_for_vectors) {
  constexpr uint64_t INIT_CAPACITY = 1024* 1024;//1 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = INIT_CAPACITY;//1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  options.evict_strategy = nv::merlin::EvictStrategy::kCustomized;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_def_val;
  V** d_vectors_ptr;
  bool* d_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));

  // CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_insert = table->size(stream);
    std::cout << total_size_after_insert << std::endl; 
    CUDA_CHECK(cudaStreamSynchronize(stream));

    ///////////////////////////////////////// my check
    {
      CUDA_CHECK(cudaMemset(d_def_val, 0, KEY_NUM * sizeof(V) * options.dim));
      table->find(KEY_NUM, d_keys, d_def_val, d_found, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      int found_num = 0;

      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_vectors, d_def_val,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found[i]) {
          found_num++;
          for (int j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      static_cast<float>(h_keys[i] * 0.00001));
          }
        }
      }
      ASSERT_EQ(total_size_after_insert, found_num);
    }
    ///////////////////////////////////////// my check

    table->erase(KEY_NUM, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, 0);

    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_reinsert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_def_val));
  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

TEST(FindOrInsertTest, test_basic_when_full) {
  // test_basic_when_full(16);
  test_basic_when_full(0);
}

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    ๐Ÿ–– Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. ๐Ÿ“Š๐Ÿ“ˆ๐ŸŽ‰

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google โค๏ธ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.