nvidia-merlin / hierarchicalkv Goto Github PK

HierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements. The key capability of HierarchicalKV is to store key-value feature-embeddings on high-bandwidth memory (HBM) of GPUs and in host memory. It also can be used as a generic key-value storage.

License: Apache License 2.0

CMake 0.50% Cuda 90.83% C++ 0.47% Starlark 3.48% Python 2.33% Smarty 2.37% Shell 0.03%

cuda gpu hashtable recommender-system dynamic-embedding embedding-storage key-value-store

hierarchicalkv's Issues

[featurer request] one kernel for optimizer_update

For performance consideration, merge kernels to one for optimizer_update

[opt] Can initialize_buckets be further optimized?

HierarchicalKV/include/merlin/core_kernels.cuh

Line 225 in 8a4b781

for (int i = start; i < end; i++) {

If max_capacity of the hash table is set to a large value, this will result in many iterations here.

Each time the graphics memory is allocated here, it is 2176, which is 128 * 17. Could this potentially lead to a lot of graphics memory fragmentation?

[Feature request] reserve by any capacity.

[Bug] We got one test fail on memory pool, it looks like happen randomly.

Ready Events =
[ 0 ] 0x55ff7c5bf600
[ 1 ] 0x55ff7c5bf760
[ 2 ] 0x55ff7c5bd220
[ 3 ] 0x55ff7c5bd380
--------------------------------------------------------------------------------

/home/jenkins/agent/workspace/merlin-kv-ci/tests/memory_pool_test.cc.cu:372: Failure
Expected equality of these values:
  pool.num_pending()
    Which is: 1
  0
DeviceAllocator[type_name = 8SomeType]: 0x320000000 freed, stream = 0
DeviceAllocator[type_name = 8SomeType]: 0x320200000 freed, stream = 0
[  FAILED  ] MemoryPool.borrow_return_with_context (0 ms)
[----------] 5 tests from MemoryPool (279 ms total)

Unexpected insert behavior

I'm trying to insert 1024 keys into a table but get table size 976 instead, when init_capacity is 1024 and max_capacity is 2048. 48 keys are missing after insertion.

This issue can be reproduced by:

#include <stdio.h>
#include <memory>
#include <utility>
#include "merlin/types.cuh"
#include "merlin/utils.cuh"
#include "merlin_hashtable.cuh"

using TableOptions = nv::merlin::HashTableOptions;
const size_t dim = 8;
using Table_t = nv::merlin::HashTable<int64_t, float, size_t, dim>;

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  size_t len = 1024;
  int64_t* keys = (int64_t*) malloc(sizeof(int64_t) * len);
  float* values = (float*) malloc(sizeof(float) * len * dim);
  memset(values, 0, sizeof(float) * len * dim);
  for (size_t i = 0; i < len; i ++) {
    keys[i] = i;
  }

  int64_t* d_keys = nullptr;
  float* d_values = nullptr;
  CUDA_CHECK(cudaMallocAsync(&d_keys, sizeof(int64_t) * len, stream));
  CUDA_CHECK(cudaMallocAsync(&d_values, sizeof(float) * len * dim, stream));
  CUDA_CHECK(cudaMemsetAsync(d_keys, 0, sizeof(int64_t) * len, stream));
  CUDA_CHECK(cudaMemsetAsync(d_values, 0, sizeof(float) * len * dim, stream));

  CUDA_CHECK(cudaMemcpyAsync(d_keys, keys, sizeof(int64_t) * len, cudaMemcpyHostToDevice, stream));
  CUDA_CHECK(cudaMemcpyAsync(d_values, values, sizeof(float) * len * dim, cudaMemcpyHostToDevice, stream));

  std::unique_ptr<Table_t> table = std::make_unique<Table_t>();
  TableOptions options;
  options.init_capacity = 1024;
  options.max_capacity = 2048;
  options.max_hbm_for_vectors = \
      options.max_capacity * (sizeof(int64_t) + sizeof(float) * dim + sizeof(size_t));
  table->init(options);

  table->insert_or_assign(len, d_keys, d_values, nullptr, stream);
  printf("size: %llu\n", table->size(stream));  // expecting 1024, but get 976.
  printf("capacity: %llu\n", table->capacity());  // 1024
  CUDA_CHECK(cudaFreeAsync(d_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_values, stream));
  free(keys);
  free(values);
  CUDA_CHECK(cudaStreamDestroy(stream));

  return 0;
}

[Feature request] Support multiple Dim in one table instance.(P2)

Need to provide additional API for basic insert and find
Complete the operations in as few kernels as possible

[Feature Request] Support runtime setting to embedding vector dimension

It's inconvenient to the users if they need to recompile HKV to use new embedding vector size. On the other hand, if we list all the possible vector size in compile time will make our binary very big.

Shall we consider to support runtime setting to embedding vector dimension?

#65 (comment)

CUDA device

How do we ensure that the CUDA device is properly set for API calls?

export_batch_if get misaligned address error

I'm trying to use export_batch_if to scan the table and get valid keys for pred fn.
But get misaligned address by CUDA_CHECK.

Here is the snippet to reproduce the issue.

#include <gtest/gtest.h>
#include <stdio.h>
#include <array>
#include <map>
#include "merlin/types.cuh"
#include "merlin_hashtable.cuh"
#include "merlin_localfile.hpp"
#include "test_util.cuh"

#include "cuda_runtime.h"

constexpr size_t dim = 64;
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using Table = nv::merlin::HashTable<i64, f32, u64>;
using TableOptions = nv::merlin::HashTableOptions;

template <class K, class M>
__forceinline__ __device__ bool export_if_pred(const K& key, M& meta,
                                               const K& pattern,
                                               const M& threshold) {
  return meta > threshold;
}

template <class K, class M>
__device__ Table::Pred ExportIfPred = export_if_pred<K, M>;

void test_export_with_condition() {
  TableOptions opt;

  // table setting
  const size_t init_capacity = 1024;

  // numeric setting
  const size_t U = 2llu << 18;
  const size_t M = (U >> 1);
  const size_t N = (U >> 1) + 17;  // Add a prime to test the non-aligned case.

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = U * dim * sizeof(f32);
  opt.evict_strategy = nv::merlin::EvictStrategy::kCustomized;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // step1
  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> buffer;
  buffer.Reserve(M, dim, stream);
  buffer.ToRange(0, 1, stream);
  buffer.SetMeta((u64)1, stream);

  u64* h_metas = buffer.metas_ptr(false);
  for (size_t i = 0; i < M; i++) {
    h_metas[i] = static_cast<u64>(i);
  }
  buffer.SyncData(true, stream);
  table->insert_or_assign(M, buffer.keys_ptr(), buffer.values_ptr(), buffer.metas_ptr(), stream);

  i64 pattern = 0;
  u64 threshold = M / 2;
  size_t* d_counter = nullptr;
  CUDA_CHECK(cudaMallocAsync(&d_counter, sizeof(size_t), stream));
  CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_t), stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  table->export_batch_if(ExportIfPred<i64, u64>, pattern, threshold, table->capacity(), 0, d_counter,
                         buffer.keys_ptr(),  buffer.values_ptr(), buffer.metas_ptr(), stream);

  size_t h_counter = 0;
  buffer.Free(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaMemcpyAsync(&h_counter, d_counter, sizeof(size_t), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaFreeAsync(d_counter, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  printf("----> check h_counter: %llu\n", h_counter);
}

TEST(ExportWithCondition, test_export_with_condition) {
  test_export_with_condition();
}

[feature request] dump to file

dump the whole table to file or HDFS directly for avoiding the huge HBM/HMEM consuming.

[Bug] A test case of memory pool fail randomly.

Hi @bashimao, could you help analysis this issue? Thank you~
https://prod.blsm.nvidia.com/sw-gpu-hugectr-ci-master/job/merlin-kv-ci/162/console

[Feature request] Share Keys of weight with status tables of stateful optimizers(P2)

[Feature Request] support SSD thru GDS API

#55 (comment)

[BUG] export_batch get not expected value when max_capacity is smaller than insert data length.

When max_capacity is smaller than data length, Insert data to table may got not expected value.
Export the keys and values in the table will show the error.

Here is a code snippet to reproduce the issue.

#include <gtest/gtest.h>
#include <stdio.h>
#include <array>
#include <map>
#include "merlin/types.cuh"
#include "merlin_hashtable.cuh"
#include "merlin_localfile.hpp"
#include "test_util.cuh"

constexpr size_t dim = 64;
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using Table = nv::merlin::HashTable<i64, f32, u64>;
using TableOptions = nv::merlin::HashTableOptions;

void test_reproduce() {
  size_t len = 10000llu;
  size_t max_capacity = 1 << 12;
  size_t init_capacity = 1 << 11;
  size_t offset = 0;
  size_t uplimit = 1 << 20;
  float load_factor_threshold = 0.98f;

  TableOptions opt;
  opt.max_capacity = max_capacity;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = uplimit * dim * sizeof(f32);
  opt.evict_strategy = nv::merlin::EvictStrategy::kLru;
  opt.dim = dim;

  using Vec_t = test_util::ValueArray<f32, dim>;
  std::map<i64, Vec_t> ref_map;
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> buffer;
  buffer.Reserve(len, dim, stream);
  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
  evict_buffer.Reserve(len, dim, stream);

  size_t total_len = 0;
  buffer.ToRange(offset, /*skip=1*/1, stream);
  size_t n_evicted = table->insert_and_evict(len, buffer.keys_ptr(),
      buffer.values_ptr(), nullptr, evict_buffer.keys_ptr(),
      evict_buffer.values_ptr(), nullptr, stream);
  prinf("Insert %llu keys and evict %llu\n", len, n_evicted);
  offset += len;
  total_len += len;
  evict_buffer.SyncData(/*h2d=*/false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  for (size_t i = 0; i < n_evicted; i++) {
    Vec_t* vec = reinterpret_cast<Vec_t*>(evict_buffer.values_ptr(false) + i * dim);
    ref_map[evict_buffer.keys_ptr(false)[i]] = *vec;
  }

  offset = 0;
  for (; offset < table->capacity(); offset += len) {
    size_t search_len = len;
    if (offset + search_len > table->capacity()) {
      search_len = table->capacity() - offset;
    }
    size_t n_exported = table->export_batch(search_len, offset, buffer.keys_ptr(),
                                            buffer.values_ptr(), /*metas=*/nullptr, stream);
    buffer.SyncData(/*h2d=*/false);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    for (size_t i = 0; i < n_exported; i++) {
      Vec_t* vec = reinterpret_cast<Vec_t*>(buffer.values_ptr(false) + i * dim);
      for (size_t j = 0; j < dim; j++) {
        ASSERT_EQ(buffer.keys_ptr(false)[i], vec->operator[](j));
      }
      ref_map[buffer.keys_ptr(false)[i]] = *vec;
    }
  }

  for (auto& it : ref_map) {
    for (size_t j = 0; j < dim; j++) {
      ASSERT_EQ(static_cast<f32>(it.first), it.second.data[j]);
    }
  }
}

Cmake bug in current master

In current master:

/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::insert_or_assign(unsigned long, unsigned long const*, float const*, unsigned long const*, CUstream_st*, bool)':
/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:326: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: /home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:326: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::find(unsigned long, unsigned long const*, float*, bool*, unsigned long*, CUstream_st*) const':
/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:925: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: /home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:925: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::find_or_insert(unsigned long, unsigned long const*, float*, unsigned long*, CUstream_st*, bool)':
/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:654: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o:/home/mlanger/proj/merlin-kv/include/merlin_hashtable.cuh:654: more undefined references to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par' follow
collect2: error: ld returned 1 exit status
make[2]: *** [CMakeFiles/merlin_hashtable_benchmark.dir/build.make:100: merlin_hashtable_benchmark] Error 1
make[1]: *** [CMakeFiles/Makefile2:152: CMakeFiles/merlin_hashtable_benchmark.dir/all] Error 2
make[1]: *** Waiting for unfinished jobs....

dead lock when using keys of int64 converted from int32

#include "random"
#include "hierarchical_kv/merlin_hashtable.cuh"

using K = uint64_t;
using S = uint64_t;
using V = float;
using MerlinHashTable = nv::merlin::HashTable<K, V, S>;
using TableOption = nv::merlin::HashTableOptions;

using namespace nv::merlin;
using namespace std;

#define CUCO_CUDA_TRY(...)                                               \
  GET_CUCO_CUDA_TRY_MACRO(__VA_ARGS__, CUCO_CUDA_TRY_2, CUCO_CUDA_TRY_1) \
  (__VA_ARGS__)
#define GET_CUCO_CUDA_TRY_MACRO(_1, _2, NAME, ...) NAME
#define CUCO_CUDA_TRY_2(_call, _exception_type)                         \
  do {                                                                  \
    cudaError_t const error = (_call);                                  \
    if (cudaSuccess != error) {                                         \
      cudaGetLastError();                                               \
      throw _exception_type{std::string{"ERROR at "} + __FILE__ + ":" + \
                            CUCO_STRINGIFY(__LINE__) + ": " +           \
                            cudaGetErrorName(error) + " " +             \
                            cudaGetErrorString(error)};                 \
    }                                                                   \
  } while (0);
#define CUCO_CUDA_TRY_1(_call) CUCO_CUDA_TRY_2(_call, cuco::cuda_error)


__device__ void select() {

}


void test_merlin(bool block = true) {
  TableOption options;
  options.init_capacity = 128 * 1024 * 1024UL;
  options.max_capacity = 128 * 1024 * 1024UL;
  options.dim = 8;
  options.max_hbm_for_vectors = nv::merlin::GB(30);
  options.evict_strategy = EvictStrategy::kLru;

//  MerlinHashTable table;
  std::shared_ptr<MerlinHashTable> table = std::make_shared<MerlinHashTable>();
  table->init(options);

  size_t total_key_length = 1ul << 27;
  size_t total_key_range = 1ul << 27;
  size_t total_key_per_op = 1ul << 20;
  int dim_size = 8;

  K* h_keys;
  V* h_values;
  K* d_keys;
  V* d_values;
  K* h_cold_keys;
  K* d_cold_keys;

  K* d_input_keys;
  V* d_output_values;
  bool* d_founds;

  K* d_evicted_keys;
  V* d_evicted_values;
  S* d_evicted_scores;

  h_keys = static_cast<K*>(std::malloc(total_key_length * sizeof(K)));
  h_values = static_cast<V*>(std::malloc(total_key_length * dim_size * sizeof(V)));
  h_cold_keys = static_cast<K*>(std::malloc(total_key_length * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_keys, total_key_length * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_values, total_key_length * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMalloc(&d_cold_keys, total_key_length * sizeof(K)));

  CUCO_CUDA_TRY(cudaMalloc(&d_input_keys, total_key_per_op * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_output_values, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMalloc(&d_founds, total_key_per_op * sizeof(bool)));
  CUCO_CUDA_TRY(cudaMemset(d_output_values, 0, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMemset(d_founds, 0, total_key_per_op * sizeof(bool)));

  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_keys, total_key_per_op * sizeof(K)));
  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_values, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMalloc(&d_evicted_scores, total_key_per_op * sizeof(S)));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_keys, 0, total_key_per_op * sizeof(K)));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_values, 0, total_key_per_op * dim_size * sizeof(V)));
  CUCO_CUDA_TRY(cudaMemset(d_evicted_scores, 0, total_key_per_op * sizeof(S)));

  std::random_device dev;
  std::mt19937 mt(dev());
  std::uniform_int_distribution<uint64_t> dist(0, 1ul << 50);
  for (int i = 0; i < total_key_length; i++) {
    h_keys[i] = dist(mt) % total_key_range;
    h_cold_keys[i] = dist(mt);
    for (int j = 0; j < dim_size; j++) {
      h_values[i * dim_size + j] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);;
    }
  }

  cudaMemcpy(d_keys, h_keys, total_key_length * sizeof(K), cudaMemcpyHostToDevice);
  cudaMemcpy(d_cold_keys, h_cold_keys, total_key_length * sizeof(K), cudaMemcpyHostToDevice);
  cudaMemcpy(d_values, h_values, total_key_length * dim_size * sizeof(V), cudaMemcpyHostToDevice);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaStreamSynchronize(stream);

  cout << "start" << endl;
  int count = 0;
  for (;;) {
    size_t start_index = dist(mt) % (total_key_length - total_key_per_op);
    cudaMemcpyAsync(d_input_keys, d_keys + start_index, total_key_per_op * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    cudaMemcpyAsync(d_input_keys, d_cold_keys + start_index, total_key_per_op / 10 * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    table->find(total_key_per_op, d_input_keys, d_output_values, d_founds, nullptr, stream);

    start_index = dist(mt) % (total_key_length - total_key_per_op);
    cudaMemcpyAsync(d_input_keys, d_keys + start_index, total_key_per_op * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    cudaMemcpyAsync(d_input_keys, d_cold_keys + start_index, total_key_per_op / 10 * sizeof(K),
                    cudaMemcpyDeviceToDevice, stream);
    table->insert_and_evict(total_key_per_op, d_input_keys, d_values + start_index * dim_size,
                            nullptr, d_evicted_keys, d_evicted_values, d_evicted_scores, stream);

    cudaStreamSynchronize(stream);
    count++;
    if (count % 100 == 0) {
    }
    cout << "find and insert_and_evict: " << count << endl;
  }

}

[Bug] insert_and_evict got wrong outputs

There are some bugs in the test case, and the eviction is not triggered.
After fix the condition, the test case will fail:
Lifann@300eb1f

[Bug] misaligned error for tlp_v2_upsert_and_evict_kernel_unique

Encounter this issue when running insert_and_evict_test on a 3090.

=========     Host Frame: [0x563ebe13a5ce]
=========                in /HierarchicalKV/build/./insert_and_evict_test
=========
========= Invalid __global__ write of size 16 bytes
=========     at 0x4ab0 in /usr/local/cuda/targets/x86_64-linux/include/sm_32_intrinsics.hpp:492:void nv::merlin::tlp_v2_upsert_and_evict_kernel_unique<long,float,unsigned long,uint4,unsigned int=128,unsigned int=16>(nv::merlin::Bucket<long,float,unsigned long>*,int*,unsigned long,unsigned int,unsigned int,nv::merlin::Bucket const *,uint4 const *,float const *,nv::merlin::Bucket*,nv::merlin::Bucket const **,float*,unsigned long,unsigned long*)
=========     by thread (76,0,0) in block (1132,0,0)
=========     Address 0xdc000000c5 is misaligned
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x7f1b1e0af40c]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x563ebe20c0cb]

Compilation error encountered on nvidia-docker with cuda version 12.0

When I compiled this project on nvidia-docker with cuda version 12.0, I got this error message:

====================================== Error log ======================================
......
[ 81%] Linking CXX executable merlin_hashtable_test
/usr/bin/ld: CMakeFiles/merlin_hashtable_benchmark.dir/benchmark/merlin_hashtable_benchmark.cc.cu.o: in function `nv::merlin::HashTable<unsigned long, float, unsigned long>::insert_or_assign(unsigned long, unsigned long const*, float const*, unsigned long const*, CUstream_st*, bool)':
......
/Workspace/HierarchicalKV/include/merlin_hashtable.cuh:1131: undefined reference to `nv::merlin::HashTable<unsigned long, float, unsigned long>::thrust_par'
collect2: error: ld returned 1 exit status
......
====================================== Error log ======================================
Even though I replaced the header file related to thrust with absolute path, I got the same error.
But when I used nvidia-docker with cuda version 11.7.0 to compile this project, the error mentioned above disappeared.

Compiling error when values_type != float32

Has eviction strategy been implemented for hierarchical kv?

Will there be memory exchange between hbm and hmem ? Thanks.

[Bug] Value Different Error When using find_or_insert on hybrid mode.

The problem is :

Expected equality of these values:
  h_vectors[i * options.dim + j]
    Which is: 1.09825e+12
  static_cast<float>(h_keys[i] * 0.00001)
    Which is: 1.53826e+14

We can reproduce the problem by rewriting the unit test: test_basic_when_full of find_or_insert_test

void test_basic_when_full(size_t max_hbm_for_vectors) {
  constexpr uint64_t INIT_CAPACITY = 1024* 1024;//1 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = INIT_CAPACITY;//1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  options.evict_strategy = nv::merlin::EvictStrategy::kCustomized;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_def_val;
  V** d_vectors_ptr;
  bool* d_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));

  // CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_insert = table->size(stream);
    std::cout << total_size_after_insert << std::endl; 
    CUDA_CHECK(cudaStreamSynchronize(stream));

    ///////////////////////////////////////// my check
    {
      CUDA_CHECK(cudaMemset(d_def_val, 0, KEY_NUM * sizeof(V) * options.dim));
      table->find(KEY_NUM, d_keys, d_def_val, d_found, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      int found_num = 0;

      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_vectors, d_def_val,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found[i]) {
          found_num++;
          for (int j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      static_cast<float>(h_keys[i] * 0.00001));
          }
        }
      }
      ASSERT_EQ(total_size_after_insert, found_num);
    }
    ///////////////////////////////////////// my check

    table->erase(KEY_NUM, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, 0);

    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_reinsert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_def_val));
  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

TEST(FindOrInsertTest, test_basic_when_full) {
  // test_basic_when_full(16);
  test_basic_when_full(0);
}

[Feature request] Add a new contain API for cache senario

We use pure-device hkv as a device cache for feature ids which has high probability to be used in next few steps training/inference. It's critical to have a contain API for performance if there are multiple level of caches, in different throughput, interact with each other:

  void contain(const size_type n, const key_type* keys,  // (n)
               bool* founds,                             // (n)
               score_type* scores = nullptr,             // (n)
               cudaStream_t stream = 0) const;

nvidia-merlin / hierarchicalkv Goto Github PK

hierarchicalkv's Issues

Recommend Projects

Recommend Topics

Recommend Org