diff --git a/include/color_helper.h b/include/color_helper.h
new file mode 100644
index 000000000..5982f4946
--- /dev/null
+++ b/include/color_helper.h
@@ -0,0 +1,30 @@
+#pragma once
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace diskann
+{
+
+// Reads and writes the binary "color" file that stores one seller id per point location.
+//
+// File layout (every field is a raw 32-bit unsigned integer in host byte order):
+//   num_points | color_size | num_unique_sellers | num_points seller ids
+// color_size is the byte width of one seller id and is always sizeof(std::uint32_t).
+class color_helper
+{
+  public:
+    // Writes location_to_seller and num_unique_sellers to filepath, replacing any existing file.
+    // Throws diskann::ANNException when the file cannot be opened or fully written, or when
+    // location_to_seller has more entries than the 32-bit count field can represent.
+    void write_color_binfile(const std::string &filepath, const std::vector<std::uint32_t> &location_to_seller,
+                             std::uint32_t num_unique_sellers);
+
+    // Loads a file written by write_color_binfile into location_to_seller and num_unique_sellers.
+    // Returns false, leaving both output arguments unchanged, when the file cannot be opened or
+    // read, color_size is not sizeof(std::uint32_t), or the file size disagrees with the header.
+    bool load_color_binfile(const std::string &filepath, std::vector<std::uint32_t> &location_to_seller,
+                            std::uint32_t &num_unique_sellers);
+};
+
+} // namespace diskann
diff --git a/src/color_helper.cpp b/src/color_helper.cpp
new file mode 100644
index 000000000..e35066c8e
--- /dev/null
+++ b/src/color_helper.cpp
@@ -0,0 +1,103 @@
+#include "color_helper.h"
+#include "ann_exception.h"
+#include <cstddef>
+#include <fstream>
+#include <limits>
+#include <utility>
+
+namespace diskann
+{
+
+namespace
+{
+// The header holds three 32-bit fields: num_points, color_size and num_unique_sellers.
+constexpr std::size_t COLOR_HEADER_BYTES = 3 * sizeof(std::uint32_t);
+} // namespace
+
+void color_helper::write_color_binfile(const std::string &filepath,
+                                       const std::vector<std::uint32_t> &location_to_seller,
+                                       std::uint32_t num_unique_sellers)
+{
+    // format: num_points, color_size, unique color count, color content
+
+    // num_points is stored in 32 bits; refuse to write a silently truncated count.
+    if (location_to_seller.size() > std::numeric_limits<std::uint32_t>::max())
+    {
+        throw diskann::ANNException(std::string("Too many points for color file ") + filepath, -1);
+    }
+
+    std::ofstream outfile(filepath, std::ios::binary);
+    if (outfile.fail())
+    {
+        throw diskann::ANNException(std::string("Failed to open file ") + filepath, -1);
+    }
+
+    const std::uint32_t num_points = static_cast<std::uint32_t>(location_to_seller.size());
+    const std::uint32_t color_size = static_cast<std::uint32_t>(sizeof(std::uint32_t));
+    const std::size_t color_data_size = location_to_seller.size() * sizeof(std::uint32_t);
+
+    outfile.write(reinterpret_cast<const char *>(&num_points), sizeof(num_points));
+    outfile.write(reinterpret_cast<const char *>(&color_size), sizeof(color_size));
+    outfile.write(reinterpret_cast<const char *>(&num_unique_sellers), sizeof(num_unique_sellers));
+    outfile.write(reinterpret_cast<const char *>(location_to_seller.data()),
+                  static_cast<std::streamsize>(color_data_size));
+    outfile.close();
+
+    // A failed write or a failed flush in close() leaves failbit set on the stream.
+    if (outfile.fail())
+    {
+        throw diskann::ANNException(std::string("Failed to write file ") + filepath, -1);
+    }
+}
+
+bool color_helper::load_color_binfile(const std::string &filepath, std::vector<std::uint32_t> &location_to_seller,
+                                      std::uint32_t &num_unique_sellers)
+{
+    std::ifstream infile(filepath, std::ios::binary);
+    if (infile.fail())
+    {
+        return false;
+    }
+
+    infile.seekg(0, std::ios::end);
+    const std::streamoff end_offset = infile.tellg();
+    infile.seekg(0, std::ios::beg);
+    // tellg() yields -1 on failure, and a valid file is never shorter than its header.
+    if (infile.fail() || end_offset < static_cast<std::streamoff>(COLOR_HEADER_BYTES))
+    {
+        return false;
+    }
+    const std::uint64_t file_size = static_cast<std::uint64_t>(end_offset);
+
+    std::uint32_t num_points_in_file = 0;
+    std::uint32_t color_size = 0;
+    std::uint32_t unique_sellers_in_file = 0;
+    infile.read(reinterpret_cast<char *>(&num_points_in_file), sizeof(num_points_in_file));
+    infile.read(reinterpret_cast<char *>(&color_size), sizeof(color_size));
+    infile.read(reinterpret_cast<char *>(&unique_sellers_in_file), sizeof(unique_sellers_in_file));
+    if (infile.fail() || color_size != sizeof(std::uint32_t))
+    {
+        return false;
+    }
+
+    // Widen before multiplying: a 32-bit product wraps once the file holds 2^30 or more points.
+    const std::uint64_t color_data_size = static_cast<std::uint64_t>(num_points_in_file) * color_size;
+    if (file_size != COLOR_HEADER_BYTES + color_data_size)
+    {
+        return false;
+    }
+
+    // Read into a temporary so the caller's vector is only replaced after a complete read.
+    std::vector<std::uint32_t> loaded(num_points_in_file);
+    infile.read(reinterpret_cast<char *>(loaded.data()), static_cast<std::streamsize>(color_data_size));
+    if (infile.fail())
+    {
+        return false;
+    }
+
+    location_to_seller = std::move(loaded);
+    num_unique_sellers = unique_sellers_in_file;
+    return true;
+}
+
+} // namespace diskann
diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index f486b1060..846fb3b89 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -1206,7 +1206,8 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const std::string mem_univ_label_file = mem_index_path + "_universal_label.txt"; std::string disk_univ_label_file = disk_index_path + "_universal_label.txt"; std::string disk_labels_int_map_file = disk_index_path + "_labels_map.txt"; - std::string disk_seller_file = disk_index_path + "_sellers.txt"; + std::string old_disk_seller_file = disk_index_path + "_sellers.txt"; + std::string disk_seller_file = disk_index_path + "_sellers.bin"; std::string dummy_remap_file = disk_index_path + "_dummy_remap.txt"; // remap will be used if we break-up points of // high label-density to create copies @@ -1392,7 +1393,14 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const std::remove(labels_file_to_use.c_str()); } - std::string seller_mem_file = std::string(mem_index_path) + "_sellers.txt"; + std::string old_seller_mem_file = std::string(mem_index_path) + "_sellers.txt"; + if (file_exists(old_seller_mem_file)) + { + copy_file(old_seller_mem_file, old_disk_seller_file); + std::remove(old_seller_mem_file.c_str()); + } + + std::string seller_mem_file = 
std::string(mem_index_path) + "_sellers.bin"; if (file_exists(seller_mem_file)) { copy_file(seller_mem_file, disk_seller_file); diff --git a/src/index.cpp b/src/index.cpp index 7a5d7130f..3d6a15c7a 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -15,6 +15,7 @@ #include "windows_customizations.h" #include "tag_uint128.h" #include "label_helper.h" +#include "color_helper.h" #if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) #include "gperftools/malloc_extension.h" #endif @@ -290,8 +291,12 @@ void Index::save(const char *filename, bool compact_before_save if (!_save_as_one_file) { if (_diverse_index) { - std::string index_seller_file = std::string(filename) + "_sellers.txt"; - std::filesystem::copy(_seller_file, index_seller_file); + std::string index_seller_file = std::string(filename) + "_sellers.bin"; + color_helper().write_color_binfile(index_seller_file, _location_to_seller, _num_unique_sellers); + + // will remove after new format loaded + std::string old_seller_file = std::string(filename) + "_sellers.txt"; + std::filesystem::copy(_seller_file, old_seller_file); } if (_filtered_index) @@ -621,11 +626,35 @@ void Index::load(const char *filename, uint32_t num_threads, ui throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } - std::string index_seller_file = std::string(filename) + "_sellers.txt"; - if (file_exists(index_seller_file)) + std::string index_seller_file = std::string(filename) + "_sellers.bin"; + std::string old_index_seller_file = std::string(filename) + "_sellers.txt"; + if (file_exists(index_seller_file)) + { + //uint64_t nrows_seller_file; + //parse_seller_file(index_seller_file, nrows_seller_file); + if (!color_helper().load_color_binfile(index_seller_file, _location_to_seller, _num_unique_sellers) + || _location_to_seller.size() != data_file_num_pts) + { + std::stringstream stream; + stream << "ERROR: When loading seller file " << index_seller_file << std::endl; + 
diskann::cerr << stream.str() << std::endl; + throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); + } + + _diverse_index = true; + } + else if (file_exists(old_index_seller_file)) { uint64_t nrows_seller_file; - parse_seller_file(index_seller_file, nrows_seller_file); + parse_seller_file(old_index_seller_file, nrows_seller_file); + if (nrows_seller_file != data_file_num_pts) + { + std::stringstream stream; + stream << "ERROR: When loading old seller file " << old_index_seller_file << " found " << nrows_seller_file + << " rows, expected " << data_file_num_pts << std::endl; + diskann::cerr << stream.str() << std::endl; + throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); + } _diverse_index = true; } diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index 48e99342d..990f40bea 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -9,7 +9,9 @@ #include "label_helper.h" #include "pq_flash_index.h" #include "cosine_similarity.h" +#include "color_helper.h" #include +#include #ifdef _WINDOWS #include "windows_aligned_file_reader.h" @@ -562,7 +564,7 @@ template int PQFlashIndex::load(uint32_ std::string labels_to_medoids = std::string(index_prefix) + "_labels_to_medoids.txt"; std::string labels_map_file = std::string(index_prefix) + "_labels_map.txt"; std::string univ_label_file = std::string(index_prefix) + "_universal_label.txt"; - std::string seller_file = std::string(index_prefix) + "_sellers.txt"; + std::string seller_file = std::string(index_prefix) + "_sellers.bin"; #ifdef EXEC_ENV_OLS return load_from_separate_paths(files, num_threads, disk_index_file.c_str(), pq_table_bin.c_str(), @@ -828,8 +830,30 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons if (file_exists(seller_filepath)) { - uint64_t nrows_seller_file; - parse_seller_file(seller_filepath, nrows_seller_file); + std::filesystem::path seller_path(seller_filepath); + std::string extension = 
seller_path.extension().string(); + if (extension != ".bin") + { + uint64_t nrows_seller_file; + parse_seller_file(seller_filepath, nrows_seller_file); + if (nrows_seller_file != _num_points) + { + std::stringstream stream; + stream << "Error loading seller file. Exiting." << std::endl; + throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); + } + } + else + { + if (!color_helper().load_color_binfile(seller_filepath, _location_to_seller, _num_unique_sellers) + || _location_to_seller.size() != _num_points) + { + std::stringstream stream; + stream << "Error loading seller file. Exiting." << std::endl; + throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); + } + } + _diverse_index = true; }