11#pragma once
22#include " label_bitmask.h"
3+ #include " integer_label_vector.h"
34#include " percentile_stats.h"
5+ #include " tsl/robin_set.h"
46#include < string>
57
68namespace diskann
@@ -20,6 +22,113 @@ class label_helper
2022
// Declaration only; the definition is not part of this header excerpt.
// Takes the path of a bitmask label file plus non-const references to a
// simple_bitmask_buf and a point count.
// NOTE(review): presumably fills bitmask_buf / num_points and reports success
// through the bool result -- confirm against the definition.
2123 bool read_bitmask_from_file (const std::string &bitmask_label_file, simple_bitmask_buf &bitmask_buf,
2224 size_t & num_points);
25+
// Declaration only; the definition is not part of this header excerpt.
// Takes the label file path by const reference and non-const references to a
// point count, an integer_label_vector, a set of uint32_t labels and a
// TableStats object.
// NOTE(review): the non-const references look like outputs populated while
// parsing label_file -- confirm against the definition.
26+ bool parse_label_file_in_integer (
27+ const std::string& label_file,
28+ size_t & num_points,
29+ integer_label_vector& integer_vector,
30+ tsl::robin_set<uint32_t >& labels, TableStats &table_stats);
31+
32+ template <typename LabelT>
33+ bool load_label_map (
34+ const std::string& label_map_file,
35+ std::unordered_map<std::string, LabelT>& label_map)
36+ {
37+ std::ifstream infile (label_map_file, std::ios::binary);
38+ if (infile.fail ())
39+ {
40+ throw diskann::ANNException (std::string (" Failed to open file " ) + label_map_file, -1 );
41+ }
42+ infile.seekg (0 , std::ios::end);
43+ size_t file_size = infile.tellg ();
44+
45+ std::string buffer (file_size, ' ' );
46+
47+ infile.seekg (0 , std::ios::beg);
48+ infile.read (&buffer[0 ], file_size);
49+ infile.close ();
50+
51+ unsigned line_cnt = 0 ;
52+
53+ size_t cur_pos = 0 ;
54+ size_t next_pos = 0 ;
55+ size_t lbl_pos = 0 ;
56+ std::string token;
57+ std::string labe_str;
58+ while (cur_pos < file_size && cur_pos != std::string::npos)
59+ {
60+ next_pos = buffer.find (' \n ' , cur_pos);
61+ if (next_pos == std::string::npos)
62+ {
63+ break ;
64+ }
65+
66+ lbl_pos = search_string_range (buffer, ' \t ' , cur_pos, next_pos);
67+ labe_str.assign (buffer.c_str () + cur_pos, lbl_pos - cur_pos);
68+
69+ token.assign (buffer.c_str () + lbl_pos + 1 , next_pos - lbl_pos - 1 );
70+ LabelT label_num = (LabelT)std::stoul (token);
71+
72+ label_map[labe_str] = label_num;
73+
74+ cur_pos = next_pos + 1 ;
75+
76+ line_cnt++;
77+ }
78+
79+ return true ;
80+ }
81+
82+ template <typename LabelT>
83+ bool load_label_medoids (
84+ const std::string& label_medoids_file,
85+ std::unordered_map<LabelT, uint32_t >& label_to_start_id)
86+ {
87+ std::ifstream infile (label_medoids_file, std::ios::binary);
88+ if (infile.fail ())
89+ {
90+ throw diskann::ANNException (std::string (" Failed to open file " ) + label_medoids_file, -1 );
91+ }
92+ infile.seekg (0 , std::ios::end);
93+ size_t file_size = infile.tellg ();
94+
95+ std::string buffer (file_size, ' ' );
96+
97+ infile.seekg (0 , std::ios::beg);
98+ infile.read (&buffer[0 ], file_size);
99+ infile.close ();
100+
101+ unsigned line_cnt = 0 ;
102+
103+ size_t cur_pos = 0 ;
104+ size_t next_pos = 0 ;
105+ size_t lbl_pos = 0 ;
106+ std::string token;
107+ while (cur_pos < file_size && cur_pos != std::string::npos)
108+ {
109+ next_pos = buffer.find (' \n ' , cur_pos);
110+ if (next_pos == std::string::npos)
111+ {
112+ break ;
113+ }
114+
115+ lbl_pos = search_string_range (buffer, ' ,' , cur_pos, next_pos);
116+ token.assign (buffer.c_str () + cur_pos, lbl_pos - cur_pos);
117+ LabelT label_num = (LabelT)std::stoul (token);
118+
119+ token.assign (buffer.c_str () + lbl_pos + 1 , next_pos - lbl_pos - 1 );
120+ uint32_t medoid = (uint32_t )std::stoul (token);
121+
122+ label_to_start_id[label_num] = medoid;
123+
124+ cur_pos = next_pos + 1 ;
125+
126+ line_cnt++;
127+ }
128+
129+ return true ;
130+ }
131+
23132 private:
// Helper called by load_label_map / load_label_medoids to locate the delimiter
// `ch` in `str` between the offsets `start` and `end` of one line; the callers
// use the returned value as the index of the delimiter.
// The definition is not part of this header excerpt.
// NOTE(review): confirm whether `end` is exclusive and which value is returned
// when `ch` does not occur in the range.
24133 size_t search_string_range (const std::string& str, char ch, size_t start, size_t end);
25134};
0 commit comments