19 changes: 16 additions & 3 deletions dindex_builder/dindex_builder.py
@@ -26,6 +26,8 @@ def build_dindex(profile_data_path, config: Dict, force: bool):
     text_path = Path(profile_data_path) / "text"
 
     # Read profiles and populate the Profile index
+    print("Reading profile files...")
+    profile_id_to_tbl = {}
     for file_path in os.listdir(profile_path):
         file_path = profile_path / file_path
         # file_path = os.path.join(profile_path, file_path)
@@ -38,8 +40,11 @@ def build_dindex(profile_data_path, config: Dict, force: bool):
profile["minhash"] = ",".join(map(str, profile["minhash"]))
# add profile
dindex.add_profile(profile)
profile_id_to_tbl[profile["id"]] = profile["sourceName"]

# Read text files and populate index
print("Reading text files...")
profile_ids = set() # there are dupicate profile ids in the text files
for csv_file_path in tqdm(os.listdir(text_path)):
csv_file_path = os.path.join(text_path, csv_file_path)
if not os.path.isfile(csv_file_path):
@@ -49,8 +54,11 @@ def build_dindex(profile_data_path, config: Dict, force: bool):
         df = pd.read_csv(csv_file_path, names=['profile_id', 'dbName', 'path', 'sourceName',
                                                'columnName', 'data'], sep=csv_delimiter, skiprows=1)
         for _, row in df.iterrows():
+            if row['profile_id'] in profile_ids:
+                continue
             dindex.add_text_content(row['profile_id'], row['dbName'], row['path'],
                                     row['sourceName'], row['columnName'], row['data'])
+            profile_ids.add(row['profile_id'])
 
     # Have to manually refresh fts index as per DuckDB's docs: "Note that the FTS index will not update automatically
     # when input table changes. A workaround of this limitation can be recreating the index to refresh."
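
The DuckDB caveat quoted in that comment is easy to reproduce in isolation. Below is a minimal sketch of the recreate-to-refresh workaround it refers to, using DuckDB's FTS extension against a hypothetical documents table (nothing here is from this repo):

# Minimal sketch of DuckDB's FTS refresh workaround; the `documents`
# table and its contents are hypothetical, not part of this repo.
import duckdb

con = duckdb.connect()
con.execute("INSTALL fts; LOAD fts;")
con.execute("CREATE TABLE documents (id INTEGER, body VARCHAR)")
con.execute("INSERT INTO documents VALUES (1, 'hello world')")

# Build the index once the table is populated.
con.execute("PRAGMA create_fts_index('documents', 'id', 'body')")

# The index does not track later inserts, so after new rows arrive it
# must be recreated; overwrite=1 replaces the stale index in place.
con.execute("INSERT INTO documents VALUES (2, 'fresh content')")
con.execute("PRAGMA create_fts_index('documents', 'id', 'body', overwrite=1)")
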
@@ -60,14 +68,19 @@ def build_dindex(profile_data_path, config: Dict, force: bool):

     # Create content_similarity edges
     # TODO: this could be done incrementally, every time a new node is added, at a cost in efficiency
+    print("Building content similarity edges...")
     profiles = dindex.get_minhashes()
     content_similarity_index = dindex.get_content_similarity_index()
+    visited_profile_ids = set()
     for profile in tqdm(profiles):
+        cur_id = profile['id']
+        visited_profile_ids.add(cur_id)
         neighbors = content_similarity_index.query(profile['minhash'])
         for neighbor in neighbors:
-            # TODO: Need to check that they are not from the same source
-            # TODO: Replace with actual attributes
-            dindex.add_undirected_edge(
+            # Skip pairs already added from the other endpoint, and columns from the same table.
+            if neighbor in visited_profile_ids or profile_id_to_tbl[cur_id] == profile_id_to_tbl[neighbor]:
+                continue
+            dindex.add_edge(
                 profile['id'], neighbor,
                 EdgeType.ATTRIBUTE_SYNTACTIC_SIMILARITY, {'similar': 1})
     print("Done building")
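For context, the new edge-building loop adds each undirected similarity edge exactly once: marking the current profile as visited before iterating its neighbors filters out both the self-match and any pair already emitted from its other endpoint, while the profile_id_to_tbl lookup suppresses edges between columns of the same table. A standalone sketch of the same pattern, assuming datasketch's MinHashLSH stands in for the repo's content-similarity index (names other than profile_id_to_tbl are illustrative):

# Sketch of the deduplicated edge-building pattern; assumes datasketch's
# MinHashLSH in place of the repo's content-similarity index.
from datasketch import MinHashLSH

def build_similarity_edges(profiles, profile_id_to_tbl, add_edge):
    # profiles: list of (profile_id, MinHash); add_edge(a, b) should store
    # a single undirected edge for the pair (a, b).
    lsh = MinHashLSH(threshold=0.7, num_perm=128)
    for pid, mh in profiles:
        lsh.insert(pid, mh)

    visited = set()
    for pid, mh in profiles:
        visited.add(pid)  # also filters the self-match that query() returns
        for neighbor in lsh.query(mh):
            # Pair already emitted from its other endpoint, or both
            # columns come from the same source table: skip it.
            if neighbor in visited or profile_id_to_tbl[pid] == profile_id_to_tbl[neighbor]:
                continue
            add_edge(pid, neighbor)
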
2 changes: 1 addition & 1 deletion dindex_store/graph_index_kuzu.py
@@ -92,7 +92,7 @@ def find_neighborhood(self, node_id, relation_type, hops) -> List:
         try:
             results = self.conn.execute(
                 f'''MATCH
-                (startNode:ColumnNode {{id : {node_id}}})-[:Edge*1..{hops}]->(endNode:ColumnNode)
+                (startNode:ColumnNode {{id : {node_id}}})-[:Edge*1..{hops}]-(endNode:ColumnNode)
                 RETURN endNode''')
             neighbors = []
             while results.has_next():
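
This one-character change is the query-side half of the same refactor: once add_edge stores each similarity edge in a single orientation, a directed -> pattern would miss neighbors whose edge happens to be stored the other way, so dropping the arrowhead makes the variable-length match undirected. A hedged illustration via Kuzu's Python API (the database path and node id are made up, and the ColumnNode/Edge schema is assumed to exist already):

# Contrast of directed vs. undirected variable-length matching in Kuzu;
# the database path and the node id 1 are hypothetical.
import kuzu

db = kuzu.Database("./dindex_graph")
conn = kuzu.Connection(db)

# Directed: only follows Edge relationships stored outward from the start node.
conn.execute(
    "MATCH (s:ColumnNode {id: 1})-[:Edge*1..2]->(e:ColumnNode) RETURN e")

# Undirected: traverses edges regardless of stored orientation, which is
# what find_neighborhood needs once each edge is stored only once.
conn.execute(
    "MATCH (s:ColumnNode {id: 1})-[:Edge*1..2]-(e:ColumnNode) RETURN e")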