From 12a58a1dfae1fd36c06ef5bca87678dea0a0c91a Mon Sep 17 00:00:00 2001
From: greatjudge
Date: Tue, 12 Mar 2024 02:09:28 +0300
Subject: [PATCH] write all

---
 .gitignore                     |   4 +
 hw_boolean_search.py           |  80 +++++++++-----
 index.py                       |  44 ++++++++
 searcher.py                    | 190 +++++++++++++++++++++++++++++++++
 start.sh                       |   2 +-
 utils/for_sorted.py            |  78 ++++++++++++++
 utils/test_intersect_sorted.py |  98 +++++++++++++++++
 utils/test_merge_sorted.py     | 157 +++++++++++++++++++++++++++
 8 files changed, 625 insertions(+), 28 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 index.py
 create mode 100644 searcher.py
 mode change 100644 => 100755 start.sh
 create mode 100644 utils/for_sorted.py
 create mode 100644 utils/test_intersect_sorted.py
 create mode 100644 utils/test_merge_sorted.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..416387e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.idea
+*.csv
+data
+__pycache__/
\ No newline at end of file
diff --git a/hw_boolean_search.py b/hw_boolean_search.py
index d98b8cc..4ce2f6c 100644
--- a/hw_boolean_search.py
+++ b/hw_boolean_search.py
@@ -3,37 +3,63 @@
 import argparse
 import codecs
-import sys
+from typing import Iterable
+from time import perf_counter
+
+from index import IndexMemory
+from searcher import Searcher
 
-class Index:
-    def __init__(self, index_file):
-        # TODO: build index
-        pass
 
-class QueryTree:
-    def __init__(self, qid, query):
-        # TODO: parse query and create query tree
-        pass
-
-    def search(self, index):
-        # TODO: lookup query terms in the index and implement boolean search logic
-        pass
+def create_and_fill_memory_index(docs_filepath):
+    print('Start creating memory index...')
+    start = perf_counter()
+    index = IndexMemory()
+    with open(docs_filepath) as f:
+        for i, line in enumerate(f, 1):
+            fields = line.rstrip('\n').split('\t')
+            docid, title, body = fields
+            doc = title + ' ' + body
+            index.add_document(doc, docid)
+            if i % 1000 == 0:
+                print(f"Processed {i} documents")
+    index.commit()
+    print(f'Memory index created, total time: {perf_counter() - start}')
+    return index
+
+
+INDEX_DIR = 'indexdir'
 
 
 class SearchResults:
-    def add(self, found):
-        # TODO: add next query's results
-        pass
+    def __init__(self):
+        self._qid_to_docids: dict[int, set] = {}
+
+    def add(self, qid: int, docids: Iterable[str]):
+        self._qid_to_docids[qid] = set(docids)
 
     def print_submission(self, objects_file, submission_file):
-        # TODO: generate submission file
-        pass
+        with open(objects_file) as obj_file, \
+                open(submission_file, 'w') as subm_file:
+            obj_file_iter = iter(obj_file)
+            next(obj_file_iter)  # skip the header line
+
+            subm_file.write('ObjectId,Relevance\n')
+
+            for line in obj_file_iter:
+                obj_id, qid, docid = line.rstrip('\n').split(',')
+                qid = int(qid)
+                found = 1 if docid in self._qid_to_docids[qid] else 0
+                subm_file.write(f'{obj_id},{found}\n')
 
 
 def main():
     # Command line arguments.
     parser = argparse.ArgumentParser(description='Homework: Boolean Search')
     parser.add_argument('--queries_file', required = True, help='queries.numerate.txt')
     parser.add_argument('--objects_file', required = True, help='objects.numerate.txt')
@@ -41,22 +67,23 @@ def main():
     parser.add_argument('--submission_file', required = True, help='output file with relevances')
     args = parser.parse_args()
 
-    # Build index.
-    index = Index(args.docs_file)
+    index = create_and_fill_memory_index(args.docs_file)
+
+    # Searcher
+    searcher = Searcher(index, top_k=30)
 
     # Process queries.
     search_results = SearchResults()
     with codecs.open(args.queries_file, mode='r', encoding='utf-8') as queries_fh:
-        for line in queries_fh:
+        for i, line in enumerate(queries_fh, 1):
             fields = line.rstrip('\n').split('\t')
             qid = int(fields[0])
             query = fields[1]
 
-            # Parse query.
-            query_tree = QueryTree(qid, query)
-
-            # Search and save results.
-            search_results.add(query_tree.search(index))
+            result_docids = searcher.search(query)
+            search_results.add(qid, result_docids)
+            if i % 100 == 0:
+                print(f'Processed {i} queries')
 
     # Generate submission file.
     search_results.print_submission(args.objects_file, args.submission_file)
@@ -64,4 +91,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
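
For reference, these are the line formats the script assumes, as implied by the
parsing code above; the sample values here are invented for illustration:

    # Hypothetical rows matching how main() and print_submission() parse input.
    docs_line = '123\tDoc title\tDoc body text'  # docs_file: docid<TAB>title<TAB>body
    docid, title, body = docs_line.rstrip('\n').split('\t')

    # objects_file: ObjectId,QueryId,DocumentId (after one header row)
    objects_line = '1,5,123'
    obj_id, qid, docid = objects_line.rstrip('\n').split(',')
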
diff --git a/index.py b/index.py
new file mode 100644
index 0000000..780ddb0
--- /dev/null
+++ b/index.py
@@ -0,0 +1,44 @@
+from collections import Counter
+from typing import Iterable
+
+
+class Tokenizer:
+    def tokenize(self, doc: str) -> Iterable[str]:
+        return doc.split()
+
+
+class IndexMemory:
+    def __init__(self) -> None:
+        self._inverted_index: dict[str, list[int]] = {}
+        self._docnum = 0
+        self._docnum_to_docid: dict[int, str] = {}
+        self._tokenizer = Tokenizer()
+        self.token_occurrences = Counter()
+        self.total_token_occur = 0
+
+    def add_document(self, doc: str, docid: str):
+        tokens = self._tokenizer.tokenize(doc)
+        for token in tokens:
+            self.token_occurrences[token] += 1
+            postings = self._inverted_index.setdefault(token, [])
+            # Documents arrive in increasing docnum order, so postings stay sorted.
+            if not postings or postings[-1] != self._docnum:
+                postings.append(self._docnum)
+        self._docnum_to_docid[self._docnum] = docid
+        self._docnum += 1
+
+    def commit(self):
+        self.total_token_occur = self.token_occurrences.total()
+
+    def postings(self, token: str) -> list[int]:
+        return self._inverted_index.get(token, [])
+
+    def has_token(self, token: str) -> bool:
+        return token in self._inverted_index
+
+    def get_docid(self, docnum: int) -> str:
+        return self._docnum_to_docid[docnum]
+
+    def language_model(self, token: str):
+        '''Unigram probability of `token`. Naive implementation.'''
+        assert self.total_token_occur != 0
+        return self.token_occurrences[token] / self.total_token_occur
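
A minimal usage sketch for IndexMemory (not part of the patch; the toy
documents are invented):

    from index import IndexMemory

    index = IndexMemory()
    index.add_document('cat sat on the mat', docid='d1')  # becomes docnum 0
    index.add_document('cat chased the dog', docid='d2')  # becomes docnum 1
    index.commit()  # fixes the total token count used by language_model()

    print(index.postings('cat'))        # [0, 1] -- docnums, always sorted
    print(index.get_docid(0))           # 'd1'
    print(index.language_model('cat'))  # 2/9: 'cat' occurs twice among 9 tokens
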
diff --git a/searcher.py b/searcher.py
new file mode 100644
index 0000000..8049fdb
--- /dev/null
+++ b/searcher.py
@@ -0,0 +1,190 @@
+from typing import Iterable
+from heapq import nlargest
+
+from index import IndexMemory
+from utils.for_sorted import (
+    merge_sorted_unique,
+    merge_sorted_unique_many,
+    intersect_sorted_unique,
+)
+
+
+class Searcher:
+    # TODO: TEST
+    def __init__(self, index: IndexMemory, top_k_for_existing=3, top_k=10):
+        self._index = index
+        self._service_symbols = {"(", ")", " ", "|"}
+        self._op_priority = {"(": 0, "|": 1, " ": 2}
+        self._rpn_operators = {' ', '|'}
+        self.top_k = top_k
+        self.top_k_for_existing = top_k_for_existing
+
+    def search(self, query: str) -> list[str]:
+        """
+        Return the list of doc ids matching `query`.
+        """
+        parsed = self._parse_query(query)
+        rpn = self._to_rpn(parsed)
+        postings = self._execute_rpn(rpn)
+        return [self._index.get_docid(docnum) for docnum in postings]
+
+    def _parse_query(self, query: str) -> list[str]:
+        """
+        Split a query string into a list of items (tokens and operators).
+        """
+        query_list = []
+        word = []
+        for ch in query:
+            if ch in self._service_symbols:
+                if word:
+                    query_list.append(''.join(word))
+                    word = []
+                query_list.append(ch)
+            else:
+                word.append(ch)
+        if word:
+            query_list.append(''.join(word))
+        return query_list
+
+    def _to_rpn(self, query: list[str]) -> list[str]:
+        """
+        Convert a parsed query to Reverse Polish Notation (shunting-yard).
+        """
+        bracket_cnt = 0
+        result: list[str] = []
+        stack = []
+        for w in query:
+            if w == "(":
+                stack.append(w)
+                bracket_cnt += 1
+            elif w == ")":
+                bracket_cnt -= 1
+                if bracket_cnt < 0:
+                    raise ValueError("wrong brackets")
+                while stack and stack[-1] != "(":
+                    result.append(stack.pop())
+                stack.pop()
+            elif w in self._op_priority:
+                while stack and self._op_priority[stack[-1]] >= self._op_priority[w]:
+                    result.append(stack.pop())
+                stack.append(w)
+            else:
+                result.append(w)
+        if bracket_cnt != 0:
+            raise ValueError("wrong brackets")
+        while stack:
+            result.append(stack.pop())
+        return result
+
+    def _execute_rpn(self, query: list[str]) -> list[int]:
+        """
+        Evaluate an RPN query and return its postings list.
+        """
+        stack: list[list[int]] = []
+        for w in query:
+            if w in self._rpn_operators:
+                if len(stack) < 2:
+                    raise ValueError("bad query")
+                snd, fst = stack.pop(), stack.pop()
+                if w == ' ':  # ' ' is AND, '|' is OR
+                    result = intersect_sorted_unique(fst, snd)
+                else:
+                    result = merge_sorted_unique(fst, snd)
+                stack.append(result)
+            else:
+                # Expand the term with spelling and keyboard-layout variants.
+                edited = {w}
+                if len(w) >= 2:
+                    edited |= edits1(w)
+                edited = self._limit_candidates(edited)
+
+                layouted = change_layout(w)
+                lay_edited = {layouted}
+                if len(layouted) >= 3:
+                    edited |= self._limit_candidates(edits1(layouted))
+                edited |= self._limit_candidates(lay_edited)
+                edited = self._get_top_k(edited, top_k=self.top_k)
+
+                postings = merge_sorted_unique_many(
+                    [self._index.postings(token) for token in edited]
+                )
+                stack.append(postings)
+        if len(stack) != 1:
+            raise ValueError("bad query")
+        return stack[0]
+
+    def _limit_candidates(self, tokens: Iterable[str]) -> set[str]:
+        '''The subset of `tokens` that actually occur in the index vocabulary.'''
+        output = set()
+        for token in tokens:
+            if self._index.has_token(token):
+                output.add(token)
+        return output
+
+    def _get_top_k(self, words: Iterable[str], top_k: int) -> list[str]:
+        return nlargest(top_k, words, key=self._index.language_model)
+
+
+def all_edits1(words: set[str]) -> set[str]:
+    output = set()
+    for w in words:
+        output |= edits1(w)
+    return output
+
+
+def edits1(word: str) -> set[str]:
+    '''All edits that are one edit away from `word`.'''
+    # Uppercase alphabets: edit candidates only match the index if the
+    # indexed text is uppercase too.
+    if is_rus(word):
+        letters = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
+    else:
+        letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+    deletes = [L + R[1:] for L, R in splits if R]
+    transposes = [L[:-1] + R[0] + L[-1] + R[1:] for L, R in splits if L and R]
+    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+    inserts = [L + c + R for L, R in splits for c in letters]
+    return set(deletes + transposes + replaces + inserts)
+
+
+def edits2(word: str) -> set[str]:
+    '''All edits that are two edits away from `word`.'''
+    output = set()
+    for candidate in edits1(word):
+        output.update(edits1(candidate))
+    return output
+
+
+ENG = '''qwertyuiop[]asdfghjkl;'zxcvbnm,./`QWERTYUIOP{}ASDFGHJKL:"ZXCVBNM<>?~'''
+RUS = '''йцукенгшщзхъфывапролджэячсмитьбю.ёЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬБЮ,Ё'''
+
+
+def layout_eng_2_rus(text):
+    layout = dict(zip(map(ord, ENG), RUS))
+    return text.translate(layout)
+
+
+def layout_rus_2_eng(text):
+    layout = dict(zip(map(ord, RUS), ENG))
+    return text.translate(layout)
+
+
+def change_layout(word: str) -> str:
+    if is_rus(word):
+        return layout_rus_2_eng(word)
+    return layout_eng_2_rus(word)
+
+
+def is_rus(text, alphabet=set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')):
+    return not alphabet.isdisjoint(text.lower())
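
To make the query grammar concrete: ' ' is an implicit AND that binds tighter
than '|', and parentheses regroup. A quick sketch poking at the private
helpers (any index will do, since no search is executed):

    from index import IndexMemory
    from searcher import Searcher

    searcher = Searcher(IndexMemory())

    print(searcher._parse_query('a b|c'))
    # ['a', ' ', 'b', '|', 'c']
    print(searcher._to_rpn(['a', ' ', 'b', '|', 'c']))
    # ['a', 'b', ' ', 'c', '|']  -- evaluates as (a AND b) OR c
    print(searcher._to_rpn(searcher._parse_query('a (b|c)')))
    # ['a', 'b', 'c', '|', ' ']  -- evaluates as a AND (b OR c)
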
diff --git a/start.sh b/start.sh
old mode 100644
new mode 100755
index 0dc4976..cf8ae8d
--- a/start.sh
+++ b/start.sh
@@ -1,4 +1,4 @@
-docker run -it --rm --name boolean-search -v "$PWD":/work_dir -w /work_dir python:3.12-alpine python hw_boolean_search.py \
+docker run -it --rm --name boolean-search -v "$PWD":/work_dir -w /work_dir python:3.12-alpine python -O hw_boolean_search.py \
     --queries_file /work_dir/data/queries.numerate.txt \
     --objects_file /work_dir/data/objects.numerate.txt\
     --docs_file /work_dir/data/docs.txt \
diff --git a/utils/for_sorted.py b/utils/for_sorted.py
new file mode 100644
index 0000000..87b28dd
--- /dev/null
+++ b/utils/for_sorted.py
@@ -0,0 +1,78 @@
+def merge_sorted_unique(lst1: list, lst2: list) -> list:
+    """Merge two sorted, duplicate-free lists into one sorted, duplicate-free list."""
+    # These asserts cost O(n log n) per call; start.sh runs python -O, which strips them.
+    assert lst1 == sorted(lst1)
+    assert is_all_unique_in_sorted(lst1)
+
+    assert lst2 == sorted(lst2)
+    assert is_all_unique_in_sorted(lst2)
+
+    result = []
+    i, j = 0, 0
+    while i < len(lst1) and j < len(lst2):
+        if lst1[i] == lst2[j]:
+            result.append(lst1[i])
+            i += 1
+            j += 1
+        elif lst1[i] < lst2[j]:
+            result.append(lst1[i])
+            i += 1
+        else:
+            result.append(lst2[j])
+            j += 1
+    while i < len(lst1):
+        result.append(lst1[i])
+        i += 1
+    while j < len(lst2):
+        result.append(lst2[j])
+        j += 1
+    return result
+
+
+def merge_sorted_unique_many(lists: list[list]) -> list:
+    """k-way merge of sorted, duplicate-free lists."""
+    merged = []
+    index = [0] * len(lists)
+    while True:
+        # Find the list whose current element is smallest.
+        i_min = -1
+        for i, lst in enumerate(lists):
+            if (
+                index[i] < len(lst) and
+                (i_min == -1 or lst[index[i]] < lists[i_min][index[i_min]])
+            ):
+                i_min = i
+        if i_min == -1:
+            break
+        min_elem = lists[i_min][index[i_min]]
+        if not merged or merged[-1] != min_elem:
+            merged.append(min_elem)
+        index[i_min] += 1
+    return merged
+
+
+def intersect_sorted_unique(lst1: list, lst2: list) -> list:
+    """Intersect two sorted, duplicate-free lists."""
+    assert lst1 == sorted(lst1)
+    assert is_all_unique_in_sorted(lst1)
+
+    assert lst2 == sorted(lst2)
+    assert is_all_unique_in_sorted(lst2)
+
+    result = []
+    i, j = 0, 0
+    while i < len(lst1) and j < len(lst2):
+        if lst1[i] == lst2[j]:
+            result.append(lst1[i])
+            i += 1
+            j += 1
+        elif lst1[i] < lst2[j]:
+            i += 1
+        else:
+            j += 1
+    return result
+
+
+def is_all_unique_in_sorted(lst: list) -> bool:
+    for i in range(len(lst) - 1):
+        if lst[i] == lst[i + 1]:
+            return False
+    return True
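
Hand-checked examples of the three helpers:

    from utils.for_sorted import (
        merge_sorted_unique,
        merge_sorted_unique_many,
        intersect_sorted_unique,
    )

    print(merge_sorted_unique([1, 3, 5], [2, 3, 4]))        # [1, 2, 3, 4, 5]
    print(intersect_sorted_unique([1, 2, 3], [2, 3, 4]))    # [2, 3]
    print(merge_sorted_unique_many([[1, 4], [2, 4], [3]]))  # [1, 2, 3, 4]
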
diff --git a/utils/test_intersect_sorted.py b/utils/test_intersect_sorted.py
new file mode 100644
index 0000000..37128c0
--- /dev/null
+++ b/utils/test_intersect_sorted.py
@@ -0,0 +1,98 @@
+import unittest
+from utils.for_sorted import intersect_sorted_unique
+
+
+class TestIntersect(unittest.TestCase):
+    def test_normal(self):
+        lst1 = [1, 2]
+        lst2 = [2, 3]
+        result = [2]
+        self.assertEqual(intersect_sorted_unique(lst1, lst2), result)
+
+    def test_different_len(self):
+        cases = [
+            {
+                "fst": [1, 2, 3, 4, 5],
+                "snd": [2, 5],
+                "result": [2, 5]
+            },
+            {
+                "fst": [1, 6],
+                "snd": [1, 4, 5, 6],
+                "result": [1, 6]
+            },
+            {
+                "fst": [1, 2],
+                "snd": [2, 4, 7],
+                "result": [2]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(intersect_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_empty(self):
+        cases = [
+            {
+                "fst": [1, 2],
+                "snd": [],
+                "result": []
+            },
+            {
+                "fst": [],
+                "snd": [3, 4],
+                "result": []
+            },
+            {
+                "fst": [],
+                "snd": [],
+                "result": []
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(intersect_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_one_elem(self):
+        cases = [
+            {
+                "fst": [1],
+                "snd": [1, 4],
+                "result": [1]
+            },
+            {
+                "fst": [1, 2],
+                "snd": [2],
+                "result": [2]
+            },
+            {
+                "fst": [1],
+                "snd": [4],
+                "result": []
+            },
+            {
+                "fst": [5],
+                "snd": [5],
+                "result": [5]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(intersect_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_some(self):
+        cases = [
+            {
+                "fst": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                "snd": [2, 4, 6, 8, 10, 11, 12, 13],
+                "result": [2, 4, 6, 8, 10]
+            },
+            {
+                "fst": [2, 4, 6, 8, 10, 11, 12, 13],
+                "snd": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                "result": [2, 4, 6, 8, 10]
+            },
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(intersect_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
\ No newline at end of file
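
The tables above enumerate edge cases by hand; a cheap complement (a
suggestion, not included in the patch) is a randomized cross-check against
Python sets:

    import random
    from utils.for_sorted import intersect_sorted_unique

    # random.sample yields unique values, so sorting gives valid inputs.
    for _ in range(100):
        fst = sorted(random.sample(range(50), random.randint(0, 20)))
        snd = sorted(random.sample(range(50), random.randint(0, 20)))
        assert intersect_sorted_unique(fst, snd) == sorted(set(fst) & set(snd))
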
diff --git a/utils/test_merge_sorted.py b/utils/test_merge_sorted.py
new file mode 100644
index 0000000..4bdedc5
--- /dev/null
+++ b/utils/test_merge_sorted.py
@@ -0,0 +1,157 @@
+import unittest
+from utils.for_sorted import merge_sorted_unique, merge_sorted_unique_many
+
+
+class TestMergeUnique(unittest.TestCase):
+    def test_normal(self):
+        lst1 = [1, 2]
+        lst2 = [3, 4]
+        result = [1, 2, 3, 4]
+        self.assertEqual(merge_sorted_unique(lst1, lst2), result)
+
+    def test_different_len(self):
+        cases = [
+            {
+                "fst": [1, 2, 3, 4, 5],
+                "snd": [6, 7],
+                "result": [1, 2, 3, 4, 5, 6, 7]
+            },
+            {
+                "fst": [1, 2],
+                "snd": [3, 4, 5, 6],
+                "result": [1, 2, 3, 4, 5, 6]
+            },
+            {
+                "fst": [1, 2],
+                "snd": [3, 6, 7],
+                "result": [1, 2, 3, 6, 7]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_empty(self):
+        cases = [
+            {
+                "fst": [1, 2],
+                "snd": [],
+                "result": [1, 2]
+            },
+            {
+                "fst": [],
+                "snd": [3, 4],
+                "result": [3, 4]
+            },
+            {
+                "fst": [],
+                "snd": [],
+                "result": []
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_one_elem(self):
+        cases = [
+            {
+                "fst": [1],
+                "snd": [3, 4],
+                "result": [1, 3, 4]
+            },
+            {
+                "fst": [1, 2],
+                "snd": [4],
+                "result": [1, 2, 4]
+            },
+            {
+                "fst": [1],
+                "snd": [4],
+                "result": [1, 4]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_inverse(self):
+        cases = [
+            {
+                "fst": [3, 4, 5],
+                "snd": [1, 2],
+                "result": [1, 2, 3, 4, 5]
+            },
+            {
+                "fst": [3],
+                "snd": [1],
+                "result": [1, 3]
+            },
+            {
+                "fst": [4, 5],
+                "snd": [1],
+                "result": [1, 4, 5]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_some(self):
+        cases = [
+            {
+                "fst": [1, 3, 5, 7],
+                "snd": [2, 4, 6, 8, 10],
+                "result": [1, 2, 3, 4, 5, 6, 7, 8, 10]
+            },
+            {
+                "fst": [2, 4, 6, 8, 10],
+                "snd": [1, 3, 5, 7],
+                "result": [1, 2, 3, 4, 5, 6, 7, 8, 10]
+            },
+            {
+                "fst": [1, 3, 5, 7, 9],
+                "snd": [2, 4, 6, 8, 10],
+                "result": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_only_unique(self):
+        cases = [
+            {
+                "fst": [1, 2],
+                "snd": [2, 3],
+                "result": [1, 2, 3]
+            },
+            {
+                "fst": [1, 2, 3, 4],
+                "snd": [2, 3, 4, 5, 6],
+                "result": [1, 2, 3, 4, 5, 6]
+            },
+            {
+                "fst": [1, 2, 3, 5, 6],
+                "snd": [3, 4, 5],
+                "result": [1, 2, 3, 4, 5, 6]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+
+class TestMergeSortedMany(unittest.TestCase):
+    def test_normal(self):
+        lst = [
+            [1, 2, 3],
+            [2, 3, 4],
+            [3, 4, 5],
+            [6, 7, 8],
+        ]
+        result = [1, 2, 3, 4, 5, 6, 7, 8]
+        self.assertEqual(
+            merge_sorted_unique_many(lst),
+            result
+        )
\ No newline at end of file