From 12a58a1dfae1fd36c06ef5bca87678dea0a0c91a Mon Sep 17 00:00:00 2001
From: greatjudge
Date: Tue, 12 Mar 2024 02:09:28 +0300
Subject: [PATCH] write all

---
 .gitignore                     |   4 +
 hw_boolean_search.py           |  80 +++++++++-----
 index.py                       |  44 ++++++++
 searcher.py                    | 190 +++++++++++++++++++++++++++++++++
 start.sh                       |   2 +-
 utils/for_sorted.py            |  78 ++++++++++++++
 utils/test_intersect_sorted.py |  98 +++++++++++++++++
 utils/test_merge_sorted.py     | 157 +++++++++++++++++++++++++++
 8 files changed, 625 insertions(+), 28 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 index.py
 create mode 100644 searcher.py
 mode change 100644 => 100755 start.sh
 create mode 100644 utils/for_sorted.py
 create mode 100644 utils/test_intersect_sorted.py
 create mode 100644 utils/test_merge_sorted.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..416387e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.idea
+*.csv
+data
+__pycache__/
\ No newline at end of file
diff --git a/hw_boolean_search.py b/hw_boolean_search.py
index d98b8cc..4ce2f6c 100644
--- a/hw_boolean_search.py
+++ b/hw_boolean_search.py
@@ -3,37 +3,63 @@
 import argparse
 import codecs
-import sys
+from typing import Iterable
+from time import perf_counter
+
+from index import IndexMemory
+from searcher import Searcher
 
-class Index:
-    def __init__(self, index_file):
-        # TODO: build index
-        pass
 
-class QueryTree:
-    def __init__(self, qid, query):
-        # TODO: parse query and create query tree
-        pass
-
-    def search(self, index):
-        # TODO: lookup query terms in the index and implement boolean search logic
-        pass
+def create_and_fill_memory_index(docs_filepath):
+    print('Start creating memory index...')
+    start = perf_counter()
+    index = IndexMemory()
+    with open(docs_filepath) as f:
+        for i, line in enumerate(f, 1):
+            fields = line.rstrip('\n').split('\t')
+            docid, title, body = fields
+            doc = title + ' ' + body
+            index.add_document(doc, docid)
+            if i % 1000 == 0:
+                print(f"Processed {i} documents")
+    index.commit()
+    print(f'Memory index created, total time: {perf_counter() - start}')
+    return index
+
+
+INDEX_DIR = 'indexdir'
 
 
 class SearchResults:
-    def add(self, found):
-        # TODO: add next query's results
-        pass
+    def __init__(self):
+        self._qid_to_docids: dict[int, set] = {}
+
+    def add(self, qid: int, docids: Iterable[str]):
+        self._qid_to_docids[qid] = set(docids)
 
     def print_submission(self, objects_file, submission_file):
-        # TODO: generate submission file
-        pass
+        with open(objects_file) as obj_file, \
+                open(submission_file, 'w') as subm_file:
+            obj_file_iter = iter(obj_file)
+            next(obj_file_iter)  # skip the header line
+
+            subm_file.write('ObjectId,Relevance\n')
+
+            for line in obj_file_iter:
+                obj_id, qid, docid = line.rstrip('\n').split(',')
+                qid = int(qid)
+                found = 1 if docid in self._qid_to_docids[qid] else 0
+                subm_file.write(f'{obj_id},{found}\n')
 
 
 def main():
     # Command line arguments.
     parser = argparse.ArgumentParser(description='Homework: Boolean Search')
     parser.add_argument('--queries_file', required = True, help='queries.numerate.txt')
     parser.add_argument('--objects_file', required = True, help='objects.numerate.txt')
@@ -41,22 +67,23 @@ def main():
     parser.add_argument('--submission_file', required = True, help='output file with relevances')
     args = parser.parse_args()
 
-    # Build index.
-    index = Index(args.docs_file)
+    index = create_and_fill_memory_index(args.docs_file)
+
+    # Searcher
+    searcher = Searcher(index, top_k=30)
 
     # Process queries.
     search_results = SearchResults()
     with codecs.open(args.queries_file, mode='r', encoding='utf-8') as queries_fh:
-        for line in queries_fh:
+        for i, line in enumerate(queries_fh, 1):
             fields = line.rstrip('\n').split('\t')
             qid = int(fields[0])
             query = fields[1]
 
-            # Parse query.
-            query_tree = QueryTree(qid, query)
-
-            # Search and save results.
-            search_results.add(query_tree.search(index))
+            result_docids = searcher.search(query)
+            search_results.add(qid, result_docids)
+            if i % 100 == 0:
+                print(f'Processed {i} queries')
 
     # Generate submission file.
     search_results.print_submission(args.objects_file, args.submission_file)
@@ -64,4 +91,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
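
For reference, these are the line formats the script assumes, as implied by the
parsing code above; the sample values here are invented for illustration:

    # Hypothetical rows matching how main() and print_submission() parse input.
    docs_line = '123\tDoc title\tDoc body text'  # docs_file: docid<TAB>title<TAB>body
    docid, title, body = docs_line.rstrip('\n').split('\t')

    # objects_file: ObjectId,QueryId,DocumentId (after one header row)
    objects_line = '1,5,123'
    obj_id, qid, docid = objects_line.rstrip('\n').split(',')
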
diff --git a/index.py b/index.py
new file mode 100644
index 0000000..780ddb0
--- /dev/null
+++ b/index.py
@@ -0,0 +1,44 @@
+from collections import Counter
+from typing import Iterable
+
+
+class Tokenizer:
+    def tokenize(self, doc: str) -> Iterable[str]:
+        return doc.split()
+
+
+class IndexMemory:
+    def __init__(self) -> None:
+        self._inverted_index: dict[str, list[int]] = {}
+        self._docnum = 0
+        self._docnum_to_docid: dict[int, str] = {}
+        self._tokenizer = Tokenizer()
+        self.token_occurrences = Counter()
+        self.total_token_occur = 0
+
+    def add_document(self, doc: str, docid: str):
+        tokens = self._tokenizer.tokenize(doc)
+        for token in tokens:
+            self.token_occurrences[token] += 1
+            postings = self._inverted_index.setdefault(token, [])
+            # Documents arrive in increasing docnum order, so postings stay sorted.
+            if not postings or postings[-1] != self._docnum:
+                postings.append(self._docnum)
+        self._docnum_to_docid[self._docnum] = docid
+        self._docnum += 1
+
+    def commit(self):
+        self.total_token_occur = self.token_occurrences.total()
+
+    def postings(self, token: str) -> list[int]:
+        return self._inverted_index.get(token, [])
+
+    def has_token(self, token: str) -> bool:
+        return token in self._inverted_index
+
+    def get_docid(self, docnum: int) -> str:
+        return self._docnum_to_docid[docnum]
+
+    def language_model(self, token: str):
+        '''Unigram probability of `token`. Naive implementation.'''
+        assert self.total_token_occur != 0
+        return self.token_occurrences[token] / self.total_token_occur
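
A minimal usage sketch for IndexMemory (not part of the patch; the toy
documents are invented):

    from index import IndexMemory

    index = IndexMemory()
    index.add_document('cat sat on the mat', docid='d1')  # becomes docnum 0
    index.add_document('cat chased the dog', docid='d2')  # becomes docnum 1
    index.commit()  # fixes the total token count used by language_model()

    print(index.postings('cat'))        # [0, 1] -- docnums, always sorted
    print(index.get_docid(0))           # 'd1'
    print(index.language_model('cat'))  # 2/9: 'cat' occurs twice among 9 tokens
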
diff --git a/searcher.py b/searcher.py
new file mode 100644
index 0000000..8049fdb
--- /dev/null
+++ b/searcher.py
@@ -0,0 +1,190 @@
+from typing import Iterable
+from heapq import nlargest
+
+from index import IndexMemory
+from utils.for_sorted import (
+    merge_sorted_unique,
+    merge_sorted_unique_many,
+    intersect_sorted_unique,
+)
+
+
+class Searcher:
+    # TODO: TEST
+    def __init__(self, index: IndexMemory, top_k_for_existing=3, top_k=10):
+        self._index = index
+        self._service_symbols = {"(", ")", " ", "|"}
+        self._op_priority = {"(": 0, "|": 1, " ": 2}
+        self._rpn_operators = {' ', '|'}
+        self.top_k = top_k
+        self.top_k_for_existing = top_k_for_existing
+
+    def search(self, query: str) -> list[str]:
+        """
+        Return the list of doc ids matching `query`.
+        """
+        parsed = self._parse_query(query)
+        rpn = self._to_rpn(parsed)
+        postings = self._execute_rpn(rpn)
+        return [self._index.get_docid(docnum) for docnum in postings]
+
+    def _parse_query(self, query: str) -> list[str]:
+        """
+        Split a query string into a list of items (tokens and operators).
+        """
+        query_list = []
+        word = []
+        for ch in query:
+            if ch in self._service_symbols:
+                if word:
+                    query_list.append(''.join(word))
+                    word = []
+                query_list.append(ch)
+            else:
+                word.append(ch)
+        if word:
+            query_list.append(''.join(word))
+        return query_list
+
+    def _to_rpn(self, query: list[str]) -> list[str]:
+        """
+        Convert a parsed query to Reverse Polish Notation (shunting-yard).
+        """
+        bracket_cnt = 0
+        result: list[str] = []
+        stack = []
+        for w in query:
+            if w == "(":
+                stack.append(w)
+                bracket_cnt += 1
+            elif w == ")":
+                bracket_cnt -= 1
+                if bracket_cnt < 0:
+                    raise ValueError("wrong brackets")
+                while stack and stack[-1] != "(":
+                    result.append(stack.pop())
+                stack.pop()
+            elif w in self._op_priority:
+                while stack and self._op_priority[stack[-1]] >= self._op_priority[w]:
+                    result.append(stack.pop())
+                stack.append(w)
+            else:
+                result.append(w)
+        if bracket_cnt != 0:
+            raise ValueError("wrong brackets")
+        while stack:
+            result.append(stack.pop())
+        return result
+
+    def _execute_rpn(self, query: list[str]) -> list[int]:
+        """
+        Evaluate an RPN query and return its postings list.
+        """
+        stack: list[list[int]] = []
+        for w in query:
+            if w in self._rpn_operators:
+                if len(stack) < 2:
+                    raise ValueError("bad query")
+                snd, fst = stack.pop(), stack.pop()
+                if w == ' ':  # ' ' is AND, '|' is OR
+                    result = intersect_sorted_unique(fst, snd)
+                else:
+                    result = merge_sorted_unique(fst, snd)
+                stack.append(result)
+            else:
+                # Expand the term with spelling and keyboard-layout variants.
+                edited = {w}
+                if len(w) >= 2:
+                    edited |= edits1(w)
+                edited = self._limit_candidates(edited)
+
+                layouted = change_layout(w)
+                lay_edited = {layouted}
+                if len(layouted) >= 3:
+                    edited |= self._limit_candidates(edits1(layouted))
+                edited |= self._limit_candidates(lay_edited)
+                edited = self._get_top_k(edited, top_k=self.top_k)
+
+                postings = merge_sorted_unique_many(
+                    [self._index.postings(token) for token in edited]
+                )
+                stack.append(postings)
+        if len(stack) != 1:
+            raise ValueError("bad query")
+        return stack[0]
+
+    def _limit_candidates(self, tokens: Iterable[str]) -> set[str]:
+        '''The subset of `tokens` that actually occur in the index vocabulary.'''
+        output = set()
+        for token in tokens:
+            if self._index.has_token(token):
+                output.add(token)
+        return output
+
+    def _get_top_k(self, words: Iterable[str], top_k: int) -> list[str]:
+        return nlargest(top_k, words, key=self._index.language_model)
+
+
+def all_edits1(words: set[str]) -> set[str]:
+    output = set()
+    for w in words:
+        output |= edits1(w)
+    return output
+
+
+def edits1(word: str) -> set[str]:
+    '''All edits that are one edit away from `word`.'''
+    # Uppercase alphabets: edit candidates only match the index if the
+    # indexed text is uppercase too.
+    if is_rus(word):
+        letters = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
+    else:
+        letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+    deletes = [L + R[1:] for L, R in splits if R]
+    transposes = [L[:-1] + R[0] + L[-1] + R[1:] for L, R in splits if L and R]
+    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+    inserts = [L + c + R for L, R in splits for c in letters]
+    return set(deletes + transposes + replaces + inserts)
+
+
+def edits2(word: str) -> set[str]:
+    '''All edits that are two edits away from `word`.'''
+    output = set()
+    for candidate in edits1(word):
+        output.update(edits1(candidate))
+    return output
+
+
+ENG = '''qwertyuiop[]asdfghjkl;'zxcvbnm,./`QWERTYUIOP{}ASDFGHJKL:"ZXCVBNM<>?~'''
+RUS = '''йцукенгшщзхъфывапролджэячсмитьбю.ёЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬБЮ,Ё'''
+
+
+def layout_eng_2_rus(text):
+    layout = dict(zip(map(ord, ENG), RUS))
+    return text.translate(layout)
+
+
+def layout_rus_2_eng(text):
+    layout = dict(zip(map(ord, RUS), ENG))
+    return text.translate(layout)
+
+
+def change_layout(word: str) -> str:
+    if is_rus(word):
+        return layout_rus_2_eng(word)
+    return layout_eng_2_rus(word)
+
+
+def is_rus(text, alphabet=set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')):
+    return not alphabet.isdisjoint(text.lower())
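
To make the query grammar concrete: ' ' is an implicit AND that binds tighter
than '|', and parentheses regroup. A quick sketch poking at the private
helpers (any index will do, since no search is executed):

    from index import IndexMemory
    from searcher import Searcher

    searcher = Searcher(IndexMemory())

    print(searcher._parse_query('a b|c'))
    # ['a', ' ', 'b', '|', 'c']
    print(searcher._to_rpn(['a', ' ', 'b', '|', 'c']))
    # ['a', 'b', ' ', 'c', '|']  -- evaluates as (a AND b) OR c
    print(searcher._to_rpn(searcher._parse_query('a (b|c)')))
    # ['a', 'b', 'c', '|', ' ']  -- evaluates as a AND (b OR c)
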
diff --git a/start.sh b/start.sh
old mode 100644
new mode 100755
index 0dc4976..cf8ae8d
--- a/start.sh
+++ b/start.sh
@@ -1,4 +1,4 @@
-docker run -it --rm --name boolean-search -v "$PWD":/work_dir -w /work_dir python:3.12-alpine python hw_boolean_search.py \
+docker run -it --rm --name boolean-search -v "$PWD":/work_dir -w /work_dir python:3.12-alpine python -O hw_boolean_search.py \
     --queries_file /work_dir/data/queries.numerate.txt \
     --objects_file /work_dir/data/objects.numerate.txt\
     --docs_file /work_dir/data/docs.txt \
diff --git a/utils/for_sorted.py b/utils/for_sorted.py
new file mode 100644
index 0000000..87b28dd
--- /dev/null
+++ b/utils/for_sorted.py
@@ -0,0 +1,78 @@
+def merge_sorted_unique(lst1: list, lst2: list) -> list:
+    """Merge two sorted, duplicate-free lists into one sorted, duplicate-free list."""
+    # These asserts cost O(n log n) per call; start.sh runs python -O, which strips them.
+    assert lst1 == sorted(lst1)
+    assert is_all_unique_in_sorted(lst1)
+
+    assert lst2 == sorted(lst2)
+    assert is_all_unique_in_sorted(lst2)
+
+    result = []
+    i, j = 0, 0
+    while i < len(lst1) and j < len(lst2):
+        if lst1[i] == lst2[j]:
+            result.append(lst1[i])
+            i += 1
+            j += 1
+        elif lst1[i] < lst2[j]:
+            result.append(lst1[i])
+            i += 1
+        else:
+            result.append(lst2[j])
+            j += 1
+    while i < len(lst1):
+        result.append(lst1[i])
+        i += 1
+    while j < len(lst2):
+        result.append(lst2[j])
+        j += 1
+    return result
+
+
+def merge_sorted_unique_many(lists: list[list]) -> list:
+    """k-way merge of sorted, duplicate-free lists."""
+    merged = []
+    index = [0] * len(lists)
+    while True:
+        # Find the list whose current element is smallest.
+        i_min = -1
+        for i, lst in enumerate(lists):
+            if (
+                index[i] < len(lst) and
+                (i_min == -1 or lst[index[i]] < lists[i_min][index[i_min]])
+            ):
+                i_min = i
+        if i_min == -1:
+            break
+        min_elem = lists[i_min][index[i_min]]
+        if not merged or merged[-1] != min_elem:
+            merged.append(min_elem)
+        index[i_min] += 1
+    return merged
+
+
+def intersect_sorted_unique(lst1: list, lst2: list) -> list:
+    """Intersect two sorted, duplicate-free lists."""
+    assert lst1 == sorted(lst1)
+    assert is_all_unique_in_sorted(lst1)
+
+    assert lst2 == sorted(lst2)
+    assert is_all_unique_in_sorted(lst2)
+
+    result = []
+    i, j = 0, 0
+    while i < len(lst1) and j < len(lst2):
+        if lst1[i] == lst2[j]:
+            result.append(lst1[i])
+            i += 1
+            j += 1
+        elif lst1[i] < lst2[j]:
+            i += 1
+        else:
+            j += 1
+    return result
+
+
+def is_all_unique_in_sorted(lst: list) -> bool:
+    for i in range(len(lst) - 1):
+        if lst[i] == lst[i + 1]:
+            return False
+    return True
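
Hand-checked examples of the three helpers:

    from utils.for_sorted import (
        merge_sorted_unique,
        merge_sorted_unique_many,
        intersect_sorted_unique,
    )

    print(merge_sorted_unique([1, 3, 5], [2, 3, 4]))        # [1, 2, 3, 4, 5]
    print(intersect_sorted_unique([1, 2, 3], [2, 3, 4]))    # [2, 3]
    print(merge_sorted_unique_many([[1, 4], [2, 4], [3]]))  # [1, 2, 3, 4]
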
diff --git a/utils/test_intersect_sorted.py b/utils/test_intersect_sorted.py
new file mode 100644
index 0000000..37128c0
--- /dev/null
+++ b/utils/test_intersect_sorted.py
@@ -0,0 +1,98 @@
+import unittest
+from utils.for_sorted import intersect_sorted_unique
+
+
+class TestIntersect(unittest.TestCase):
+    def test_normal(self):
+        lst1 = [1, 2]
+        lst2 = [2, 3]
+        result = [2]
+        self.assertEqual(intersect_sorted_unique(lst1, lst2), result)
+
+    def test_different_len(self):
+        cases = [
+            {
+                "fst": [1, 2, 3, 4, 5],
+                "snd": [2, 5],
+                "result": [2, 5]
+            },
+            {
+                "fst": [1, 6],
+                "snd": [1, 4, 5, 6],
+                "result": [1, 6]
+            },
+            {
+                "fst": [1, 2],
+                "snd": [2, 4, 7],
+                "result": [2]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(intersect_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_empty(self):
+        cases = [
+            {
+                "fst": [1, 2],
+                "snd": [],
+                "result": []
+            },
+            {
+                "fst": [],
+                "snd": [3, 4],
+                "result": []
+            },
+            {
+                "fst": [],
+                "snd": [],
+                "result": []
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(intersect_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_one_elem(self):
+        cases = [
+            {
+                "fst": [1],
+                "snd": [1, 4],
+                "result": [1]
+            },
+            {
+                "fst": [1, 2],
+                "snd": [2],
+                "result": [2]
+            },
+            {
+                "fst": [1],
+                "snd": [4],
+                "result": []
+            },
+            {
+                "fst": [5],
+                "snd": [5],
+                "result": [5]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(intersect_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_some(self):
+        cases = [
+            {
+                "fst": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                "snd": [2, 4, 6, 8, 10, 11, 12, 13],
+                "result": [2, 4, 6, 8, 10]
+            },
+            {
+                "fst": [2, 4, 6, 8, 10, 11, 12, 13],
+                "snd": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                "result": [2, 4, 6, 8, 10]
+            },
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(intersect_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
\ No newline at end of file
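
The tables above enumerate edge cases by hand; a cheap complement (a
suggestion, not included in the patch) is a randomized cross-check against
Python sets:

    import random
    from utils.for_sorted import intersect_sorted_unique

    # random.sample yields unique values, so sorting gives valid inputs.
    for _ in range(100):
        fst = sorted(random.sample(range(50), random.randint(0, 20)))
        snd = sorted(random.sample(range(50), random.randint(0, 20)))
        assert intersect_sorted_unique(fst, snd) == sorted(set(fst) & set(snd))
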
diff --git a/utils/test_merge_sorted.py b/utils/test_merge_sorted.py
new file mode 100644
index 0000000..4bdedc5
--- /dev/null
+++ b/utils/test_merge_sorted.py
@@ -0,0 +1,157 @@
+import unittest
+from utils.for_sorted import merge_sorted_unique, merge_sorted_unique_many
+
+
+class TestMergeUnique(unittest.TestCase):
+    def test_normal(self):
+        lst1 = [1, 2]
+        lst2 = [3, 4]
+        result = [1, 2, 3, 4]
+        self.assertEqual(merge_sorted_unique(lst1, lst2), result)
+
+    def test_different_len(self):
+        cases = [
+            {
+                "fst": [1, 2, 3, 4, 5],
+                "snd": [6, 7],
+                "result": [1, 2, 3, 4, 5, 6, 7]
+            },
+            {
+                "fst": [1, 2],
+                "snd": [3, 4, 5, 6],
+                "result": [1, 2, 3, 4, 5, 6]
+            },
+            {
+                "fst": [1, 2],
+                "snd": [3, 6, 7],
+                "result": [1, 2, 3, 6, 7]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_empty(self):
+        cases = [
+            {
+                "fst": [1, 2],
+                "snd": [],
+                "result": [1, 2]
+            },
+            {
+                "fst": [],
+                "snd": [3, 4],
+                "result": [3, 4]
+            },
+            {
+                "fst": [],
+                "snd": [],
+                "result": []
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_one_elem(self):
+        cases = [
+            {
+                "fst": [1],
+                "snd": [3, 4],
+                "result": [1, 3, 4]
+            },
+            {
+                "fst": [1, 2],
+                "snd": [4],
+                "result": [1, 2, 4]
+            },
+            {
+                "fst": [1],
+                "snd": [4],
+                "result": [1, 4]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_inverse(self):
+        cases = [
+            {
+                "fst": [3, 4, 5],
+                "snd": [1, 2],
+                "result": [1, 2, 3, 4, 5]
+            },
+            {
+                "fst": [3],
+                "snd": [1],
+                "result": [1, 3]
+            },
+            {
+                "fst": [4, 5],
+                "snd": [1],
+                "result": [1, 4, 5]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_some(self):
+        cases = [
+            {
+                "fst": [1, 3, 5, 7],
+                "snd": [2, 4, 6, 8, 10],
+                "result": [1, 2, 3, 4, 5, 6, 7, 8, 10]
+            },
+            {
+                "fst": [2, 4, 6, 8, 10],
+                "snd": [1, 3, 5, 7],
+                "result": [1, 2, 3, 4, 5, 6, 7, 8, 10]
+            },
+            {
+                "fst": [1, 3, 5, 7, 9],
+                "snd": [2, 4, 6, 8, 10],
+                "result": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+    def test_only_unique(self):
+        cases = [
+            {
+                "fst": [1, 2],
+                "snd": [2, 3],
+                "result": [1, 2, 3]
+            },
+            {
+                "fst": [1, 2, 3, 4],
+                "snd": [2, 3, 4, 5, 6],
+                "result": [1, 2, 3, 4, 5, 6]
+            },
+            {
+                "fst": [1, 2, 3, 5, 6],
+                "snd": [3, 4, 5],
+                "result": [1, 2, 3, 4, 5, 6]
+            }
+        ]
+        for i, tc in enumerate(cases):
+            with self.subTest(i=i):
+                self.assertEqual(merge_sorted_unique(tc["fst"], tc["snd"]), tc["result"])
+
+
+class TestMergeSortedMany(unittest.TestCase):
+    def test_normal(self):
+        lst = [
+            [1, 2, 3],
+            [2, 3, 4],
+            [3, 4, 5],
+            [6, 7, 8],
+        ]
+        result = [1, 2, 3, 4, 5, 6, 7, 8]
+        self.assertEqual(
+            merge_sorted_unique_many(lst),
+            result
+        )
\ No newline at end of file