4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.idea
*.csv
data
__pycache__/
80 changes: 53 additions & 27 deletions hw_boolean_search.py
@@ -3,65 +3,91 @@

import argparse
import codecs
import sys
from typing import Iterable
from time import perf_counter

from index import IndexMemory
from searcher import Searcher

class Index:
def __init__(self, index_file):
# TODO: build index
pass


class QueryTree:
def __init__(self, qid, query):
# TODO: parse query and create query tree
pass

def search(self, index):
# TODO: lookup query terms in the index and implement boolean search logic
pass
def create_and_fill_memory_index(docs_filepath):
print('Start creating memory index...')
start = perf_counter()
index = IndexMemory()
    with open(docs_filepath, encoding='utf-8') as f:
for i, line in enumerate(f, 1):
fields = line.rstrip('\n').split('\t')
docid, title, body = fields
doc = title + ' ' + body
index.add_document(doc, docid)
if i % 1000 == 0:
print(f"Processed {i} documents")
index.commit()
print(f'Memory index created, total time: {perf_counter() - start}')
return index
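
# Note (illustrative only): the loop above assumes one document per line in the form
# "<docid>\t<title>\t<body>"; a hypothetical line would look like
# "42\tSome title\tSome body text".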



INDEX_DIR = 'indexdir'


class SearchResults:
def add(self, found):
# TODO: add next query's results
pass
def __init__(self):
self._qid_to_docids: dict[int, set] = {}

def add(self, qid: int, docids: Iterable[str]):
self._qid_to_docids[qid] = set(docids)

def print_submission(self, objects_file, submission_file):
# TODO: generate submission file
pass
        # The objects file is assumed to be CSV: ObjectId,QueryId,DocumentId (header skipped).
        with open(objects_file) as obj_file, open(submission_file, 'w') as subm_file:
            obj_file_iter = iter(obj_file)
            next(obj_file_iter)  # skip the header line

            subm_file.write('ObjectId,Relevance\n')

            for line in obj_file_iter:
                obj_id, qid, docid = line.rstrip('\n').split(',')
                qid = int(qid)
                found = 1 if docid in self._qid_to_docids[qid] else 0
                subm_file.write(f'{obj_id},{found}\n')


def main():
    # Command line arguments.
parser = argparse.ArgumentParser(description='Homework: Boolean Search')
    parser.add_argument('--queries_file', required=True, help='queries.numerate.txt')
    parser.add_argument('--objects_file', required=True, help='objects.numerate.txt')
    parser.add_argument('--docs_file', required=True, help='docs.tsv')
    parser.add_argument('--submission_file', required=True, help='output file with relevances')
args = parser.parse_args()

# Build index.
index = Index(args.docs_file)
index = create_and_fill_memory_index(args.docs_file)

# Searcher
searcher = Searcher(index, top_k=30)

# Process queries.
search_results = SearchResults()
with codecs.open(args.queries_file, mode='r', encoding='utf-8') as queries_fh:
for line in queries_fh:
for i, line in enumerate(queries_fh, 1):
fields = line.rstrip('\n').split('\t')
qid = int(fields[0])
query = fields[1]

# Parse query.
query_tree = QueryTree(qid, query)

# Search and save results.
search_results.add(query_tree.search(index))
result_docids = searcher.search(query)
search_results.add(qid, result_docids)
if i % 100 == 0:
print(f'Processed {i} queries')

# Generate submission file.
search_results.print_submission(args.objects_file, args.submission_file)


if __name__ == "__main__":
main()

44 changes: 44 additions & 0 deletions index.py
@@ -0,0 +1,44 @@
from collections import Counter
from typing import Iterable


class Tokenizer:
def tokenize(self, doc: str) -> Iterable[str]:
return doc.split()


class IndexMemory:
def __init__(self) -> None:
self._inverted_index: dict[str, list[int]] = {}
self._docnum = 0
self._docnum_to_docid: dict[int, str] = {}
self._tokenizer = Tokenizer()
self.token_occurences = Counter()
self.total_token_occur = 0

def add_document(self, doc: str, docid: str):
tokens = self._tokenizer.tokenize(doc)
for token in tokens:
self.token_occurences[token] += 1
postings = self._inverted_index.setdefault(token, [])
if not postings or postings[-1] != self._docnum:
postings.append(self._docnum)
self._docnum_to_docid[self._docnum] = docid
self._docnum += 1

def commit(self):
self.total_token_occur = self.token_occurences.total()

def postings(self, token: str) -> list[int]:
return self._inverted_index.get(token, [])

def has_token(self, token: str) -> bool:
return token in self._inverted_index

def get_docid(self, docnum: int):
return self._docnum_to_docid[docnum]

    def language_model(self, token: str):
        '''Probability of `token` under a naive unigram model.'''
        assert self.total_token_occur != 0
        return self.token_occurences[token] / self.total_token_occur
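
A minimal usage sketch of IndexMemory (illustrative only; the document texts and ids below are made up, not taken from the dataset):

    from index import IndexMemory

    index = IndexMemory()
    index.add_document('red car rides', 'doc-1')
    index.add_document('blue car stands', 'doc-2')
    index.commit()

    print(index.postings('car'))        # [0, 1] -- internal doc numbers
    print(index.get_docid(0))           # 'doc-1'
    print(index.language_model('car'))  # 2 of 6 total token occurrences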
190 changes: 190 additions & 0 deletions searcher.py
@@ -0,0 +1,190 @@
import heapq

from index import IndexMemory
# from search_framework.index.in_memory.index import IndexMemory
from utils.for_sorted import (
merge_sorted_unique,
merge_sorted_unique_many,
intersect_sorted_unique,
)
from typing import Iterable
from heapq import nlargest


class Searcher:
# TODO: TEST
    def __init__(self, index: IndexMemory, top_k_for_existing=3, top_k=10):
self._index = index
self._service_symbols = {"(", ")", " ", "|"}
self._op_priority = {"(": 0, "|": 1, " ": 2}
self._rpn_operators = {' ', '|'}
self.top_k = top_k
        self.top_k_for_existing = top_k_for_existing

def search(self, query: str) -> list[str]:
"""
        Return the list of document ids matching `query`.
"""
parsed = self._parse_query(query)
rpn = self._to_rpn(parsed)
postings = self._execute_rpn(rpn)

        return [self._index.get_docid(docnum) for docnum in postings]

def _parse_query(self, query: str) -> list[str]:
"""
Convert string query to list of items (token or operator)
"""
query_list = []
word = []
for ch in query:
if ch in self._service_symbols:
if word:
query_list.append(''.join(word))
word = []
query_list.append(ch)
else:
word.append(ch)
if word:
query_list.append(''.join(word))
return query_list

def _to_rpn(self, query: list[str]) -> list[str]:
"""
Convert parsed query to Reverse Polish Notation
"""
bracket_cnt = 0
result: list[str] = []
stack = []
for w in query:
if w == "(":
stack.append(w)
bracket_cnt += 1
elif w == ")":
bracket_cnt -= 1
if bracket_cnt < 0:
raise ValueError("wrong brackets")
while stack and stack[-1] != "(":
result.append(stack.pop())
stack.pop()
elif w in self._op_priority:
while stack and self._op_priority[stack[-1]] >= self._op_priority[w]:
result.append(stack.pop())
stack.append(w)
else:
result.append(w)
if bracket_cnt != 0:
raise ValueError("wrong brackets")
while stack:
result.append(stack.pop())
return result
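
        # Example (illustrative): the parsed query ['a', ' ', '(', 'b', '|', 'c', ')']
        # (' ' meaning AND, '|' meaning OR) converts to the RPN
        # ['a', 'b', 'c', '|', ' '].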

def _execute_rpn(self, query: list[str]) -> list[int]:
"""
Return postings list
"""
stack: list[list[int]] = []
for w in query:
if w in self._rpn_operators:
if len(stack) < 2:
raise ValueError("bad query")
snd, fst = stack.pop(), stack.pop()
if w == ' ':
result = intersect_sorted_unique(fst, snd)
else:
result = merge_sorted_unique(fst, snd)
stack.append(result)
else:
edited = {w}
if len(w) >= 2:
edited |= edits1(w)
edited = self._limit_candidates(edited)

layouted = change_layout(w)
lay_edited = {layouted}
if len(layouted) >= 3:
edited |= self._limit_candidates(edits1(layouted))
edited |= self._limit_candidates(lay_edited)
edited = self._get_top_k(edited, top_k=self.top_k)

postings = merge_sorted_unique_many(
[self._index.postings(token) for token in edited]
)
stack.append(postings)
result = stack.pop()
if stack:
raise ValueError("bad query")
return result
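
        # Example (illustrative): the RPN ['a', 'b', 'c', '|', ' '] evaluates as
        # postings(a) AND (postings(b) OR postings(c)), after each token has been
        # expanded with spelling/keyboard-layout variants that exist in the index.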

def _limit_candidates(self, tokens: Iterable[str]) -> set[str]:
        '''Return the subset of `tokens` that are present in the index.'''
output = set()
for token in tokens:
if self._index.has_token(token):
output.add(token)
return output

def _get_top_k(self, words: Iterable[str], top_k: int) -> list[str]:
return nlargest(top_k, words, self._index.language_model)


def all_edits1(words: set[str]) -> set[str]:
output = set()
for w in words:
output |= edits1(w)
return output


def edits1(word: str) -> set[str]:
'''All edits that are one edit away from `word`.'''
if is_rus(word):
letters = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
else:
letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits]
transposes = [L[:-1] + R[0] + L[-1] + R[1:] for L, R in splits if len(L) > 0 and len(R) > 0]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
# prefixes = [L for L, _ in splits if len(L) >= 3]
# suffixes = [R for _, R in splits if len(R) >= 3]
return set(deletes + transposes + replaces + inserts)
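
# Example (illustrative): edits1('кот') contains deletions such as 'от' and 'кт',
# transpositions such as 'окт' and 'кто', plus single-letter replacements and
# insertions drawn from the uppercase alphabet above.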


def edits2(word: str) -> set[str]:
'''All edits that are 2 edits away from `word`.'''
output = set()
for candidate in edits1(word):
output.update(edits1(candidate))
return output


ENG = '''qwertyuiop[]asdfghjkl;'zxcvbnm,./`QWERTYUIOP{}ASDFGHJKL:"ZXCVBNM<>?~'''
RUS = '''йцукенгшщзхъфывапролджэячсмитьбю.ёЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬБЮ,Ё'''


def layout_eng_2_rus(text):
layout = dict(zip(map(ord, ENG), RUS))
return text.translate(layout)


def layout_rus_2_eng(text):
layout = dict(zip(map(ord, RUS), ENG))
return text.translate(layout)


def change_layout(word: str) -> str:
if is_rus(word):
return layout_rus_2_eng(word)
return layout_eng_2_rus(word)
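
# Example (illustrative): change_layout('ghbdtn') -> 'привет', i.e. a word typed
# with the wrong keyboard layout is mapped back to its Russian counterpart.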


def is_rus(text, alphabet=set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')):
return not alphabet.isdisjoint(text.lower())
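
A minimal end-to-end sketch of how IndexMemory and Searcher fit together (illustrative only; the documents, ids, and query below are made up, and the expected output assumes the helpers in utils.for_sorted behave as their names suggest):

    from index import IndexMemory
    from searcher import Searcher

    index = IndexMemory()
    index.add_document('красная машина едет', 'doc-1')
    index.add_document('синяя машина стоит', 'doc-2')
    index.commit()

    searcher = Searcher(index, top_k=5)
    print(searcher.search('красная машина'))  # expected: ['doc-1']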
2 changes: 1 addition & 1 deletion start.sh
100644 → 100755
@@ -1,4 +1,4 @@
docker run -it --rm --name boolean-search -v "$PWD":/work_dir -w /work_dir python:3.12-alpine python hw_boolean_search.py \
docker run -it --rm --name boolean-search -v "$PWD":/work_dir -w /work_dir python:3.12-alpine python -O hw_boolean_search.py \
--queries_file /work_dir/data/queries.numerate.txt \
    --objects_file /work_dir/data/objects.numerate.txt \
--docs_file /work_dir/data/docs.txt \