4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.idea
*.csv
data
__pycache__/
80 changes: 53 additions & 27 deletions hw_boolean_search.py
@@ -3,65 +3,91 @@

import argparse
import codecs
import sys
from typing import Iterable
from time import perf_counter

from index import IndexMemory
from searcher import Searcher

class Index:
def __init__(self, index_file):
# TODO: build index
pass


class QueryTree:
def __init__(self, qid, query):
# TODO: parse query and create query tree
pass

def search(self, index):
# TODO: lookup query terms in the index and implement boolean search logic
pass
def create_and_fill_memory_index(docs_filepath):
print('Start creating memory index...')
start = perf_counter()
index = IndexMemory()
    with open(docs_filepath, encoding='utf-8') as f:
for i, line in enumerate(f, 1):
fields = line.rstrip('\n').split('\t')
docid, title, body = fields
doc = title + ' ' + body
index.add_document(doc, docid)
if i % 1000 == 0:
print(f"Processed {i} documents")
index.commit()
print(f'Memory index created, total time: {perf_counter() - start}')
return index
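
# Note (illustrative only): the loop above assumes one document per line in the form
# "<docid>\t<title>\t<body>"; a hypothetical line would look like
# "42\tSome title\tSome body text".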



INDEX_DIR = 'indexdir'


class SearchResults:
def add(self, found):
# TODO: add next query's results
pass
def __init__(self):
self._qid_to_docids: dict[int, set] = {}

def add(self, qid: int, docids: Iterable[str]):
self._qid_to_docids[qid] = set(docids)

def print_submission(self, objects_file, submission_file):
# TODO: generate submission file
pass
        # The objects file is assumed to be CSV: ObjectId,QueryId,DocumentId (header skipped).
        with open(objects_file) as obj_file, open(submission_file, 'w') as subm_file:
            obj_file_iter = iter(obj_file)
            next(obj_file_iter)  # skip the header line

            subm_file.write('ObjectId,Relevance\n')

            for line in obj_file_iter:
                obj_id, qid, docid = line.rstrip('\n').split(',')
                qid = int(qid)
                found = 1 if docid in self._qid_to_docids[qid] else 0
                subm_file.write(f'{obj_id},{found}\n')


def main():
    # Command line arguments.
parser = argparse.ArgumentParser(description='Homework: Boolean Search')
    parser.add_argument('--queries_file', required=True, help='queries.numerate.txt')
    parser.add_argument('--objects_file', required=True, help='objects.numerate.txt')
    parser.add_argument('--docs_file', required=True, help='docs.tsv')
    parser.add_argument('--submission_file', required=True, help='output file with relevances')
args = parser.parse_args()

# Build index.
index = Index(args.docs_file)
index = create_and_fill_memory_index(args.docs_file)

# Searcher
searcher = Searcher(index, top_k=30)

# Process queries.
search_results = SearchResults()
with codecs.open(args.queries_file, mode='r', encoding='utf-8') as queries_fh:
for line in queries_fh:
for i, line in enumerate(queries_fh, 1):
fields = line.rstrip('\n').split('\t')
qid = int(fields[0])
query = fields[1]

# Parse query.
query_tree = QueryTree(qid, query)

# Search and save results.
search_results.add(query_tree.search(index))
result_docids = searcher.search(query)
search_results.add(qid, result_docids)
if i % 100 == 0:
print(f'Processed {i} queries')

# Generate submission file.
search_results.print_submission(args.objects_file, args.submission_file)


if __name__ == "__main__":
main()

44 changes: 44 additions & 0 deletions index.py
@@ -0,0 +1,44 @@
from collections import Counter
from typing import Iterable


class Tokenizer:
def tokenize(self, doc: str) -> Iterable[str]:
return doc.split()


class IndexMemory:
def __init__(self) -> None:
self._inverted_index: dict[str, list[int]] = {}
self._docnum = 0
self._docnum_to_docid: dict[int, str] = {}
self._tokenizer = Tokenizer()
self.token_occurences = Counter()
self.total_token_occur = 0

def add_document(self, doc: str, docid: str):
tokens = self._tokenizer.tokenize(doc)
for token in tokens:
self.token_occurences[token] += 1
postings = self._inverted_index.setdefault(token, [])
if not postings or postings[-1] != self._docnum:
postings.append(self._docnum)
self._docnum_to_docid[self._docnum] = docid
self._docnum += 1

def commit(self):
self.total_token_occur = self.token_occurences.total()

def postings(self, token: str) -> list[int]:
return self._inverted_index.get(token, [])

def has_token(self, token: str) -> bool:
return token in self._inverted_index

def get_docid(self, docnum: int):
return self._docnum_to_docid[docnum]

    def language_model(self, token: str):
        '''Probability of `token` under a naive unigram model.'''
        assert self.total_token_occur != 0
        return self.token_occurences[token] / self.total_token_occur
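
A minimal usage sketch of IndexMemory (illustrative only; the document texts and ids below are made up, not taken from the dataset):

    from index import IndexMemory

    index = IndexMemory()
    index.add_document('red car rides', 'doc-1')
    index.add_document('blue car stands', 'doc-2')
    index.commit()

    print(index.postings('car'))        # [0, 1] -- internal doc numbers
    print(index.get_docid(0))           # 'doc-1'
    print(index.language_model('car'))  # 2 of 6 total token occurrences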
190 changes: 190 additions & 0 deletions searcher.py
@@ -0,0 +1,190 @@
import heapq

from index import IndexMemory
# from search_framework.index.in_memory.index import IndexMemory
from utils.for_sorted import (
merge_sorted_unique,
merge_sorted_unique_many,
intersect_sorted_unique,
)
from typing import Iterable
from heapq import nlargest


class Searcher:
# TODO: TEST
    def __init__(self, index: IndexMemory, top_k_for_existing=3, top_k=10):
self._index = index
self._service_symbols = {"(", ")", " ", "|"}
self._op_priority = {"(": 0, "|": 1, " ": 2}
self._rpn_operators = {' ', '|'}
self.top_k = top_k
        self.top_k_for_existing = top_k_for_existing

def search(self, query: str) -> list[str]:
"""
        Return the list of document ids matching `query`.
"""
parsed = self._parse_query(query)
rpn = self._to_rpn(parsed)
postings = self._execute_rpn(rpn)

        return [self._index.get_docid(docnum) for docnum in postings]

def _parse_query(self, query: str) -> list[str]:
"""
Convert string query to list of items (token or operator)
"""
query_list = []
word = []
for ch in query:
if ch in self._service_symbols:
if word:
query_list.append(''.join(word))
word = []
query_list.append(ch)
else:
word.append(ch)
if word:
query_list.append(''.join(word))
return query_list

def _to_rpn(self, query: list[str]) -> list[str]:
"""
Convert parsed query to Reverse Polish Notation
"""
bracket_cnt = 0
result: list[str] = []
stack = []
for w in query:
if w == "(":
stack.append(w)
bracket_cnt += 1
elif w == ")":
bracket_cnt -= 1
if bracket_cnt < 0:
raise ValueError("wrong brackets")
while stack and stack[-1] != "(":
result.append(stack.pop())
stack.pop()
elif w in self._op_priority:
while stack and self._op_priority[stack[-1]] >= self._op_priority[w]:
result.append(stack.pop())
stack.append(w)
else:
result.append(w)
if bracket_cnt != 0:
raise ValueError("wrong brackets")
while stack:
result.append(stack.pop())
return result
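
        # Example (illustrative): the parsed query ['a', ' ', '(', 'b', '|', 'c', ')']
        # (' ' meaning AND, '|' meaning OR) converts to the RPN
        # ['a', 'b', 'c', '|', ' '].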

def _execute_rpn(self, query: list[str]) -> list[int]:
"""
Return postings list
"""
stack: list[list[int]] = []
for w in query:
if w in self._rpn_operators:
if len(stack) < 2:
raise ValueError("bad query")
snd, fst = stack.pop(), stack.pop()
if w == ' ':
result = intersect_sorted_unique(fst, snd)
else:
result = merge_sorted_unique(fst, snd)
stack.append(result)
else:
edited = {w}
if len(w) >= 2:
edited |= edits1(w)
edited = self._limit_candidates(edited)

layouted = change_layout(w)
lay_edited = {layouted}
if len(layouted) >= 3:
edited |= self._limit_candidates(edits1(layouted))
edited |= self._limit_candidates(lay_edited)
edited = self._get_top_k(edited, top_k=self.top_k)

postings = merge_sorted_unique_many(
[self._index.postings(token) for token in edited]
)
stack.append(postings)
result = stack.pop()
if stack:
raise ValueError("bad query")
return result
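
        # Example (illustrative): the RPN ['a', 'b', 'c', '|', ' '] evaluates as
        # postings(a) AND (postings(b) OR postings(c)), after each token has been
        # expanded with spelling/keyboard-layout variants that exist in the index.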

def _limit_candidates(self, tokens: Iterable[str]) -> set[str]:
        '''Return the subset of `tokens` that are present in the index.'''
output = set()
for token in tokens:
if self._index.has_token(token):
output.add(token)
return output

def _get_top_k(self, words: Iterable[str], top_k: int) -> list[str]:
return nlargest(top_k, words, self._index.language_model)


def all_edits1(words: set[str]) -> set[str]:
output = set()
for w in words:
output |= edits1(w)
return output


def edits1(word: str) -> set[str]:
'''All edits that are one edit away from `word`.'''
if is_rus(word):
letters = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
else:
letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits]
transposes = [L[:-1] + R[0] + L[-1] + R[1:] for L, R in splits if len(L) > 0 and len(R) > 0]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
# prefixes = [L for L, _ in splits if len(L) >= 3]
# suffixes = [R for _, R in splits if len(R) >= 3]
return set(deletes + transposes + replaces + inserts)
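
# Example (illustrative): edits1('кот') contains deletions such as 'от' and 'кт',
# transpositions such as 'окт' and 'кто', plus single-letter replacements and
# insertions drawn from the uppercase alphabet above.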


def edits2(word: str) -> set[str]:
'''All edits that are 2 edits away from `word`.'''
output = set()
for candidate in edits1(word):
output.update(edits1(candidate))
return output


ENG = '''qwertyuiop[]asdfghjkl;'zxcvbnm,./`QWERTYUIOP{}ASDFGHJKL:"ZXCVBNM<>?~'''
RUS = '''йцукенгшщзхъфывапролджэячсмитьбю.ёЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬБЮ,Ё'''


def layout_eng_2_rus(text):
layout = dict(zip(map(ord, ENG), RUS))
return text.translate(layout)


def layout_rus_2_eng(text):
layout = dict(zip(map(ord, RUS), ENG))
return text.translate(layout)


def change_layout(word: str) -> str:
if is_rus(word):
return layout_rus_2_eng(word)
return layout_eng_2_rus(word)
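
# Example (illustrative): change_layout('ghbdtn') -> 'привет', i.e. a word typed
# with the wrong keyboard layout is mapped back to its Russian counterpart.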


def is_rus(text, alphabet=set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')):
return not alphabet.isdisjoint(text.lower())
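
A minimal end-to-end sketch of how IndexMemory and Searcher fit together (illustrative only; the documents, ids, and query below are made up, and the expected output assumes the helpers in utils.for_sorted behave as their names suggest):

    from index import IndexMemory
    from searcher import Searcher

    index = IndexMemory()
    index.add_document('красная машина едет', 'doc-1')
    index.add_document('синяя машина стоит', 'doc-2')
    index.commit()

    searcher = Searcher(index, top_k=5)
    print(searcher.search('красная машина'))  # expected: ['doc-1']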
2 changes: 1 addition & 1 deletion start.sh
100644 → 100755
@@ -1,4 +1,4 @@
docker run -it --rm --name boolean-search -v "$PWD":/work_dir -w /work_dir python:3.12-alpine python hw_boolean_search.py \
docker run -it --rm --name boolean-search -v "$PWD":/work_dir -w /work_dir python:3.12-alpine python -O hw_boolean_search.py \
--queries_file /work_dir/data/queries.numerate.txt \
    --objects_file /work_dir/data/objects.numerate.txt \
--docs_file /work_dir/data/docs.txt \