# Boolean search over a stemmed inverted index (hw_boolean_search.py).
#
# This is the end state of a five-commit patch series against
# hw_boolean_search.py, reflowed into plain Python:
#   1/5 "Sergeev Maksim"   - Index, QueryTree and SearchResults implemented
#   2/5 "Sergeev Maksim"   - Russian Snowball stemming for index and queries
#   3/5 "Cosmetic changes"
#   4/5 "requirements"     - requirements.txt created
#   5/5 "My best score"    - debug dump to res.json removed
# After 5/5, requirements.txt contains the single line "nltk==3.8.1".
#
# NOTE(review): the script header and main() are only visible as diff context
# in the series and are therefore not reproduced here; argparse, codecs, json
# and sys stay imported because code outside this view may rely on them.

import argparse
import codecs
import functools
import json
import logging
import re
import string
import sys

logger = logging.getLogger(__name__)

# Translation table that turns every ASCII punctuation character into a
# space. Built once instead of once per indexed line.
_PUNCT_TO_SPACE = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)

# Upper bound for the per-build stem cache used while indexing.
_STEM_CACHE_SIZE = 1 << 16

# Query token kinds. A term is any token that is none of these.
_AND = " "
_OR = "|"
_OPEN = "("
_CLOSE = ")"
_END = None  # end-of-query marker; None can never collide with a term

# One bracket/pipe, a run of whitespace, or a run of anything else (a term).
_TOKEN_RE = re.compile(r"[()|]|\s+|[^()|\s]+")


@functools.cache
def _russian_stemmer():
    """Return the shared Russian Snowball stemmer.

    nltk is imported lazily so that the classes below can be used with an
    injected stemmer when nltk is not installed.
    """
    from nltk.stem import SnowballStemmer

    return SnowballStemmer("russian")


def _tokenize(query):
    """Split *query* into lower-cased terms and operator tokens.

    Whitespace becomes an AND token only when it separates two operands
    (a term or ")" on the left, a term or "(" on the right). Whitespace
    next to "|", just inside brackets, or at either end of the query is
    dropped, so "a | b" and "( a )" parse the same as "a|b" and "(a)".
    """
    tokens = []
    gap = False  # whitespace seen since the last emitted token
    for tok in _TOKEN_RE.findall(query.lower()):
        if tok.isspace():
            gap = True
            continue
        if (
            gap
            and tokens
            and tokens[-1] not in (_OPEN, _OR)
            and tok not in (_CLOSE, _OR)
        ):
            tokens.append(_AND)
        tokens.append(tok)
        gap = False
    return tokens


class Index:
    """Inverted index mapping a stemmed, lower-cased word to document ids."""

    def __init__(self, index_file, stemmer=None):
        """Build the index from *index_file*.

        Each non-blank line is split on whitespace after ASCII punctuation
        has been replaced by spaces; the first field is the document id and
        the remaining fields are its words.

        Args:
            index_file: path of the UTF-8 documents file.
            stemmer: object with a ``stem(word) -> str`` method; defaults to
                the Russian Snowball stemmer from nltk.
        """
        if stemmer is None:
            stemmer = _russian_stemmer()
        # The same words repeat constantly across documents; caching their
        # stems avoids re-running the stemmer for each occurrence.
        stem = functools.lru_cache(maxsize=_STEM_CACHE_SIZE)(stemmer.stem)

        self.index: dict[str, set[str]] = {}

        # Built-in open() splits lines on "\n" / "\r" only. Line iteration
        # over codecs.open() also breaks on characters such as "\x85" or
        # "\u2028", which would cut a document in two.
        with open(index_file, encoding="utf-8") as lines:
            for line_no, line in enumerate(lines):
                if line_no % 200 == 0:
                    logger.debug("index %d", line_no)
                # NOTE(review): punctuation is stripped from the whole line,
                # document id included - assumes ids contain none; confirm.
                fields = line.translate(_PUNCT_TO_SPACE).split()
                if not fields:
                    continue
                doc_id, *words = fields
                for word in words:
                    key = stem(word.lower())
                    self.index.setdefault(key, set()).add(doc_id)


class QueryTree:
    """Recursive-descent evaluator for boolean queries.

    Grammar (lowest to highest precedence)::

        or    := and ("|" and)*
        and   := token (" " token)*
        token := "(" or ")" | term
    """

    def __init__(self, qid, query, stemmer=None):
        """Tokenize *query*.

        Args:
            qid: query id; accepted for interface compatibility, not used.
            query: query text; it is lower-cased before tokenizing.
            stemmer: object with a ``stem(word) -> str`` method; defaults to
                the shared Russian Snowball stemmer from nltk.
        """
        # NOTE(review): query terms are not stripped of punctuation the way
        # indexed text is, so a term such as "foo-bar" cannot match - confirm
        # whether queries may contain punctuation.
        self._stemmer = _russian_stemmer() if stemmer is None else stemmer
        self._request: list[str] = _tokenize(query)
        self._i = 0
        self._c = _END

    def _get(self):
        """Advance to the next token, or to the end marker when exhausted."""
        if self._i < len(self._request):
            self._c = self._request[self._i]
            self._i += 1
        else:
            self._c = _END

    def search(self, index):
        """Evaluate the query against *index* and return matching doc ids.

        Tokens left over after a complete expression (for example a stray
        ")") are ignored. For a single-term query the returned set is the
        index's own posting set, so callers must not mutate it.

        Raises:
            ValueError: if an opening bracket is never closed.
        """
        self._i = 0
        self._get()
        return self._or(index)

    def _or(self, index):
        """Evaluate ``and ("|" and)*`` as a union."""
        result = self._and(index)
        while self._c == _OR:
            self._get()
            result = result | self._and(index)
        return result

    def _and(self, index):
        """Evaluate ``token (" " token)*`` as an intersection."""
        result = self._token(index)
        while self._c == _AND:
            self._get()
            result = result & self._token(index)
        return result

    def _token(self, index):
        """Evaluate a bracketed sub-expression or a single term."""
        tok = self._c
        if tok == _OPEN:
            self._get()
            result = self._or(index)
            if self._c != _CLOSE:
                raise ValueError('Unmatched bracket')
            self._get()
            return result
        if tok is _END or tok in (_CLOSE, _OR, _AND):
            # Missing operand: contribute nothing and leave the operator for
            # the caller instead of swallowing it as if it were a term.
            return set()
        self._get()
        return index.index.get(self._stemmer.stem(tok), set())


class SearchResults:
    """Collects per-query result sets and writes the submission file."""

    def __init__(self):
        self._results = []

    def add(self, found):
        """Append the result set of the next query, in query order."""
        self._results.append(found)

    def print_submission(self, objects_file, submission_file):
        """Write an ``ObjectId,Relevance`` CSV for the rows of *objects_file*.

        The first line of *objects_file* is skipped. For every other
        non-blank row, the first three comma-separated fields are used as
        object id, 1-based query number and document id; relevance is 1 when
        the document id is in that query's result set, else 0.
        """
        with (
            open(objects_file, encoding="utf-8") as src,
            # newline="\n" keeps the output bytes identical on every platform.
            open(submission_file, "w", encoding="utf-8", newline="\n") as dst,
        ):
            dst.write("ObjectId,Relevance\n")
            next(src, None)  # skip the header row
            for row in src:
                row = row.strip()
                if not row:
                    continue
                pair = row.split(",")
                found = self._results[int(pair[1]) - 1]
                dst.write(f"{pair[0]},{int(pair[2] in found)}\n")