-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexample.py
More file actions
78 lines (70 loc) · 2.68 KB
/
example.py
File metadata and controls
78 lines (70 loc) · 2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: utf-8 -*-
import datetime
import sys
import os
import optparse
from pyretrieval import processor, document, indexer
def decode_text(raw, encoding):
    """Return ``raw`` as text, decoding it first when it is a byte string.

    Args:
        raw: Byte string or already-decoded text.
        encoding: Codec name used for byte strings. ``None`` (or any other
            false value) falls back to UTF-8; console streams report
            ``None`` as their encoding when they are piped or redirected.

    Returns:
        The decoded text, or ``raw`` unchanged if it is not a byte string.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in the chosen codec.
    """
    if isinstance(raw, bytes):
        return raw.decode(encoding or "utf-8")
    return raw


def _print_elapsed(since):
    """Print the wall-clock time that has passed since ``since``."""
    print("Finished after: " + str(datetime.datetime.now() - since))


def main():
    """Index a document file and answer queries typed at the console.

    Command-line options:
        -s/--stopwords  optional path handed to ``Processor.load_stopwords``
        -l/--lemmas     optional path handed to ``Processor.load_lemmas``;
                        when given, lemmatization is switched on for both
                        documents and queries
        -d/--documents  required path of a UTF-8 file, one document per line

    Every line of the document file is processed and indexed, then queries
    are read in a loop until the user enters ``quit`` or closes stdin. For
    each query the top 5 search results are printed.
    """
    parser = optparse.OptionParser()
    parser.add_option("-s", "--stopwords", dest="stopwords")
    parser.add_option("-l", "--lemmas", dest="lemmas")
    parser.add_option("-d", "--documents", dest="documents")
    options, _ = parser.parse_args()
    if not options.documents:
        # Fail fast with a usage message instead of crashing in open(None)
        # after the (potentially slow) stopword/lemma loading.
        parser.error("no document file given (use -d/--documents)")

    # Only referenced by the disabled total-duration report further down.
    start = datetime.datetime.now()

    print("Initializing Document Processor")
    proc = processor.Processor()

    if options.stopwords:
        # LOAD STOPWORDS
        print("Loading Stopwords...")
        phase_start = datetime.datetime.now()
        proc.load_stopwords(options.stopwords, True)
        _print_elapsed(phase_start)

    lemmatize = False
    if options.lemmas:
        # LOAD LEMMAS
        print("Loading Lemmas...")
        phase_start = datetime.datetime.now()
        proc.load_lemmas(options.lemmas, True)
        lemmatize = True
        _print_elapsed(phase_start)

    # PROCESS DOCUMENTS
    print("Processing Documents")
    docs = []
    phase_start = datetime.datetime.now()
    with open(options.documents) as doc_file:
        for line_no, line in enumerate(doc_file, 1):
            # '\r' keeps the running document count on a single console line.
            sys.stdout.write(str(line_no) + '\r')
            docs.append(proc.process(decode_text(line, "utf8"), lemmatize))
    _print_elapsed(phase_start)

    # INDEX DOCUMENTS
    print("Indexing Documents...")
    idxr = indexer.Indexer()
    phase_start = datetime.datetime.now()
    for doc in docs:
        idxr.index(doc)
    _print_elapsed(phase_start)

    # CALCULATE INVERSE DOCUMENT FREQUENCY (disabled, kept for reference)
    # phase_start = datetime.datetime.now()
    # print("Calculating inverse document frequency")
    # idxr.calc_idfs()
    # _print_elapsed(phase_start)
    # print("===================================")
    # print("Total Duration: " + str(datetime.datetime.now() - start))

    # IR SYSTEM READY
    print("IR-SYSTEM READY ENTER QUERY AND PRESS ENTER:")
    # Typed bytes arrive in the *input* stream's encoding. Either stream may
    # report None when redirected; decode_text() then falls back to UTF-8.
    console_encoding = (getattr(sys.stdin, "encoding", None)
                        or getattr(sys.stdout, "encoding", None))
    separator = "==============================================================="
    while True:
        print(separator)
        print(separator)
        try:
            raw_query = raw_input("Query: ")
        except EOFError:
            # stdin was closed (Ctrl-D / end of piped input): same as "quit".
            print("")
            break
        query_text = decode_text(raw_query, console_encoding)
        if query_text == "quit":
            # Leave before the sentinel itself is run as a search query.
            break
        query = proc.process(query_text, lemmatize)
        hits = idxr.search(query, 5)
        print("document vector of query: {0}".format(query.to_json()))
        print("")
        for hit in hits:
            # hit[0] exposes to_json() and metadata; hit[1] is printed first
            # (presumably the match score -- confirm against Indexer.search).
            print("%s %s" % (hit[1], hit[0].to_json()))
            print(hit[0].metadata["original"])
            print("")


if __name__ == "__main__":
    main()