#!/usr/bin/env python3
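"""Bulk-load Wikipedia abstracts into Solr and measure indexing throughput.

Reads tab-separated rows (topic, date, abstract) from stdin, empties the
'wikipedia' collection, indexes the rows in large uncommitted batches with a
single commit at the end, then queries Solr for the final document count and
prints docs/second. Intended to be fed the dump on stdin, e.g. (hypothetical
file name): ./index.py < abstracts.tsv
"""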
import sys
import time

import requests
import solr  # Solr client exposing Solr(...).add_many (solrpy-style API)

# The dump is ISO-8859-1; reconfigure stdin directly. (The original set
# PYTHONIOENCODING at this point instead, but that variable is only read at
# interpreter startup, so assigning it at runtime has no effect.)
sys.stdin.reconfigure(encoding='ISO-8859-1')
class Logger(object):
    """Tee every print to both the terminal and a log file."""

    def __init__(self):
        self.terminal = sys.stdout
        self.log = open("/tmp/wiki_batch_logfile.log", "w")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
        self.log.flush()

    def flush(self):
        pass


# From here on, everything printed also lands in /tmp/wiki_batch_logfile.log.
sys.stdout = Logger()
# XXX main function to add documents in a batch to solr.
# Returns the error count: ints are immutable in Python, so the original
# in-place `synerr += 1` never propagated back to the caller.
def send_to_solr(s, articles, synerr):
    try:
        s.add_many(articles, commit=False)
    except Exception:
        print("Caught error in Unicode")
        synerr += 1
    return synerr
solrUrl = "35.232.98.133:30983"
batchSize = 100000
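# Design note: large batches amortize HTTP round-trips while keeping
# client-side memory bounded, and the commit is deferred to the final
# add (commit=True below) so Solr is not forced into intermediate commits.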
# XXX Connect to solr first; crash early if it is not around...
s = solr.Solr('http://' + solrUrl + '/solr/wikipedia')
# XXX first ensure the collection is empty
headers = {'Content-Type': 'application/xml'}
resp = requests.post(url='http://' + solrUrl + '/solr/wikipedia/update',
                     data='<delete><query>*:*</query></delete>', headers=headers)
if resp.status_code != 200:
    print("Could not flush collection")
    sys.exit(1)  # non-zero exit: the load did not run
print("sent delete all...")
resp = requests.post(url='http://' + solrUrl + '/solr/wikipedia/update',
                     data='<commit/>', headers=headers)
if resp.status_code != 200:
    print("Could not flush collection")
    print(resp)
    sys.exit(1)
print("Deleted all...")
# Counters to batch input to solr
cnt = 0
synerr = 0
articles = []
start = time.time()
numiter = 0
docid = 1
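# Stream the dump line by line; each row is expected to be
# topic<TAB>date<TAB>abstract.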
for line in sys.stdin:
    cnt += 1
    docid += 1
    # Do TSV parsing... (rstrip the newline so it does not end up in the abstract)
    fields = line.rstrip('\n').split('\t')
    topic = fields[0]
    date = fields[1]
    abst = fields[2]
    articles.append({'id': docid, 'topic': topic, 'date': date, 'abstract': abst})
    if cnt >= batchSize:
        synerr = send_to_solr(s, articles, synerr)
        cnt = 0
        print("Adding... " + str(numiter))
        numiter += 1
        articles = []
# Catch the last (partial) batch
print("count of last push is " + str(cnt))
print("Total documents pushed = " + str(numiter * batchSize + cnt))
s.add_many(articles, commit=True)
end = time.time()
print("Total error lines count is " + str(synerr))
# XXX Now for the benchmark
resp = requests.get('http://' + solrUrl + '/solr/wikipedia/select?q=*:*')
if resp.status_code != 200:
    print("Could not query the final document count")
    sys.exit(1)
output = resp.json()
totdocs = output['response']['numFound']
tottime = end - start  # In seconds
docspersec = int(totdocs / tottime)
print("Total documents is " + str(totdocs) + " and time taken is " + str(tottime))
print("The total throughput we got is " + str(docspersec) + " docs per second")