-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathMLParser.py
More file actions
executable file
·94 lines (72 loc) · 2.76 KB
/
MLParser.py
File metadata and controls
executable file
·94 lines (72 loc) · 2.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#! /usr/bin/python
import sys
from statistics import StatisticsError
from statistics import mean, mode, median, stdev, median_grouped

import requests
from lxml import etree
from lxml import html
def removeOutliers(priceList, mean, stdev, threshold=1):
    """Drop prices whose z-score magnitude exceeds ``threshold``.

    The list is filtered in place (callers holding a reference to
    ``priceList`` see the change) and the same list object is returned.

    Note: the ``mean`` and ``stdev`` parameters shadow the functions of the
    same name imported from ``statistics`` inside this function only.

    Args:
        priceList: Mutable list of numeric prices.
        mean: Mean used as the centre for the z-score.
        stdev: Standard deviation used as the scale for the z-score.
        threshold: Maximum allowed ``abs((price - mean) / stdev)``; values
            strictly above it are removed. Defaults to 1.

    Returns:
        ``priceList`` itself, with the outliers removed.
    """
    # With zero spread the z-score is undefined (division by zero); there is
    # nothing meaningful to classify as an outlier, so keep every price.
    if stdev == 0:
        return priceList

    # Build the filtered result first and assign it back with a slice.
    # Calling list.remove() while iterating over the same list makes the
    # iterator skip the element following each removal.
    priceList[:] = [
        price for price in priceList
        if abs((price - mean) / stdev) <= threshold
    ]
    return priceList
def ParseMLPrice(url):
    """Collect the integer prices found on a paginated listing URL.

    ``_DisplayType_LF`` is appended to ``url`` when it is not already
    present. Pages are then requested one after another: the first request
    uses the URL as is, and each following request appends ``_Desde_<n>``
    to that same base URL, with ``n`` taking the values 51, 101, 151, ...
    Paging stops at the first response whose status code is not 200, or
    after a timeout, redirect loop or HTML parse failure.

    Args:
        url: Listing URL to scrape.

    Returns:
        A list of ``int`` prices, built from the digits of every text node
        matched by the price XPath on each successfully fetched page.

    Raises:
        SystemExit: With status 1 on a connection error or any other
            ``requests`` failure that is not a timeout or redirect loop.
    """
    adPrices = []
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
    }
    # Absolute path into the listing markup; it is tied to the page layout
    # and has to be updated whenever that layout changes.
    priceXPath = "/html/body/main/div/div[2]/section/ol[1]/li[2]/div/div/a/div/div[1]/div/div/div/span[1]/span[2]/span[2]/text()"
    # Without an explicit timeout requests can wait forever, which also
    # means the Timeout handler below could never run.
    timeoutSeconds = 30

    if "_DisplayType_LF" not in url:
        url = url + "_DisplayType_LF"
    # Keep the un-paginated URL so every page offset is appended to it
    # rather than to the previous page's already-suffixed URL.
    baseUrl = url
    counter = 1

    while True:
        try:
            page = requests.get(url, headers=header, timeout=timeoutSeconds)
            # Check the status before parsing so an error page is never
            # scanned for prices.
            if page.status_code != 200:
                break
            tree = html.fromstring(page.content)
        except requests.exceptions.Timeout:
            print('ERROR: URL Timeout! URL = ' + url)
            break
        except requests.exceptions.TooManyRedirects:
            print('ERROR: Too Many Redirects! URL = ' + url)
            break
        except requests.exceptions.ConnectionError:
            print('ERROR: Connection Error! URL = ' + url)
            sys.exit(1)
        except requests.exceptions.RequestException:
            # Any other requests failure is treated as fatal.
            print ('ERROR: Request Exception! URL = ' + url)
            sys.exit(1)
        except (etree.XMLSyntaxError, etree.ParserError):
            # ParserError is what lxml.html raises for an empty document.
            print ('ERROR: XML Syntax Error! URL = ' + url)
            break

        for price in tree.xpath(priceXPath):
            # Keep only the digits, dropping currency symbols, separators
            # and whitespace; a node without digits yields no price.
            value = ''.join(ch for ch in price if ch.isdigit())
            if value:
                adPrices.append(int(value))

        counter += 50
        url = baseUrl + "_Desde_" + str(counter)

    return adPrices
def main():
    """Print summary statistics for the prices scraped from ``sys.argv[1]``.

    The prices returned by ``ParseMLPrice`` are filtered with
    ``removeOutliers`` using the mean and standard deviation of the full
    sample, and the average, mode, median, standard deviation, grouped
    median, minimum and maximum of the remaining prices are printed.

    Returns:
        0 on success, 1 when the URL argument is missing or fewer than two
        prices were found (the standard deviation needs at least two).
    """
    if len(sys.argv) < 2:
        print('ERROR: Missing URL argument! Usage: ' + sys.argv[0] + ' <url>')
        return 1

    adPrices = ParseMLPrice(sys.argv[1])
    if len(adPrices) < 2:
        print('ERROR: Not enough prices found to compute statistics!')
        return 1

    meanPrice = mean(adPrices)
    stdevPrice = stdev(adPrices)
    # A zero standard deviation means all prices are identical: there are no
    # outliers, and the z-score would divide by zero.
    if stdevPrice > 0:
        adPrices = removeOutliers(adPrices, meanPrice, stdevPrice)

    print('Average: ' + str(round(mean(adPrices),2)))
    try:
        print('Mode: ' + str(round(mode(adPrices))))
    except StatisticsError:
        # Older Python versions raise when there is no single most common value.
        print('ERROR: Unable to print Mode!')
    print('Median: ' + str(round(median(adPrices))))
    try:
        print('Standard Deviation: ' + str(round(stdev(adPrices))))
    except StatisticsError:
        # Outlier removal can leave a single price, for which the sample
        # standard deviation is undefined.
        print('ERROR: Unable to print Standard Deviation!')
    print('Median Grouped: ' + str(round(median_grouped(adPrices))))
    print('Minimum: ' + str(round(min(adPrices))))
    print('Maximum: ' + str(round(max(adPrices))))
    return 0
# Script entry point: run main() only when the file is executed directly and
# hand its return value to the interpreter as the process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)