-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathw3c_parser.py
More file actions
70 lines (56 loc) · 2.72 KB
/
w3c_parser.py
File metadata and controls
70 lines (56 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import re
from collections import namedtuple
from datetime import datetime
# Lightweight immutable container for the values extracted from one log line.
# The field order below is the positional order of the resulting tuple.
Record = namedtuple(
    'Record',
    'host ident user date method section protocol status bytes page',
)

# Three-letter English month abbreviation -> calendar month number (1-12).
month_num = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
    'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
    'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}
class W3cParser(object):
    """
    Parses records from a w3c-formatted http logfile

    Each line is matched against LOG_RECORD_REGEX and converted into a
    ``Record`` namedtuple by :meth:`parse_line`.
    """

    # Captures all desired groups. Quicker than a generic tokenizer like '[\["](.*?)[\]"]|(\S+)'
    # Raw string so that the regex escapes are not treated as (invalid) string
    # escapes. Note that the host group only accepts digits, dots and
    # parentheses, so a line whose host is not of that form will not match.
    LOG_RECORD_REGEX = re.compile(
        r'([(\d\.)]+) (.*?) (.*?) \[(.*?)\] "(.*?)" (\d{3}) (\d+|-)')

    # W3C format has these fields, in this order.
    # The order must stay in step with the capture groups of LOG_RECORD_REGEX.
    log_fields = ('host', 'ident', 'user', 'date', 'request', 'status', 'bytes')
    # Field name -> position of that field in ``log_fields``.
    log_field_map = {field: i for i, field in enumerate(log_fields)}

    def parse_line(self, raw_record):
        """
        Parses useful data from a line of a w3c http log file

        The ``date`` field is converted to a ``datetime``, ``bytes`` to an
        ``int`` and the ``request`` field is replaced by ``method``, ``page``,
        ``section`` and ``protocol``. All other fields (including ``status``)
        are kept as the matched strings.

        :param raw_record str: line of the log to parse
        :return: Record object with all relevant data
        :raises AttributeError: if the line does not match LOG_RECORD_REGEX
            (the match object is None)
        :raises ValueError: if the request does not consist of exactly three
            whitespace-separated tokens
        :raises KeyError: if the month abbreviation in the date is unknown
        """
        raw_fields = self.LOG_RECORD_REGEX.match(raw_record).groups()
        # Capture groups and log_fields are in the same order, so pair them up.
        record = dict(zip(self.log_fields, raw_fields))

        record['date'] = self._parse_date(record['date'])
        record['bytes'] = self._parse_bytes(record['bytes'])

        # The raw request is not part of Record; it is replaced by its parts.
        method, page, section, protocol = self._parse_request(record.pop('request'))
        record.update(method=method, page=page, section=section, protocol=protocol)
        return Record(**record)

    def _parse_request(self, request_str):
        """
        Splits a request string such as 'GET /api/user?id=1 HTTP/1.0'

        :param request_str str: the quoted request part of a log line
        :return: tuple ``(method, page, section, protocol)`` where ``page`` has
            its query string removed and ``section`` is the first path
            component, or 'root' when the path contains a single slash
        :raises ValueError: if the request does not split into exactly three
            whitespace-separated tokens
        """
        method, page, protocol = request_str.split()
        # Strip query params from page *before* looking for the section, so
        # that slashes inside the query string are not mistaken for path
        # separators (e.g. '/index.html?next=/a/b' is still a root page).
        page = page.split('?', 1)[0]
        sections = page.split('/')
        # One slash signifies the page is at the root
        section = sections[1] if len(sections) > 2 else 'root'
        return method, page, section, protocol

    def _parse_date(self, date_str):
        """
        Converts a fixed-width log timestamp into a naive ``datetime``

        :param date_str str: e.g. '14/Jan/2016:13:50:24 +0000'
        :return: datetime built from the date and time parts; the trailing
            timezone offset is ignored
        :raises KeyError: if the month abbreviation is not in ``month_num``
        :raises ValueError: if a numeric part cannot be converted
        """
        # Ignore the TZ since native lib can't handle it.
        # TODO: Ideally we'd use a library for extracting this
        # datetime.strptime(date_str.split()[0], '%d/%b/%Y:%H:%M:%S') would
        # also work but was found to be very slow (x10 vs slicing).
        # The date_str is fixed width so we can parse by slicing
        # e.g. 14/Jan/2016:13:50:24 +0000
        day = int(date_str[0:2])
        month = month_num[date_str[3:6]]
        year = int(date_str[7:11])
        hour = int(date_str[12:14])
        minute = int(date_str[15:17])
        second = int(date_str[18:20])
        return datetime(year, month, day, hour, minute, second)

    def _parse_bytes(self, byte_str):
        """
        Converts the bytes field to an int, treating '-' as 0

        :param byte_str str: a run of digits, or '-'
        :return: the byte count as an int
        """
        return 0 if byte_str == '-' else int(byte_str)