From a80fba505b1a9b09eff3f0d5e2a5ec626f797bfa Mon Sep 17 00:00:00 2001 From: Alain Lichnewsky Date: Sat, 13 Mar 2021 15:08:11 +0100 Subject: [PATCH 01/57] Syntactic and library changes to accomodate Python3 --- logtools/__init__.py | 38 ++++++++++++++++++++-------------- logtools/_config.py | 10 ++++++++- logtools/_filter.py | 17 ++++++++++----- logtools/_filterbots.py | 17 ++++++++++----- logtools/_flattenjson.py | 12 +++++++++-- logtools/_geoip.py | 19 ++++++++++++----- logtools/_join.py | 21 ++++++++++++------- logtools/_merge.py | 15 ++++++++++---- logtools/_parse.py | 16 ++++++++++---- logtools/_plot.py | 25 ++++++++++++++-------- logtools/_qps.py | 16 ++++++++++---- logtools/_sample.py | 15 ++++++++++---- logtools/_serve.py | 11 ++++++++-- logtools/_sumstat.py | 15 ++++++++++---- logtools/_tail.py | 15 ++++++++++---- logtools/_urlparse.py | 23 +++++++++++++------- logtools/join_backends.py | 10 ++++++++- logtools/parsers.py | 12 +++++++++-- logtools/test/test_logtools.py | 25 +++++++++++++++------- 19 files changed, 239 insertions(+), 93 deletions(-) diff --git a/logtools/__init__.py b/logtools/__init__.py index db0df3c..562fde9 100644 --- a/logtools/__init__.py +++ b/logtools/__init__.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + import logging @@ -19,19 +27,19 @@ format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) -from _config import * +from ._config import * -from _filterbots import * -from _flattenjson import * -from _geoip import * -from _join import * -from _merge import * -from _parse import * -from _urlparse import * -from _plot import * -from _qps import * -from _sample import * -from _filter import * -from _tail import * -from _sumstat import * -from _serve import * +from ._filterbots import * +from ._flattenjson import * +from ._geoip import * +from ._join import * +from ._merge import * +from ._parse import * +from ._urlparse import * +from ._plot import * +from ._qps import * +from ._sample import * +from ._filter import * +from ._tail import * +from ._sumstat import * +from ._serve import * diff --git a/logtools/_config.py b/logtools/_config.py index c61628b..9106215 100644 --- a/logtools/_config.py +++ b/logtools/_config.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ ****** + """ logtools._config @@ -20,7 +28,7 @@ import os import sys -from ConfigParser import SafeConfigParser, NoOptionError, NoSectionError +from configparser import SafeConfigParser, NoOptionError, NoSectionError __all__ = ['logtools_config', 'interpolate_config', 'AttrDict'] diff --git a/logtools/_filter.py b/logtools/_filter.py index c2b33dd..3ae26c9 100644 --- a/logtools/_filter.py +++ b/logtools/_filter.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ ****** + """ logtools._filter Filter rows based on blacklists and field matching. @@ -19,14 +27,13 @@ import sys import string import logging -from itertools import imap from functools import partial from operator import and_ from optparse import OptionParser import acora -from _config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict import logtools.parsers __all__ = ['logfilter_parse_args', 'logfilter', @@ -37,7 +44,7 @@ # character set, however might diverge slightly in case of locale- # specific character sets. _word_boundary_chars = set(string.printable)\ - .difference(string.letters)\ + .difference(string.ascii_letters)\ .difference(string.digits)\ .difference(('_',)) @@ -196,7 +203,7 @@ def _is_blacklisted_func(line): num_lines=0 num_filtered=0 num_nomatch=0 - for line in imap(lambda x: x.strip(), fh): + for line in map(lambda x: x.strip(), fh): try: is_blacklisted = _is_blacklisted_func(line) except (KeyError, ValueError): @@ -225,7 +232,7 @@ def logfilter_main(): options, args = logfilter_parse_args() if options.printlines: for line in logfilter(fh=sys.stdin, *args, **options): - print line + print(line) else: for line in logfilter(fh=sys.stdin, *args, **options): pass diff --git a/logtools/_filterbots.py b/logtools/_filterbots.py index 14859f6..dc60c8b 100644 --- a/logtools/_filterbots.py +++ b/logtools/_filterbots.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ ****** + """ logtools._filterbots Filter bots from logrows based on an ip/host and useragent blacklists. @@ -18,12 +26,11 @@ import re import sys import logging -from itertools import imap from functools import partial from operator import and_ from optparse import OptionParser -from _config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict import logtools.parsers __all__ = ['filterbots_parse_args', 'filterbots', @@ -95,7 +102,7 @@ def parse_bots_ua(bots_ua): bots_ua_suffix_dict = {} bots_ua_re = [] - for line in imap(lambda x: x.strip(), bots_ua): + for line in map(lambda x: x.strip(), bots_ua): if line.startswith("#"): # Comment line continue @@ -209,7 +216,7 @@ def _is_bot_func(line): num_lines=0 num_filtered=0 num_nomatch=0 - for line in imap(lambda x: x.strip(), fh): + for line in map(lambda x: x.strip(), fh): try: is_bot = _is_bot_func(line) except (KeyError, ValueError): @@ -238,7 +245,7 @@ def filterbots_main(): options, args = filterbots_parse_args() if options.printlines: for line in filterbots(fh=sys.stdin, *args, **options): - print line + print( line ) else: for line in filterbots(fh=sys.stdin, *args, **options): pass diff --git a/logtools/_flattenjson.py b/logtools/_flattenjson.py index bc89705..45fca18 100644 --- a/logtools/_flattenjson.py +++ b/logtools/_flattenjson.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. 
+# See files NOTICE and README.md for more details. +# +# ........................................ ****** + """ logtools._flattenjson @@ -23,7 +31,7 @@ from json import dumps, load from optparse import OptionParser -from _config import interpolate_config, AttrDict +from ._config import interpolate_config, AttrDict __all__ = ['flattenjson_parse_args', 'flattenjson', 'flattenjson_main'] @@ -53,5 +61,5 @@ def flattenjson_main(): options, args = flattenjson_parse_args() for row in flattenjson(options, args, fh=sys.stdin): if row: - print row.encode('utf-8', 'ignore') + print( row.encode('utf-8', 'ignore') ) return 0 diff --git a/logtools/_geoip.py b/logtools/_geoip.py index 2d5cbf7..ea90e16 100644 --- a/logtools/_geoip.py +++ b/logtools/_geoip.py @@ -11,6 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# Modifications limited to syntactic aspects only, no testing with geoip. +# +# ........................................ ****** + """ logtools._geoip GeoIP interoperability tool. 
@@ -20,10 +30,9 @@ import re import sys import logging -from itertools import imap from optparse import OptionParser -from _config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict __all__ = ['geoip_parse_args', 'geoip', 'geoip_main'] @@ -74,7 +83,7 @@ def geoip(fh, ip_re, **kwargs): filter_func = lambda x: \ True if x == kwargs['filter'] else False - for line in imap(lambda x: x.strip(), fh): + for line in map(lambda x: x.strip(), fh): match = ip_re.match(line) if match: ip = match.group(1) @@ -92,7 +101,7 @@ def geoip_main(): options, args = geoip_parse_args() for geocode, ip, line in geoip(fh=sys.stdin, *args, **options): if options.printline is True: - print "{0}\t{1}".format(geocode, line) + print( "{0}\t{1}".format(geocode, line)) else: - print "{0}\t{1}".format(geocode, ip) + print( "{0}\t{1}".format(geocode, ip)) return 0 diff --git a/logtools/_join.py b/logtools/_join.py index 62d8916..3da2fe0 100644 --- a/logtools/_join.py +++ b/logtools/_join.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + """ logtools._join @@ -24,13 +32,12 @@ import logging import unicodedata from time import time -from itertools import imap from datetime import datetime from optparse import OptionParser -from urlparse import parse_qs, urlsplit +from urllib.parse import parse_qs, urlsplit from logtools.join_backends import * -from _config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict __all__ = ['logjoin_parse_args', 'logjoin', 'logjoin_main'] @@ -87,16 +94,16 @@ def logjoin(fh, field, delimiter, backend, join_connect_string, }[backend](remote_fields=join_remote_fields, remote_name=join_remote_name, remote_key=join_remote_key, connect_string=join_connect_string) - for row in imap(lambda x: x.strip(), fh): + for row in map(lambda x: x.strip(), fh): key = row.split(delimiter)[field] for join_row in backend_impl.join(key): - yield key, unicode(row) + delimiter + delimiter.join(imap(unicode, join_row)) + yield key, unicode(row) + delimiter + delimiter.join(map(unicode, join_row)) def logjoin_main(): """Console entry-point""" options, args = logjoin_parse_args() for key, row in logjoin(fh=sys.stdin, *args, **options): - print >> sys.stdout, unicodedata.normalize('NFKD', unicode(row))\ - .encode('ascii','ignore') + print( unicodedata.normalize('NFKD', unicode(row)).encode('ascii','ignore'), + file = sys.stdout ) return 0 diff --git a/logtools/_merge.py b/logtools/_merge.py index 44b8eab..6a95850 100644 --- a/logtools/_merge.py +++ b/logtools/_merge.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + """ logtools._merge Logfile merging utilities. @@ -22,12 +30,11 @@ import re import sys import logging -from itertools import imap from datetime import datetime from optparse import OptionParser from heapq import heappush, heappop, merge -from _config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict import logtools.parsers __all__ = ['logmerge_parse_args', 'logmerge', 'logmerge_main'] @@ -98,7 +105,7 @@ def logmerge(options, args): else: key_func = lambda x: (extract_func(x), x) - iters = (imap(key_func, open(filename, "r")) for filename in args) + iters = ( map(key_func, open(filename, "r")) for filename in args) for k, line in merge(*iters): yield k, line.strip() @@ -107,5 +114,5 @@ def logmerge_main(): """Console entry-point""" options, args = logmerge_parse_args() for key, line in logmerge(options, args): - print line + print( line ) return 0 diff --git a/logtools/_parse.py b/logtools/_parse.py index 00448c7..c9ca639 100644 --- a/logtools/_parse.py +++ b/logtools/_parse.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ ****** + """ logtools._parse Log format parsing programmatic and command-line utilities. 
@@ -22,7 +30,7 @@ from optparse import OptionParser import logtools.parsers -from _config import interpolate_config, AttrDict +from ._config import interpolate_config, AttrDict __all__ = ['logparse_parse_args', 'logparse', 'logparse_main'] @@ -96,10 +104,10 @@ def logparse(options, args, fh): for line in fh: try: yield key_func(line) - except KeyError, exc: + except KeyError as exc: # Could not find user-specified field logging.warn("Could not match user-specified fields: %s", exc) - except ValueError, exc: + except ValueError as exc: # Could not parse the log line if options.ignore: logging.debug("Could not match fields for parsed line: %s", line) @@ -114,5 +122,5 @@ def logparse_main(): options, args = logparse_parse_args() for row in logparse(options, args, fh=sys.stdin): if row: - print row.encode('ascii', 'ignore') + print( row.encode('ascii', 'ignore') ) return 0 diff --git a/logtools/_plot.py b/logtools/_plot.py index f4286ec..22ca641 100644 --- a/logtools/_plot.py +++ b/logtools/_plot.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + """ logtools._plot Plotting methods for logfiles @@ -22,14 +30,13 @@ import locale import logging import unicodedata -from itertools import imap from random import randint from datetime import datetime from operator import itemgetter from optparse import OptionParser from abc import ABCMeta, abstractmethod -from _config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict __all__ = ['logplot_parse_args', 'logplot', 'logplot_main'] @@ -81,7 +88,7 @@ def _plot_line(self, options, args, fh): field = options.field-1 pts = [] - for l in imap(lambda x: x.strip(), fh): + for l in map(lambda x: x.strip(), fh): splitted_line = l.split(delimiter) k = float(splitted_line.pop(field)) pts.append((k, ' '.join(splitted_line))) @@ -121,7 +128,7 @@ def _plot_pie(self, options, args, fh): chart = PieChart2D(options.width, options.height) pts = [] - for l in imap(lambda x: x.strip(), fh): + for l in map(lambda x: x.strip(), fh): splitted_line = l.split(delimiter) k = int(splitted_line.pop(field)) pts.append((k, ' '.join(splitted_line), locale.format('%d', k, True))) @@ -150,7 +157,7 @@ def _plot_timeseries(self, options, args, fh): datefield = options.datefield-1 pts = [] - for l in imap(lambda x: x.strip(), fh): + for l in map(lambda x: x.strip(), fh): splitted_line = l.split(delimiter) v = float(splitted_line[field]) t = datetime.strptime(splitted_line[datefield], options.dateformat) @@ -223,7 +230,7 @@ def _plot_hist(self, options, args, fh): pts = [] max_y = -float("inf") - for l in imap(lambda x: x.strip(), fh): + for l in map(lambda x: x.strip(), fh): splitted_line = l.split(delimiter) k = float(splitted_line.pop(field)) pts.append((k, ' '.join(splitted_line))) @@ -258,7 +265,7 @@ def _plot_pie(self, options, args, fh): pts = [] ttl = 0. 
- for l in imap(lambda x: x.strip(), fh): + for l in map(lambda x: x.strip(), fh): splitted_line = l.split(delimiter) k = float(splitted_line.pop(field)) ttl += k @@ -294,7 +301,7 @@ def _plot_line(self, options, args, fh): pts = [] max_y = -float("inf") - for l in imap(lambda x: x.strip(), fh): + for l in map(lambda x: x.strip(), fh): splitted_line = l.split(delimiter) k = float(splitted_line.pop(field)) label = unicodedata.normalize('NFKD', \ @@ -330,7 +337,7 @@ def _plot_timeseries(self, options, args, fh): pts = [] max_y = -float("inf") - for l in imap(lambda x: x.strip(), fh): + for l in map(lambda x: x.strip(), fh): splitted_line = l.split(delimiter) v = float(splitted_line[field]) t = datetime.strptime(splitted_line[datefield], options.dateformat) diff --git a/logtools/_qps.py b/logtools/_qps.py index 33d4040..310006c 100644 --- a/logtools/_qps.py +++ b/logtools/_qps.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + """ logtools._qps Compute QPS estimates based on parsing of timestamps from logs on @@ -20,11 +28,10 @@ import sys import logging from time import time -from itertools import imap from datetime import datetime from optparse import OptionParser -from _config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict __all__ = ['qps_parse_args', 'qps', 'qps_main'] @@ -89,7 +96,7 @@ def qps(fh, dt_re, dateformat, window_size, ignore, **kwargs): samples.append(t0) # Run over rest of input stream - for line in imap(lambda x: x.strip(), fh): + for line in map(lambda x: x.strip(), fh): try: t = datetime.strptime(_re.match(line).groups()[0], dateformat) except (AttributeError, KeyError, TypeError, ValueError): @@ -128,6 +135,7 @@ def qps_main(): """Console entry-point""" options, args = qps_parse_args() for qps_info in qps(fh=sys.stdin, *args, **options): - print >> sys.stdout, "{start_time}\t{end_time}\t{num_samples}\t{qps:.2f}".format(**qps_info) + print ( "{start_time}\t{end_time}\t{num_samples}\t{qps:.2f}".format(**qps_info), + file = sys.stdout) return 0 diff --git a/logtools/_sample.py b/logtools/_sample.py index a1a8a92..76f6f2a 100644 --- a/logtools/_sample.py +++ b/logtools/_sample.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + """ logtools._sample Sampling tools for logfiles @@ -20,12 +28,11 @@ import re import sys import logging -from itertools import imap from random import randint, random from optparse import OptionParser from heapq import heappush, heappop, heapreplace -from _config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict __all__ = ['logsample_parse_args', 'logsample', 'logsample_weighted', 'logsample_main'] @@ -117,10 +124,10 @@ def logsample_main(): if options.weighted is True: for k, r in logsample_weighted(fh=sys.stdin, *args, **options): - print r + print( r ) else: for r in logsample(fh=sys.stdin, *args, **options): - print r + print( r ) return 0 diff --git a/logtools/_serve.py b/logtools/_serve.py index 2df5949..cfd64ca 100644 --- a/logtools/_serve.py +++ b/logtools/_serve.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + """ logtools._serve @@ -22,14 +30,13 @@ import sys import logging import wsgiref -from itertools import imap from random import randint from threading import Thread from operator import itemgetter from optparse import OptionParser from abc import ABCMeta, abstractmethod -from _config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict __all__ = ['logserve_parse_args', 'logserve', 'logserve_main'] diff --git a/logtools/_sumstat.py b/logtools/_sumstat.py index e6598ff..e4b6371 100644 --- a/logtools/_sumstat.py +++ b/logtools/_sumstat.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + """ logtools._sumstat @@ -27,12 +35,11 @@ import logging from textwrap import dedent -from itertools import imap from optparse import OptionParser from prettytable import PrettyTable -from _config import interpolate_config, AttrDict +from ._config import interpolate_config, AttrDict __all__ = ['sumstat_parse_args', 'sumstat', 'sumstat_main'] @@ -77,7 +84,7 @@ def sumstat(fh, delimiter, reverse=False, **kwargs): counts = [] N, M = 0, 0 - for line in imap(lambda x: x.strip(), fh): + for line in map(lambda x: x.strip(), fh): try: row = line.split(delimiter, 1) count = row[0] @@ -155,7 +162,7 @@ def sumstat_main(): [stat_dict['min'], stat_dict['max'], stat_dict['avg']] + \ stat_dict['percentiles'] ) - print table + print(table) S10th, S25th, S40th, S50th, S75th, S90th = stat_dict['cover'] M = stat_dict['M'] diff --git a/logtools/_tail.py b/logtools/_tail.py index f3dca57..92a92be 100644 --- a/logtools/_tail.py +++ b/logtools/_tail.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + """ logtools._tail A tail-like utility that allows tailing via time-frames and more complex @@ -20,7 +28,6 @@ import sys import string import logging -from itertools import imap from functools import partial from operator import and_ from datetime import datetime @@ -28,7 +35,7 @@ import dateutil.parser -from _config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict import logtools.parsers __all__ = ['logtail_parse_args', 'logtail', @@ -117,7 +124,7 @@ def _is_match_func(line): num_lines=0 num_filtered=0 num_nomatch=0 - for line in imap(lambda x: x.strip(), fh): + for line in map(lambda x: x.strip(), fh): try: is_match = _is_match_func(line) except (KeyError, ValueError): @@ -146,7 +153,7 @@ def logtail_main(): options, args = logtail_parse_args() if options.printlines: for line in logtail(fh=sys.stdin, *args, **options): - print line + print( line) else: for line in logtail(fh=sys.stdin, *args, **options): pass diff --git a/logtools/_urlparse.py b/logtools/_urlparse.py index de190e8..3b91926 100644 --- a/logtools/_urlparse.py +++ b/logtools/_urlparse.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + """ logtools._urlparse @@ -21,13 +29,12 @@ import sys import logging from time import time -from itertools import imap from datetime import datetime -from urllib import unquote_plus +from urllib.parse import unquote_plus +from urllib.parse import parse_qs, urlsplit from optparse import OptionParser -from urlparse import parse_qs, urlsplit -from _config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict __all__ = ['urlparse_parse_args', 'urlparse', 'urlparse_main'] @@ -71,10 +78,10 @@ def urlparse(fh, part=None, query_params=None, decode=False, **kwargs): [val.get(p, (None,))[0] for p in query_params] if decode is True: - for line in imap(lambda x: x.strip(), fh): + for line in map(lambda x: x.strip(), fh): yield unquote_plus(line) else: - for line in imap(lambda x: x.strip(), fh): + for line in map(lambda x: x.strip(), fh): url = urlsplit(line) val = { "scheme": url.scheme, @@ -95,9 +102,9 @@ def urlparse_main(): if hasattr(parsed_url, '__iter__'): # Format as tab-delimited for output parsed_url = "\t".join(parsed_url) - print parsed_url + print(parsed_url) else: # Lines where we couldnt get any match (e.g when using -q) - print '' + print ('') return 0 diff --git a/logtools/join_backends.py b/logtools/join_backends.py index 50a99f5..230eb4e 100644 --- a/logtools/join_backends.py +++ b/logtools/join_backends.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + """ logtools.join_backends Backends used by the logjoin API / tool @@ -24,7 +32,7 @@ from abc import ABCMeta, abstractmethod import json -from _config import AttrDict +from ._config import AttrDict from sqlsoup import SQLSoup diff --git a/logtools/parsers.py b/logtools/parsers.py index 1c9b53a..48b62d2 100644 --- a/logtools/parsers.py +++ b/logtools/parsers.py @@ -26,8 +26,16 @@ from datetime import datetime from abc import ABCMeta, abstractmethod import json +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ ****** + -from _config import AttrDict +from ._config import AttrDict __all__ = ['multikey_getter_gen', 'unescape_json', 'LogParser', 'JSONParser', 'LogLine', 'AccessLog', 'CommonLogFormat', 'uWSGIParser'] @@ -176,7 +184,7 @@ def parse(self, logline): """ try: match = self.fieldselector.match(logline) - except AttributeError, exc: + except AttributeError as exc: raise AttributeError("%s needs a valid format string (--format)" % \ self.__class__.__name__ ) diff --git a/logtools/test/test_logtools.py b/logtools/test/test_logtools.py index e9b68ec..7781301 100755 --- a/logtools/test/test_logtools.py +++ b/logtools/test/test_logtools.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# ........................................ NOTICE +# +# This file has been derived and modified from a source licensed under Apache Version 2.0. +# See files NOTICE and README.md for more details. +# +# ........................................ 
****** + """Unit-test code for logtools""" import os @@ -184,7 +192,8 @@ def testGeoIP(self): try: import GeoIP except ImportError: - print >> sys.stderr, "GeoIP Python package not available - skipping geoip unittest." + print( "GeoIP Python package not available - skipping geoip unittest.", + file = sys.stderr) return output = [(geocode, ip, line) for geocode, ip, line in geoip(fh=self.fh, **self.options)] @@ -195,7 +204,8 @@ def testFilter(self): try: import GeoIP except ImportError: - print >> sys.stderr, "GeoIP Python package not available - skipping geoip unittest." + print ("GeoIP Python package not available - skipping geoip unittest.", + file = sys.stderr) return # Check positive filter @@ -262,7 +272,7 @@ def testACWB(self): for l in logfilter(self.testset, blacklist=self.blacklist, field=1, delimiter="\t", with_acora=True, ignorecase=False, word_boundaries=True): - #print l + #print(l) lines += 1 self.assertEquals(lines, self.exp_emitted_wb, "Number of lines emitted was not as expected: %s (Expected: %s)" % (lines, self.exp_emitted_wb)) @@ -273,7 +283,7 @@ def testAC(self): for l in logfilter(self.testset, blacklist=self.blacklist, field=1, delimiter="\t", with_acora=True, ignorecase=False, word_boundaries=False): - #print l + #print(l) lines += 1 self.assertEquals(lines, self.exp_emitted, "Number of lines emitted was not as expected: %s (Expected: %s)" % (lines, self.exp_emitted)) @@ -284,7 +294,7 @@ def testRE(self): for l in logfilter(self.testset, blacklist=self.blacklist, field=1, delimiter="\t", with_acora=False, ignorecase=False, word_boundaries=False): - #print l + #print( l) lines += 1 self.assertEquals(lines, self.exp_emitted, "Number of lines emitted was not as expected: %s (Expected: %s)" % (lines, self.exp_emitted)) @@ -295,7 +305,7 @@ def testREWB(self): for l in logfilter(self.testset, blacklist=self.blacklist, field=1, delimiter="\t", with_acora=False, ignorecase=False, word_boundaries=True): - #print l + #print( l) lines += 1 
self.assertEquals(lines, self.exp_emitted_wb, "Number of lines emitted was not as expected: %s (Expected: %s)" % (lines, self.exp_emitted_wb)) @@ -395,7 +405,8 @@ def testGChart(self): try: import pygooglechart except ImportError: - print >> sys.stderr, "pygooglechart Python package not available - skipping logplot gchart unittest." + print( "pygooglechart Python package not available - skipping logplot gchart unittest.", + file = sys.stderr) return options = AttrDict({ 'backend': 'gchart', From da32ad81e15a9bca4c5af58ea87f5145e5bd8e01 Mon Sep 17 00:00:00 2001 From: Alain Lichnewsky Date: Sat, 13 Mar 2021 16:22:47 +0100 Subject: [PATCH 02/57] About to look at QPS issue, only 1 failed test --- logtools/_filterbots.py | 2 +- logtools/_join.py | 6 +- logtools/_plot.py | 6 +- logtools/parsers.py | 6 +- logtools/test/test_logtools.py | 111 +++++++++++++++++++-------------- 5 files changed, 73 insertions(+), 58 deletions(-) diff --git a/logtools/_filterbots.py b/logtools/_filterbots.py index dc60c8b..455c9b9 100644 --- a/logtools/_filterbots.py +++ b/logtools/_filterbots.py @@ -26,7 +26,7 @@ import re import sys import logging -from functools import partial +from functools import partial, reduce from operator import and_ from optparse import OptionParser diff --git a/logtools/_join.py b/logtools/_join.py index 3da2fe0..a4512c7 100644 --- a/logtools/_join.py +++ b/logtools/_join.py @@ -87,7 +87,7 @@ def logjoin(fh, field, delimiter, backend, join_connect_string, """Perform a join""" field = field-1 - delimiter = unicode(delimiter) + delimiter = str(delimiter) backend_impl = { "sqlalchemy": SQLAlchemyJoinBackend @@ -97,13 +97,13 @@ def logjoin(fh, field, delimiter, backend, join_connect_string, for row in map(lambda x: x.strip(), fh): key = row.split(delimiter)[field] for join_row in backend_impl.join(key): - yield key, unicode(row) + delimiter + delimiter.join(map(unicode, join_row)) + yield key, str(row) + delimiter + delimiter.join(map(str, join_row)) def 
logjoin_main(): """Console entry-point""" options, args = logjoin_parse_args() for key, row in logjoin(fh=sys.stdin, *args, **options): - print( unicodedata.normalize('NFKD', unicode(row)).encode('ascii','ignore'), + print( unicodedata.normalize('NFKD', str(row)).encode('ascii','ignore'), file = sys.stdout ) return 0 diff --git a/logtools/_plot.py b/logtools/_plot.py index 22ca641..880ee44 100644 --- a/logtools/_plot.py +++ b/logtools/_plot.py @@ -113,7 +113,7 @@ def _plot_line(self, options, args, fh): # Axis labels chart.set_axis_labels(Axis.BOTTOM, labels) - left_axis = range(0, max_y + 1, 25) + left_axis = list( range(0, max_y + 1, 25) ) left_axis[0] = '' chart.set_axis_labels(Axis.LEFT, left_axis) @@ -144,7 +144,7 @@ def _plot_pie(self, options, args, fh): chart.add_data(data) chart.set_pie_labels(labels) if options.get('legend', None) is True: - chart.set_legend(map(str, legend)) + chart.set_legend( list (map(str, legend))) return chart @@ -305,7 +305,7 @@ def _plot_line(self, options, args, fh): splitted_line = l.split(delimiter) k = float(splitted_line.pop(field)) label = unicodedata.normalize('NFKD', \ - unicode(' '.join(splitted_line), 'utf-8')).encode('ascii','ignore') + str(' '.join(splitted_line), 'utf-8')).encode('ascii','ignore') pts.append((k, label)) if k > max_y: max_y = k diff --git a/logtools/parsers.py b/logtools/parsers.py index 48b62d2..e2f7ff5 100644 --- a/logtools/parsers.py +++ b/logtools/parsers.py @@ -49,11 +49,11 @@ def multikey_getter_gen(parser, keys, is_indices=False, delimiter="\t"): def multikey_getter(line, parser, keyset): data = parser(line.strip()) - return delimiter.join((unicode(data[k]) for k in keyset)) + return delimiter.join((str(data[k]) for k in keyset)) def multiindex_getter(line, parser, keyset): data = parser(line.strip()) - return delimiter.join((unicode(data.by_index(idx-1, raw=True)) for idx in keys)) + return delimiter.join((str(data.by_index(idx-1, raw=True)) for idx in keys)) if is_indices is True: # Field 
indices @@ -149,7 +149,7 @@ def parse(self, line): self._logline_wrapper.fieldnames = parsed_row.keys() data.clear() - for k, v in parsed_row.iteritems(): + for k, v in parsed_row.items(): data[k] = v return data diff --git a/logtools/test/test_logtools.py b/logtools/test/test_logtools.py index 7781301..62c4eb0 100755 --- a/logtools/test/test_logtools.py +++ b/logtools/test/test_logtools.py @@ -27,7 +27,7 @@ import logging from tempfile import mkstemp from datetime import datetime -from StringIO import StringIO +from io import StringIO from operator import itemgetter from logtools import (filterbots, logfilter, geoip, logsample, logsample_weighted, @@ -58,13 +58,13 @@ def testUrlParse(self): i=0 for row in urlparse(StringIO('\n'.join(self.rows)+'\n'), part='netloc'): i+=1 - self.assertEquals(i, len(self.rows), \ + self.assertEqual(i, len(self.rows), \ "Number of rows output is not equal to input size") def testMultipleQueryParams(self): url = "http://www.mydomain.com/my/path/myfile?myparam1=myval1&myparam2=myval2" for row in urlparse(StringIO(url+"\n"), part='query', query_params='myparam1,myparam2'): - self.assertEquals(row[0], 'myval1', "Returned query param value was not as expected: %s" % \ + self.assertEqual(row[0], 'myval1', "Returned query param value was not as expected: %s" % \ row) @@ -86,7 +86,7 @@ def testJSONParser(self): parser = JSONParser() for logrow in self.json_rows: parsed = parser(logrow) - self.assertNotEquals(parsed, None, "Could not parse line: %s" % str(logrow)) + self.assertNotEqual(parsed, None, "Could not parse line: %s" % str(logrow)) def testAccessLog(self): parser = AccessLog() @@ -94,33 +94,33 @@ def testAccessLog(self): self.assertRaises(ValueError, parser, 'example for invalid format') for logrow in self.clf_rows: parsed = parser(logrow) - self.assertNotEquals(parsed, None, "Could not parse line: %s" % str(logrow)) + self.assertNotEqual(parsed, None, "Could not parse line: %s" % str(logrow)) def testCommonLogFormat(self): parser = 
CommonLogFormat() self.assertRaises(ValueError, parser, 'example for invalid format') for logrow in self.clf_rows: parsed = parser(logrow) - self.assertNotEquals(parsed, None, "Could not parse line: %s" % str(logrow)) + self.assertNotEqual(parsed, None, "Could not parse line: %s" % str(logrow)) def testuWSGIParser(self): parser = uWSGIParser() for logrow in self.uwsgi_rows: parsed = parser(logrow) - self.assertNotEquals(parsed, None, "Could not parse line: %s" % logrow) + self.assertNotEqual(parsed, None, "Could not parse line: %s" % logrow) def testLogParse(self): options = AttrDict({'parser': 'CommonLogFormat', 'field': 4, 'header': False}) fh = StringIO('\n'.join(self.clf_rows)) output = [l for l in logparse(options, None, fh)] - self.assertEquals(len(output), len(self.clf_rows), "Output size was not equal to input size!") + self.assertEqual(len(output), len(self.clf_rows), "Output size was not equal to input size!") def testMultiKeyGetter(self): parser = parser = CommonLogFormat() func = multikey_getter_gen(parser, keys=(1,2), is_indices=True) fh = StringIO('\n'.join(self.clf_rows)) output = [func(l) for l in fh] - self.assertEquals(len(output), len(self.clf_rows), "Output size was not equal to input size!") + self.assertEqual(len(output), len(self.clf_rows), "Output size was not equal to input size!") class FilterBotsTestCase(unittest.TestCase): @@ -168,13 +168,13 @@ def testParserFiltering(self): i=0 for l in filterbots(fh=self.json_fh, **json_options): i+=1 - self.assertEquals(i, 1, "filterbots output size different than expected: %s" % str(i)) + self.assertEqual(i, 1, "filterbots output size different than expected: %s" % str(i)) def testRegExpFiltering(self): i=0 for l in filterbots(fh=self.fh, **self.options): i+=1 - self.assertEquals(i, 1, "filterbots output size different than expected: %s" % str(i)) + self.assertEqual(i, 1, "filterbots output size different than expected: %s" % str(i)) class GeoIPTestCase(unittest.TestCase): @@ -197,7 +197,7 @@ def 
testGeoIP(self): return output = [(geocode, ip, line) for geocode, ip, line in geoip(fh=self.fh, **self.options)] - self.assertEquals(len(output), 2, "Output size was different than expected: %s" % str(len(output))) + self.assertEqual(len(output), 2, "Output size was different than expected: %s" % str(len(output))) def testFilter(self): """Test GeoIP filtering functionality""" @@ -211,7 +211,7 @@ def testFilter(self): # Check positive filter self.options['filter'] = 'United States' output = [(geocode, ip, line) for geocode, ip, line in geoip(fh=self.fh, **self.options)] - self.assertEquals(len(output), 2, "Output size was different than expected: %s" % str(len(output))) + self.assertEqual(len(output), 2, "Output size was different than expected: %s" % str(len(output))) # Check negative filter self.options['filter'] = 'India' @@ -235,12 +235,12 @@ def setUp(self): def testUniformSampling(self): output = [r for r in logsample(fh=self.fh, **self.options)] - self.assertEquals(len(output), self.options.num_samples, + self.assertEqual(len(output), self.options.num_samples, "logsample output size different than expected: %s" % len(output)) def testWeightedSampling(self): output = [(k, r) for k, r in logsample_weighted(fh=self.fh, **self.weighted_opts)] - self.assertEquals(len(output), self.weighted_opts.num_samples, + self.assertEqual(len(output), self.weighted_opts.num_samples, "logsample output size different than expected: %s" % len(output)) class FilterTestCase(unittest.TestCase): @@ -274,7 +274,7 @@ def testACWB(self): word_boundaries=True): #print(l) lines += 1 - self.assertEquals(lines, self.exp_emitted_wb, "Number of lines emitted was not as expected: %s (Expected: %s)" % + self.assertEqual(lines, self.exp_emitted_wb, "Number of lines emitted was not as expected: %s (Expected: %s)" % (lines, self.exp_emitted_wb)) def testAC(self): @@ -285,7 +285,7 @@ def testAC(self): word_boundaries=False): #print(l) lines += 1 - self.assertEquals(lines, self.exp_emitted, "Number 
of lines emitted was not as expected: %s (Expected: %s)" % + self.assertEqual(lines, self.exp_emitted, "Number of lines emitted was not as expected: %s (Expected: %s)" % (lines, self.exp_emitted)) def testRE(self): @@ -296,7 +296,7 @@ def testRE(self): word_boundaries=False): #print( l) lines += 1 - self.assertEquals(lines, self.exp_emitted, "Number of lines emitted was not as expected: %s (Expected: %s)" % + self.assertEqual(lines, self.exp_emitted, "Number of lines emitted was not as expected: %s (Expected: %s)" % (lines, self.exp_emitted)) def testREWB(self): @@ -307,7 +307,7 @@ def testREWB(self): word_boundaries=True): #print( l) lines += 1 - self.assertEquals(lines, self.exp_emitted_wb, "Number of lines emitted was not as expected: %s (Expected: %s)" % + self.assertEqual(lines, self.exp_emitted_wb, "Number of lines emitted was not as expected: %s (Expected: %s)" % (lines, self.exp_emitted_wb)) @@ -322,48 +322,60 @@ def tearDown(self): os.remove(fname) def testNumericMerge(self): - os.write(self.tempfiles[0][0], "\n".join(['1 one', '5 five', '300 threehundred', - '500 fivehundred'])) - os.write(self.tempfiles[1][0], "\n".join(['-1 minusone', '0 zero', - '670 sixhundredseventy' ,'1000 thousand'])) - os.write(self.tempfiles[2][0], "\n".join(['3 three', '22 twentytwo', '80 eighty'])) + t1 =['1 one', '5 five', '300 threehundred', + '500 fivehundred'] + os.write(self.tempfiles[0][0], "\n".join(t1).encode()) + t2 = ['-1 minusone', '0 zero', + '670 sixhundredseventy' ,'1000 thousand'] + os.write(self.tempfiles[1][0], "\n".join(t2).encode()) + t3= ['3 three', '22 twentytwo', '80 eighty'] + os.write(self.tempfiles[2][0], "\n".join(t3).encode()) options = AttrDict({'delimiter': ' ', 'field': 1, 'numeric': True }) output = [(k, l) for k, l in logmerge(options, self.args)] - self.assertEquals(len(output), 11, "Output size was not equal to input size!") - self.assertEquals(map(itemgetter(0), output), sorted(map(lambda x: int(x[0]), output)), + self.assertEqual(len(output), 
11, "Output size was not equal to input size!") + self.assertEqual( list( map(itemgetter(0), output)), + sorted ( list( map( lambda x: int(x[0]), output))), "Output was not numerically sorted!") def testDateMerge(self): - os.write(self.tempfiles[0][0], "\n".join(['2010/01/12 07:00:00,one', '2010/01/12 08:00:00,five', - '2010/01/13 10:00:00,threehundred'])) - os.write(self.tempfiles[1][0], "\n".join(['2010/01/12 07:30:00,one', '2010/01/12 08:10:00,five', - '2010/01/12 21:00:00,threehundred'])) - os.write(self.tempfiles[2][0], "\n".join(['2010/01/11 05:33:03,one', '2010/01/12 03:10:00,five', - '2010/01/21 22:00:00,threehundred'])) + t1 = ['2010/01/12 07:00:00,one', '2010/01/12 08:00:00,five', + '2010/01/13 10:00:00,threehundred'] + os.write(self.tempfiles[0][0], "\n".join(t1).encode()) + t2 =['2010/01/12 07:30:00,one', '2010/01/12 08:10:00,five', + '2010/01/12 21:00:00,threehundred'] + os.write(self.tempfiles[1][0], "\n".join(t2).encode()) + t3 = ['2010/01/11 05:33:03,one', '2010/01/12 03:10:00,five', + '2010/01/21 22:00:00,threehundred'] + os.write(self.tempfiles[2][0], "\n".join(t3).encode()) dateformat = '%Y/%m/%d %H:%M:%S' options = AttrDict({'delimiter': ',', 'field': 1, 'datetime': True, 'dateformat': dateformat }) output = [(k, l) for k, l in logmerge(options, self.args)] - self.assertEquals(len(output), 9, "Output size was not equal to input size!") - self.assertEquals(map(itemgetter(0), output), sorted(map(itemgetter(0), output)), + self.assertEqual(len(output), 9, "Output size was not equal to input size!") + self.assertEqual( list( map(itemgetter(0), output)), + sorted( list( map(itemgetter(0), output))), "Output was not time sorted!") def testLexicalMerge(self): - os.write(self.tempfiles[0][0], "\n".join(['1 one', '300 threehundred', '5 five', - '500 fivehundred'])) - os.write(self.tempfiles[1][0], "\n".join(['-1 minusone', '0 zero', '1000 thousand', - '670 sixhundredseventy'])) - os.write(self.tempfiles[2][0], "\n".join(['22 twentytwo', '3 three', - '80 
eighty'])) + t1 = ['1 one', '300 threehundred', '5 five', + '500 fivehundred'] + os.write(self.tempfiles[0][0], "\n".join(t1).encode()) + t2 = ['-1 minusone', '0 zero', '1000 thousand', + '670 sixhundredseventy'] + os.write(self.tempfiles[1][0], "\n".join(t2).encode()) + t3 = ['22 twentytwo', '3 three', + '80 eighty'] + os.write(self.tempfiles[2][0], "\n".join(t3).encode()) options = AttrDict({ 'delimiter': ' ', 'field': 1, 'numeric': False }) output = [(k, l) for k, l in logmerge(options, self.args)] - self.assertEquals(len(output), 11, "Output size was not equal to input size!") - self.assertEquals(map(itemgetter(0), output), sorted(map(itemgetter(0), output)), + self.assertEqual(len(output), 11, "Output size was not equal to input size!") + self.assertEqual( list( map(itemgetter(0), output)), + sorted( list( map(itemgetter(0), output))), "Output was not lexically sorted!") @@ -387,10 +399,13 @@ def setUp(self): def testQps(self): blocks=0 qs=[] - for q in qps(fh=self.fh, **self.options): + qpsVal = list( qps(fh=self.fh, **self.options)) + sys.stderr.write(f"In testQps, qpsVal ({type(qpsVal)}):\t{qpsVal}\n") + for q in qpsVal: blocks+=1 qs.append(q) - self.assertEquals(blocks, 3, "qps output size different than expected: %s" % str(blocks)) + self.assertEqual(blocks, 3, + "qps output size different than expected: %s" % str(blocks)) class PlotTestCase(unittest.TestCase): @@ -423,7 +438,7 @@ def testGChart(self): self.fh.seek(0) options['type'] = plot_type chart = logplot(options, None, self.fh) - self.assertNotEquals(chart, None, "logplot returned None. Expected a Plot object") + self.assertNotEqual(chart, None, "logplot returned None. 
Expected a Plot object") # Should raise ValueError here due to fh being at EOF self.assertRaises(ValueError, logplot, options, None, self.fh) @@ -448,9 +463,9 @@ def setUp(self): def testSumstat(self): stat = sumstat(fh=self.data, delimiter=' ', reverse=True) - self.assertEquals(stat['M'], self.M) - self.assertEquals(stat['N'], self.N) - self.assertEquals(stat['avg'], self.avg) + self.assertEqual(stat['M'], self.M) + self.assertEqual(stat['N'], self.N) + self.assertEqual(stat['avg'], self.avg) if __name__ == "__main__": unittest.main() From 41f75579b0ac258b1c9158652f7f179154c43c77 Mon Sep 17 00:00:00 2001 From: Alain Lichnewsky Date: Sat, 13 Mar 2021 21:47:47 +0100 Subject: [PATCH 03/57] Corrected issue with faiiing test in "qps": problematic interaction between locale and datetime.strptime Passes all tests with Python3 Python 3.8.6 (in locale FR_fr) --- logtools/__init__.py | 1 - logtools/_plot.py | 4 +++- logtools/_qps.py | 11 ++++++++--- logtools/_sumstat.py | 4 +++- logtools/test/test_logtools.py | 6 ++++-- 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/logtools/__init__.py b/logtools/__init__.py index 562fde9..3795c47 100644 --- a/logtools/__init__.py +++ b/logtools/__init__.py @@ -28,7 +28,6 @@ ) from ._config import * - from ._filterbots import * from ._flattenjson import * from ._geoip import * diff --git a/logtools/_plot.py b/logtools/_plot.py index 880ee44..1b8846d 100644 --- a/logtools/_plot.py +++ b/logtools/_plot.py @@ -40,7 +40,9 @@ __all__ = ['logplot_parse_args', 'logplot', 'logplot_main'] -locale.setlocale(locale.LC_ALL, "") +# problematic in my environment, +if False: + locale.setlocale(locale.LC_ALL, "") class PlotBackend(object): __metaclass__ = ABCMeta diff --git a/logtools/_qps.py b/logtools/_qps.py index 310006c..f2e20e5 100644 --- a/logtools/_qps.py +++ b/logtools/_qps.py @@ -69,6 +69,7 @@ def qps_parse_args(): return AttrDict(options.__dict__), args + def qps(fh, dt_re, dateformat, window_size, ignore, **kwargs): 
"""Calculate QPS from input stream based on parsing of timestamps and using a sliding time window""" @@ -83,8 +84,11 @@ def qps(fh, dt_re, dateformat, window_size, ignore, **kwargs): if not line: return try: - t = datetime.strptime(_re.match(line).groups()[0], dateformat) - except (AttributeError, KeyError, TypeError, ValueError): + mstr = _re.match(line).groups()[0] + t = datetime.strptime(mstr, dateformat) + except (AttributeError, KeyError, TypeError, ValueError) as err: + sys.stderr.write(f"In qps: Exception in line:{line[:-1]}\n\t{err}\n\t{type(err)}\n") + if ignore: logging.debug("Could not match datefield for parsed line: %s", line) continue @@ -99,7 +103,8 @@ def qps(fh, dt_re, dateformat, window_size, ignore, **kwargs): for line in map(lambda x: x.strip(), fh): try: t = datetime.strptime(_re.match(line).groups()[0], dateformat) - except (AttributeError, KeyError, TypeError, ValueError): + except (AttributeError, KeyError, TypeError, ValueError) as err: + sys.stderr.write(f"In qps: Exception in line:{line[:-1]}\n\t{err}\n\t{type(er)}\n") if ignore: logging.debug("Could not match datefield for parsed line: %s", line) continue diff --git a/logtools/_sumstat.py b/logtools/_sumstat.py index e4b6371..8a19e4c 100644 --- a/logtools/_sumstat.py +++ b/logtools/_sumstat.py @@ -44,7 +44,9 @@ __all__ = ['sumstat_parse_args', 'sumstat', 'sumstat_main'] -locale.setlocale(locale.LC_ALL, "") +# problematic in my environment +if False: + locale.setlocale(locale.LC_ALL, "") def arith_mean(values): diff --git a/logtools/test/test_logtools.py b/logtools/test/test_logtools.py index 62c4eb0..1a36a6c 100755 --- a/logtools/test/test_logtools.py +++ b/logtools/test/test_logtools.py @@ -347,7 +347,7 @@ def testDateMerge(self): '2010/01/12 21:00:00,threehundred'] os.write(self.tempfiles[1][0], "\n".join(t2).encode()) t3 = ['2010/01/11 05:33:03,one', '2010/01/12 03:10:00,five', - '2010/01/21 22:00:00,threehundred'] + '2010/01/21 22:00:00,threehundred'] os.write(self.tempfiles[2][0], 
"\n".join(t3).encode()) dateformat = '%Y/%m/%d %H:%M:%S' @@ -378,7 +378,9 @@ def testLexicalMerge(self): sorted( list( map(itemgetter(0), output))), "Output was not lexically sorted!") - +# +# QPS: Queries Per Second +# class QPSTestCase(unittest.TestCase): def setUp(self): self.options = AttrDict({ From 1aec89c155f71a8a9eb0a5c7a869ab646394457a Mon Sep 17 00:00:00 2001 From: Alain Lichnewsky Date: Tue, 16 Mar 2021 16:40:55 +0100 Subject: [PATCH 04/57] Made installable with setup (python3, virtualenv), logging level configurable, added documentation --- AUTHORS | 1 + MORE-DOC.md | 90 ++++++++++++++++++++++++++ PYTHON3-README.md | 79 +++++++++++++++++++++++ README.md | 8 ++- aux/testStrptime.py | 138 ++++++++++++++++++++++++++++++++++++++++ logtools/__init__.py | 5 -- logtools/_config.py | 22 +++++++ logtools/_filterbots.py | 16 ++++- requirements.txt | 9 +++ setup.py | 10 +-- 10 files changed, 366 insertions(+), 12 deletions(-) create mode 100644 MORE-DOC.md create mode 100644 PYTHON3-README.md create mode 100755 aux/testStrptime.py create mode 100644 requirements.txt diff --git a/AUTHORS b/AUTHORS index 7d648c3..184845d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1 +1,2 @@ Adam Ever-Hadani +Alain Lichnewsky ## Port to Python 3 diff --git a/MORE-DOC.md b/MORE-DOC.md new file mode 100644 index 0000000..f0eb18c --- /dev/null +++ b/MORE-DOC.md @@ -0,0 +1,90 @@ +# More documentation extracted from source + +Included here is additional information extracted by reading the code. + + +## Per File / Script + +### `logtools/parsers.py` + + +#### Formats supported + +1. Apache access_log logfile parser. See + http://httpd.apache.org/docs/1.3/logs.html#accesslog" + - supported by `class AccessLog` + - includes the Common Log Format + - example: `127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /a.gif HTTP/1.0" 200 2326` + +2. JSON log format + - supported by `class JSONParser` + +3. 
uWSGI log format + - uWSGI is a software application that "aims at developing a full stack for + building hosting services". + - https://en.wikipedia.org/wiki/UWSGI + - the log format is described at https://uwsgi-docs.readthedocs.io/en/latest/LogFormat.html + - supported by `class uWSGIParser` + + +### `logtools/_filter.py` +Provides `logfilter` script/command. + - Filter rows based on blacklists and field matching. + - Several parser possibilities + + +### `logtools/_filterbots.py` +Provides `filterbots` script/command. + - Filter rows based on blacklists; match IP and User-Agent fields. + - Several parser possibilities + +### `logtools._flattenjson` + +Extracts objects (dictionaries) from inside a JSON list; + - Useful when piping into tools such as `json2csv` which expect "flat" json streams. + +### `logtools._join` + +Perform a join between log stream and some other arbitrary source of data. + - Can be used with pluggable drivers e.g to join against database, other files etc. + + +### `logtools._merge` + +Logfile merging utilities. + - These typically help in streaming multiple individually sorted input logfiles + through, outputting them in combined sorted order (typically by date field) + + +### `logtools._plot` +Plotting methods for logfiles + +### `logtools._qps` +Compute QPS (Query Per Second) estimates based on parsing of timestamps from logs on +sliding time windows. + +### `logtools._sample` +Sampling tools for logfiles + +### `logtools._serve` + +Miniature web server for delivering real-time OLAP-style log stats. + + +### `logtools._sumstat` + +Generates summary statistics for a given logfile of the form: +` ` + + - logfile is expected to be pre-sorted by count. + +### `logtools._tail` + +A tail-like utility that allows tailing via time-frames and more complex +expressions. + + +### `logtools._urlparse` + +Parses URLs, Decodes query parameters,and allows some selection on URL parts. 
+ diff --git a/PYTHON3-README.md b/PYTHON3-README.md new file mode 100644 index 0000000..ece71dd --- /dev/null +++ b/PYTHON3-README.md @@ -0,0 +1,79 @@ +# Python-3 port + +## Note + +These are short notes documenting the Python-3 port, which is still to +be considered experimental. + +### Intent and issues found + +The idea was to do a straightforward port to Python-3, since I wanted to +use the package with the native Python-3 on my Ubuntu 20.10 Linux. + +Following issues were encountered: + +- parts concerning `geoIP` have not been ported or tested, therefore are deemed + not functional + +- the `sqlsoup` package uses features of `SQLAlchemy` ( `MapperExtension` ) which have + been deprecated (then suppressed since version 0.7; see + https://docs.sqlalchemy.org/en/13/orm/deprecated.html ). + Current versions are + `sqlsoup 0.9.1` and `SQLAlchemy 1.4.0b3`. This port has been made requiring specific + versions under `virtualenv`; `setup.py`has been changed accordingly. A file + [requirements.txt](./requirements.txt) has been added to document this, and + can be used with `pip3`. + +- the package's usage of `datetime.strptime` in my locale `"fr_FR.UTF-8"`was found + problematic ( `testQps` fails when parsing date `11/Oct/2000:14:01:14 -0700` + which is fine in my locale) : + disabled statements `locale.setlocale(locale.LC_ALL, "")`in + `_qps.py` and `_plot.py`. + The directory `aux` has been added with script `testStrptime.py` to test + under different locales. + +### Added functionality + +1. 
added CLI flags to customize the level of logging; not customizable from + ~/.logtoolsrc (propagates slowly to various entries) + + +### Test and operative environment + + - a `virtualenv`environment has been set up, requiring Python 3.8.6, which happens + to be the native Python-3 on my system: `virtualenv -p 3.8.6` + - it has been populated according to requirements + - installation and use of the package are all performed under this environment + +### Installation + + This may be done as follows: + + - setup the `virtualenv` environment + - change directory to the package (where `setup.py` is found) + - run `setup.py` using the python interpreter in the `virtualenv` environment: + + +``` + # establish virtualenv + . venvSandBox/bin/activate + # keep track of wd and cd to source + v=`pwd` + pushd ~/src/logtools/ + # install proper + $v/venvSandBox/bin/python3 setup.py install +``` + + ### First experiments + + - configuration: see `~/.logtoolsrc` + + - filterbots`: + + ``` +touch bots_hosts.txt # File designated in ~/.logtoolsrc +touch bots_useragents.txt # File designated in ~/.logtoolsrc +cat /var/log/auth.log | filterbots --print + ``` + + diff --git a/README.md b/README.md index 5277c31..8237b45 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ This software is distributed under the Apache 2.0 license. ## Installation - +### Python2 To install this package and associated console scripts, unpack the distributable tar file, or check out the project directory, and then run: @@ -60,7 +60,13 @@ If for some reason setuptools does not install 'nose' by default, you can easily install it if you have setuptools installed by running: easy_install nose + +### Python3 + +Concerning the Python3 port, see [PYTHON3-README.md](./PYTHON3-README.md) +### Supplemental documentation +See [MORE-DOC.md](./MORE-DOC.md). 
## Console Scripts diff --git a/aux/testStrptime.py b/aux/testStrptime.py new file mode 100755 index 0000000..585d54e --- /dev/null +++ b/aux/testStrptime.py @@ -0,0 +1,138 @@ +#!/usr/bin/python3 + +import os +import sys +import locale +import argparse +import traceback + +from collections.abc import Iterable + +from datetime import datetime +import pandas as PAN + + +def testDates(locKey): + "Check datetime.strptime using data in global table; show effects of locale" + if locKey not in errDict: + errDict[locKey] = 0 + for strDte, fmt, val in testDatesTb: + try : + t = datetime.strptime( strDte, fmt) + except Exception as err: + print(f"Unable to recover date from '{strDte}'\n\tformat:'{fmt}'" , + file=sys.stderr) + errDict[locKey] += 1 + else: + print( strDte, fmt, val, t) + +def testDateMain(): + "Check datetime.strptime for locales in global table testLocales" + print( f"Default locale" ) + testDates(None) + print( f"... default "+ ". "*30 +"\n" ) + + for loc in testLocales: + print( f"Setting locale: {loc}" ) + locale.setlocale(locale.LC_ALL, loc) + testDates(loc) + print( f"... {loc:12s} "+ ". 
"*30 +"\n" ) + + print( f"Error summary\n\t{errDict}") + + +def localeAttribs(loc): + "Collect attributes of locale" + locale.setlocale(locale.LC_ALL, loc) + lcDict = locale.localeconv() + for x in locKeys: + lcDict[ x[1] ] = locale.nl_langinfo(x[0]) + rDict = {} + for k in lcDict: + v = lcDict[k] + if not isinstance(v,str) and isinstance(v, Iterable): + rDict[k] = [ "[ " + ", ".join( map ( str, v)) + "]" ] + else: + rDict[k] = [v] + return rDict + +# +# Note the reason for putting dictionnary values in lists (as above) is described +# in https://stackoverflow.com/questions/57631895/dictionary-to-dataframe-error-if-using-all-scalar-values-you-must-pass-an-ind +# + +def testLocaleAttr(): + "Collect locale attributes for a set of locale in global table testLocales" + dfList = [] + for loc in testLocales: + attrs = localeAttribs(loc) + attrs["LOCALE"] = loc + df = PAN.DataFrame(data = attrs).transpose() + dfList.append( df ) + dfGlob = PAN.concat(dfList, axis = 1 ) + dfGlob.columns = dfGlob.loc["LOCALE",:] + df1 = dfGlob.drop("LOCALE", axis = 0) + return df1 + +if __name__ == '__main__': + description =""" + This program performs a check on the ability to extract date and time using + strptime depending on the locale configuration. Also, it documents some + of the attributes customized by setting locale. 
+ """ + + testDatesTb= ( ('10/Oct/2000:13:57:01', '%d/%b/%Y:%H:%M:%S', None), + ('10/Oct/2000:13:57:01 -0700', '%d/%b/%Y:%H:%M:%S -0700', None) + ) + + testLocales= ( "", "fr_FR.UTF-8", "en_US.UTF-8", "en_GB.UTF-8", "C") + + errDict = {} + + locKeys= ( + (locale.D_T_FMT, 'locale.D_T_FMT'), + (locale.CODESET, 'locale.CODESET'), + (locale.D_FMT, 'locale.D_FMT'), + (locale.T_FMT, 'locale.T_FMT'), + (locale.T_FMT_AMPM, 'locale.T_FMT_AMPM')) + + + def mainPgm(): + argLineParser = argparse.ArgumentParser( + description = description, + formatter_class=argparse.RawDescriptionHelpFormatter ) + + argLineParser.add_argument("-v","--verbose" ,action="store_true", + dest="doVerbose", + help="Verbose printout of debug oriented messages on stderr") + argLineParser.add_argument("-d","--debug" ,action="store_true", + dest="doDebug", + help="Debug messages on stderr") + + argLineParser.add_argument("-c","--check" ,action="store_true", + dest="doCheck", + help="Checks ability to read time for data and formats in internal table") + argLineParser.add_argument("-p","--print" ,action="store_true", + dest="doPrint", + help="Prints customizable attributes for a set of locales in internal table") + + try: + options = argLineParser.parse_args() + if options.doDebug: + sys.stderr.write (f"options:{repr(options)}\n") + + if options.doCheck: + testDateMain() + if options.doPrint: + print( testLocaleAttr() ) + + + except Exception: + sys.stderr.write ( "Quitting because of error(s)\n" ) + traceback.print_exc() + sys.exit(1) + + + mainPgm() + + diff --git a/logtools/__init__.py b/logtools/__init__.py index 3795c47..81eb724 100644 --- a/logtools/__init__.py +++ b/logtools/__init__.py @@ -22,11 +22,6 @@ import logging -logging.basicConfig( - level = logging.INFO, - format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -) - from ._config import * from ._filterbots import * from ._flattenjson import * diff --git a/logtools/_config.py b/logtools/_config.py index 9106215..0cfcece 100644 --- 
a/logtools/_config.py +++ b/logtools/_config.py @@ -28,6 +28,8 @@ import os import sys +import logging + from configparser import SafeConfigParser, NoOptionError, NoSectionError __all__ = ['logtools_config', 'interpolate_config', 'AttrDict'] @@ -60,4 +62,24 @@ def interpolate_config(var, section, key, default=None, type=str): raise KeyError("Missing parameter: '{0}'".format(key)) +def setLoglevel(options): + """ Customize logging level, using options dictionnary collected from CLI + """ + if options.logLevSym and options.logLevVal: + print("Flags --sym and --num are exclusive", file = sys.stderr ) + sys.exit(1) + try : + basics ={'format' : "%(asctime)s - %(name)s - %(levelname)s - %(message)s"} + if options.logLevVal: + basics['level'] = options.logLevVal + elif options.logLevSym: + basics['level'] = options.logLevSym + logging.basicConfig(**basics) + + except ValueError as err: + print( f"Bad --sym or --num flag value\n\t{err}", file = sys.stderr) + sys.exit(2) + except Exception as err: + print( f"Unexpected error\n\t{err}", file = sys.stderr) + raise diff --git a/logtools/_filterbots.py b/logtools/_filterbots.py index 455c9b9..1a48678 100644 --- a/logtools/_filterbots.py +++ b/logtools/_filterbots.py @@ -30,12 +30,14 @@ from operator import and_ from optparse import OptionParser -from ._config import logtools_config, interpolate_config, AttrDict +from ._config import logtools_config, interpolate_config, AttrDict, setLoglevel import logtools.parsers __all__ = ['filterbots_parse_args', 'filterbots', 'filterbots_main', 'parse_bots_ua', 'is_bot_ua'] + + def filterbots_parse_args(): usage = "%prog " \ "-u " \ @@ -65,6 +67,15 @@ def filterbots_parse_args(): parser.add_option("-P", "--profile", dest="profile", default='filterbots', help="Configuration profile (section in configuration file)") + + parser.add_option("-s","--sym" , type = str, + dest="logLevSym", + help="logging level (symbol)") + + parser.add_option("-n","--num" , type=int , + dest="logLevVal", + 
help="logging level (value)") + options, args = parser.parse_args() @@ -91,6 +102,9 @@ def filterbots_parse_args(): if options.parser and not options.ip_ua_fields: parser.error("Must supply --ip-ua-fields parameter when using parser-based matching.") + # Set the logging level + setLoglevel(options) + return AttrDict(options.__dict__), args def parse_bots_ua(bots_ua): diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7d73008 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +acora==2.2 +greenlet==1.0.0 +logtools==0.8.7 +prettytable==2.1.0 +python-dateutil==2.8.1 +six==1.15.0 +SQLAlchemy==0.7.9 +sqlsoup==0.9.1 +wcwidth==0.2.5 diff --git a/setup.py b/setup.py index de314e5..2e140c8 100755 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ setup( name = 'logtools', - version = '0.8.7', + version = '0.8.7p3', description = 'Log analysis and filtering tools', author = 'Adam Ever-Hadani', author_email = 'adamhadani@gmail.com', @@ -30,7 +30,7 @@ 'logparse', 'logmerge', 'logjoin', 'urlparse', 'logplot', 'qps', 'filter'], classifiers = [ "Programming Language :: Python", - "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 3.8.6", "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: Apache Software License", "Operating System :: POSIX", @@ -65,9 +65,9 @@ install_requires = [ #"pygooglechart>=0.2.1", # Optional dependency, seems broken for now "prettytable>=0.5", - "sqlalchemy>=0.7.9", - "sqlsoup>=0.9.0", - "acora>=1.7", + "sqlalchemy==0.7.9", + "sqlsoup>=0.9.1", + "acora>=2.2", "python-dateutil>=2.1" ], From 83a0d0a05920c772f7ef4f09e9d0507e3f46f19c Mon Sep 17 00:00:00 2001 From: Alain Lichnewsky Date: Thu, 18 Mar 2021 10:47:21 +0100 Subject: [PATCH 05/57] Checkpoint, adding tests and functionality useful in my own context. Note that doing so uncovers some porting issues in code sections not covered by other tests. 
--- MORE-DOC.md | 70 +++++++++++++++++++++++++++++++++++-- PYTHON3-ADDITIONS.md | 34 ++++++++++++++++++ PYTHON3-README.md | 83 +++++++++++++++++++++++++++++++++++--------- logtools/_parse.py | 17 +++++++-- logtools/parsers.py | 45 ++++++++++++++++++++++-- 5 files changed, 226 insertions(+), 23 deletions(-) create mode 100644 PYTHON3-ADDITIONS.md diff --git a/MORE-DOC.md b/MORE-DOC.md index f0eb18c..a42ff41 100644 --- a/MORE-DOC.md +++ b/MORE-DOC.md @@ -27,6 +27,33 @@ Included here is additional information extracted by reading the code. - supported by `class uWSGIParser` +Examples of log formats are found in the test directory: + +- Common Log Format : + + ̀̀̀ + 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 + 127.0.0.2 - jay [10/Oct/2000:13:56:12 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 + ̀̀̀ + +- uWSGI : + + ̀̀̀ + [pid: 11216|app: 0|req: 2680/5864] 24.218.159.119 () {40 vars in 957 bytes} + [Thu Jun 13 22:29:59 2013] + "GET /my/uri/path/?param_id=52&token=s61048gkje_l001z + => generated 1813 bytes in 11 msecs (HTTP/1.1 200) + 2 headers in 73 bytes (1 switches on core 0)" + + [pid: 11217|app: 0|req: 3064/5865] 10.18.50.145 () {34 vars in 382 bytes} + [Thu Jun 13 22:30:00 2013] + "GET / => generated 8264 bytes in 9 msecs (HTTP/1.1 200) + 2 headers in 73 bytes (1 switches on core 0)" + ̀̀̀ + + + + ### `logtools/_filter.py` Provides `logfilter` script/command. - Filter rows based on blacklists and field matching. @@ -35,9 +62,48 @@ Provides `logfilter` script/command. ### `logtools/_filterbots.py` Provides `filterbots` script/command. - - Filter rows based on blacklists; match IP and User-Agent fields. + - Filter rows based on regular expressions and blacklists; match IP and User-Agent fields. + - `--reverse` permits reverse filtering (show the excluded stuff) - Several parser possibilities - + - flag : `--parser`, + + Feed logs through a parser. 
Useful when reading encoded/escaped formats + (e.g JSON) and when selecting parsed fields rather than matching via + regular expression. + - value is name of parser class, to be `eval`'d (by Python's `eval` function), + and then instantiated + - available parsers are classes in `logtools/parsers.py`, which include: + + JSONParser, + + AccessLog (Apache access_log logfile parser, which can + consume arbitrary Apache log field directives, see + http://httpd.apache.org/docs/1.3/logs.html#accesslog ), + + CommonLogFormat (derived from AccessLog ), specialized to + parse the CLF Format, defined as: + `%h %l %u %t \"%r\" %>s %b` + (see http://httpd.apache.org/docs/1.3/logs.html#accesslog) + + + uWSGIParser + + - bottomline : the parser returns a dict, out of which the fields 'ua' and + 'ip' are extracted (modulo redefinition by flag `-f`) and filtered through + blacklist. + + Any error, including + ' KeyError' cause emission via log of ERROR message `No match for line` + + Lines are filtered out for corresponding to Bots + + other lines are transmitted + + + requirements : + - flags `-f` or `--ip-ua-fields` define field(s) selector for filtering bots + when using a parser . Value format should be + `'ua:,ip:'`. + If one of these is missing, it will not be used for filtering. + + - default : the default is .... 
A regular expression can be specified using `-r` flag + to match IP/useragent fields; groups 'ip' and 'ua' receive special processing + quite similar to what is described above at 'bottomline': + + `ìp`: check whether ip in IP blacklist + + `ua`: + + ### `logtools._flattenjson` Extracts objects (dictionaries) from inside a JSON list; diff --git a/PYTHON3-ADDITIONS.md b/PYTHON3-ADDITIONS.md new file mode 100644 index 0000000..a22295d --- /dev/null +++ b/PYTHON3-ADDITIONS.md @@ -0,0 +1,34 @@ +# Additional features in Python-3 port + +## Note + +These are short notes documenting functions added when porting to Python-3, +still to be considered experimental + +## Intent + +The idea was to make the package more usable in my own context, which concerns +Linux/Ubuntu produced logs. + +Other aspects: + - emphasis on `fr_FR.UTF-8` locale + - most logs are in `RSYSLOG_TraditionalFileFormat` + +### Added functionality + +1. added CLI flags to customize the level of logging; not customizable from + ~/.logtoolsrc (propagates slowly to various entries) + - adds flags `-s` (symbolic designation like ̀-s DEBUG`) `-n` (numerical + like `-n 10`) + + +2. Added RFC 5424 parser `SyslogRFC5424`, here ̀-f`supports symbolic field selection + + ``` + cat testData/tytyRFC.log | testLogparse --parser SyslogRFC5424 -f hostname -s INFO + ``` + + Field names can be found running with flag `-s DEBUG` + +3. Looking at parser for `RSYSLOG_TraditionalFileFormat` + Keep you posted! diff --git a/PYTHON3-README.md b/PYTHON3-README.md index ece71dd..444ec07 100644 --- a/PYTHON3-README.md +++ b/PYTHON3-README.md @@ -34,23 +34,26 @@ Following issues were encountered: ### Added functionality -1. 
added CLI flags to customize the level of logging; not customizable from
-   ~/.logtoolsrc (propagates slowly to various entries)
+   See [PYTHON3-ADDITIONS](./PYTHON3-ADDITIONS.md)
+
 
 ### Test and operative environment
 
-
-  - a `virtualenv`environment has been set up, requiring Python 3.8.6, which happens
-    to be the native Python-3 on my system: `virtualenv -p 3.8.6`
-  - it has been populated according to requirements
-  - installation and use of the package are all performed under this environment
+   This is really setting up the development / maintenance environment:
+
+  - setup a `virtualenv` environment, requiring Python 3.8.6 (or whatever version
+    you want to use; Python 3.8.6 happens to be the native Python-3 on my system,
+    with which all development and tests have been done): `virtualenv -p 3.8.6`
+  - populate it according to [requirements.txt](./requirements.txt)
+  - development and maintenance of the package are all performed under this environment
 
 ### Installation
 
   This may be done as follows:
 
-  - setup the `virtualenv` environment
-  - change directory to the package (where `setup.py` is found)
+  - setup or activate the `virtualenv` environment using the
+    [requirements.txt](./requirements.txt) file.
+  - change directory to the package (where `setup.py` and source code are found)
   - run `setup.py` using the python interpreter in the `virtualenv`
     environment:
@@ -66,14 +69,62 @@ Following issues were encountered:
 
 ### First experiments
 
-  - configuration: see `~/.logtoolsrc`
-
-  - filterbots`:
-
-  ```
+  - configuration:
+    - establish a `~/.logtoolsrc` file, which will be used for setting
+      parameters or defaults
+    - as configuration files are named in `~/.logtoolsrc`, create and populate them;
+      empty files are OK if there is no blacklist:
+```
 touch bots_hosts.txt # File designated in ~/.logtoolsrc
 touch bots_useragents.txt # File designated in ~/.logtoolsrc
-cat /var/log/auth.log | filterbots --print
+  ```
+
+  - `filterbots`:
+    1. 
extract log entries corresponding to some `sudo` uses. Notice that
+      we are using the `-s` flag to define the level of output
+    ```
+gunzip --stdout /var/log/auth.log.*.gz | \
+cat /var/log/auth.log - | \
+filterbots -s ERROR -r ".*sudo:(?P<ua>[^:]+).*COMMAND=(?P<ip>\S+).*" --print
 ```
-
+  - filter :
+
+
+  - `logmerge`
+    1. Merges several logs, sorting on a field defined by delimiter (1 char) and field number:
+       ```
+       logmerge -d'-' -f1 /var/log/auth.log /var/log/syslog | \
+       grep -i upload
+       ```
+
+    2. Use a parser for merging
+       - the following supposes Apache Common Log Format
+         (see http://httpd.apache.org/docs/1.3/logs.html#accesslog), examples shown in
+         [MORE-DOC.md](./MORE-DOC.md) :
+       ```
+       logmerge --parser CommonLogFormat -f4 /var/log/auth.log /var/log/syslog
+       ```
+       - by default `format='%h %l %u %t "%r" %>s %b'`
+
+  - `logparse`
+    1. Parses according to parser format, selects output field
+       (example extracts the date-time):
+       ```
+       cat /tmp/tyty.log | logparse --parser CommonLogFormat -s INFO -f4
+       ```
+
+    2. Same, selects multiple fields, for some reason only 1 line is output
+
+       ```
+       cat /tmp/tyty.log | testLogparse --parser CommonLogFormat -s DEBUG -f1,4
+       ```
+
+    3. 
Added RFC 5424 parser `SyslogRFC5424`, here ̀-f`supports symbolic field selection + + ``` + cat testData/tytyRFC.log | testLogparse --parser SyslogRFC5424 -f hostname -s INFO + ``` + + Field names can be found running with flag `-s DEBUG` + diff --git a/logtools/_parse.py b/logtools/_parse.py index c9ca639..e84ac1c 100644 --- a/logtools/_parse.py +++ b/logtools/_parse.py @@ -28,9 +28,10 @@ import logging from operator import and_ from optparse import OptionParser +from functools import reduce import logtools.parsers -from ._config import interpolate_config, AttrDict +from ._config import interpolate_config, AttrDict, setLoglevel __all__ = ['logparse_parse_args', 'logparse', 'logparse_main'] @@ -51,6 +52,15 @@ def logparse_parse_args(): parser.add_option("-P", "--profile", dest="profile", default='logparse', help="Configuration profile (section in configuration file)") # noqa + parser.add_option("-s","--sym" , type = str, + dest="logLevSym", + help="logging level (symbol)") + + parser.add_option("-n","--num" , type=int , + dest="logLevVal", + help="logging level (value)") + + options, args = parser.parse_args() # Interpolate from configuration @@ -63,6 +73,9 @@ def logparse_parse_args(): options.header = interpolate_config(options.header, options.profile, 'header', default=False, type=bool) + # Set the logging level + setLoglevel(options) + return AttrDict(options.__dict__), args @@ -79,7 +92,7 @@ def logparse(options, args, fh): keyfunc = None keys = None if isinstance(options.field, int) or \ - (isinstance(options.field, basestring) and options.field.isdigit()): + (isinstance(options.field, str) and options.field.isdigit()): # Field given as integer (index) field = int(options.field) - 1 key_func = lambda x: parser(x.strip()).by_index(field, raw=True) diff --git a/logtools/parsers.py b/logtools/parsers.py index e2f7ff5..f00532b 100644 --- a/logtools/parsers.py +++ b/logtools/parsers.py @@ -195,7 +195,7 @@ def parse(self, logline): data[k] = v return data else: - 
raise ValueError("Could not parse log line: '%s'" % logline) + raise ValueError("Could not parse log line: %s" % repr(logline)) def _parse_log_format(self, format): """This code piece is based on the apachelogs @@ -241,8 +241,10 @@ def _parse_log_format(self, format): subpatterns.append(subpattern) _pattern = '^' + ' '.join(subpatterns) + '$' + logging.debug( f"_parse_log_format input format '{format}'") + logging.debug( f"\t\tgenerated rex:{repr(_pattern)} ") + # repr adds escapes so the the regexp can be copied and used _regex = re.compile(_pattern) - return _regex @@ -276,4 +278,41 @@ def parse(self, logline): data[k] = v return data else: - raise ValueError("Could not parse log line: '%s'" % logline) + raise ValueError("Could not parse log line: %s" % repr(logline)) + +# +# Addition to handle Syslog RFC-5424 +# +from syslog_rfc5424_parser import SyslogMessage, ParseError + +class SyslogRFC5424(LogParser): + """ Parser for Syslog RFC-5424 + """ + def __init__(self): + LogParser.__init__(self) + self._logline_wrapper = LogLine() + + def parse(self, logline): + "Parse log line " + data = self._logline_wrapper + + logging.debug( f"Parsing RFC5424 line:{repr(logline)}") + try: + parsed = SyslogMessage.parse(logline) + pdict = parsed.as_dict() + + data.fieldnames = pdict.keys() + data.clear() + for k, v in pdict.items(): + data[k] = v + + logging.debug( f"\tParsed(type(parsed)):{pdict}") + return data + except ParseError as err: + logging.error( f"\tRFC5424 parse error:{err}") + + data.fieldnames = [] + data.clear() + return data + + From ea2459b6c0bef66c60a0770528e82066574144ef Mon Sep 17 00:00:00 2001 From: Alain Lichnewsky Date: Fri, 19 Mar 2021 22:20:00 +0100 Subject: [PATCH 06/57] started testing parser for RSYSLOG_TraditionalFileFormat --- PYTHON3-ADDITIONS.md | 5 +- aux/parseRsyslogd.py | 340 +++++++++++++++++++++++++++++++++++++++++++ logtools/parsers.py | 27 +++- requirements.txt | 2 + 4 files changed, 372 insertions(+), 2 deletions(-) create mode 100755 
aux/parseRsyslogd.py diff --git a/PYTHON3-ADDITIONS.md b/PYTHON3-ADDITIONS.md index a22295d..695e32c 100644 --- a/PYTHON3-ADDITIONS.md +++ b/PYTHON3-ADDITIONS.md @@ -1,3 +1,4 @@ + # Additional features in Python-3 port ## Note @@ -22,7 +23,9 @@ Other aspects: like `-n 10`) -2. Added RFC 5424 parser `SyslogRFC5424`, here ̀-f`supports symbolic field selection +2. Added RFC 5424 parser `SyslogRFC5424`, here ̀-f`supports symbolic field selection. + This addition makes use of package `syslog_rfc5424_parser` from + https://github.com/EasyPost/syslog-rfc5424-parser. ``` cat testData/tytyRFC.log | testLogparse --parser SyslogRFC5424 -f hostname -s INFO diff --git a/aux/parseRsyslogd.py b/aux/parseRsyslogd.py new file mode 100755 index 0000000..b3966da --- /dev/null +++ b/aux/parseRsyslogd.py @@ -0,0 +1,340 @@ +#!/usr/bin/python3 + +import os +import sys +import locale +import argparse +import traceback +import logging + +from collections.abc import Iterable +from functools import reduce +from io import StringIO + +import re + +## For development environment REMOVE ONCE/IF INSTALLED ***************** +## this is normally found in logtools/_config.py + +def setLoglevel(options): + """ Customize logging level, using options dictionnary collected from CLI + """ + if options.logLevSym and options.logLevVal: + print("Flags --sym and --num are exclusive", file = sys.stderr ) + sys.exit(1) + try : + basics ={'format' : "%(asctime)s - %(name)s - %(levelname)s - %(message)s"} + if options.logLevVal: + basics['level'] = options.logLevVal + elif options.logLevSym: + basics['level'] = options.logLevSym + logging.basicConfig(**basics) + + except ValueError as err: + print( f"Bad --sym or --num flag value\n\t{err}", file = sys.stderr) + sys.exit(2) + except Exception as err: + print( f"Unexpected error\n\t{err}", file = sys.stderr) + raise +## ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + +def count(func,iter): + """ Count number elements in iter where func 
returns true + """ + def c(x): + return 0 if not x else 1 + return reduce(int.__add__, map( func, iter), 0) + +# +# These are the templates considered; when extending keep in mind: +# - "FileFormat" must be first. +# - default "TraditionalFileFormat" must be second corresponding to "-n 1" +# +# Make sure to escape backslash ! +# +templateDefs = """ +#$template FileFormat,"%TIMESTAMP:::date-rfc3339% %HOSTNAME% %syslogtag%%msg:::sp-if-no-1st-sp%%msg:::drop-last-lf%\\n" + +#$template TraditionalFileFormat,"%TIMESTAMP% %HOSTNAME% %syslogtag%%msg:::sp-if-no-1st-sp%%msg:::drop-last-lf%\\n" + +#$template ForwardFormat,"<%PRI%>%TIMESTAMP:::date-rfc3339% %HOSTNAME% %syslogtag:1:32%%msg:::sp-if-no-1st-sp%%msg%" + +#$template TraditionalForwardFormat,"<%PRI%>%TIMESTAMP% %HOSTNAME% %syslogtag:1:32%%msg:::sp-if-no-1st-sp%%msg%" + +""" + +templateNotConsidered = """ +#$template StdSQLFormat,"insert into SystemEvents (Message, Facility, FromHost, Priority, DeviceReportedTime, ReceivedAt, InfoUnitID, SysLogTag) values ('%msg%', %syslogfacility%, '%HOSTNAME%', %syslogpriority%, '%timereported:::date-mysql%', '%timegenerated:::date-mysql%', %iut%, '%syslogtag%')",SQL + +#$template jsonRfc5424Template,"{\\"type\\":\\"mytype1\\",\\"host\\":\\"%HOSTNAME%\\",\\"message\\":\\"<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% %APP-NAME% %PROCID% %MSGID% %STRUCTURED-DATA% %msg:::json%\\"}\\n" + +""" + + +def prepareTemplateDict(tempStr): + """ Build tables for selecting rsyslogd parsers by name of template. It is + expected that all parsers corresponding to these names will be made + available based on parser name. This may imply augmenting the parser + selection mechanism in parser.py. + """ + # A template consists of a template directive, a name, the actual template + # text and optional options. + rexs= ( "^(\#?\$?template)\s+", # template directive + "(?P[A-Za-z0-9_@]+)\s*,\s*", # name + "\"(?P