diff --git a/LICENSE b/LICENSE index 37aaa8a..612b3e8 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License Copyright (c) 2020 homoluctus +Copyright (c) 2022 Dominik George Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/maillogger/analyze.py b/maillogger/analyze.py new file mode 100644 index 0000000..4e7df3a --- /dev/null +++ b/maillogger/analyze.py @@ -0,0 +1,78 @@ +from dataclasses import asdict, dataclass, field +from typing import Dict, List + +from maillogger.parser import ParseResultType, ParseResultTo + +GroupedResultType = Dict[str, List[ParseResultType]] +AggregateResultType = List[Dict[str, str]] + + +def group_by_mail_id(results: List[ParseResultType]) -> GroupedResultType: + """Group a list of parse results into a dict by mail_id + + Args: + results (List[ParseResultType]): List of parse results as dicts + + Returns: + GroupedResultType: return a dictionary with the mail_id as key + + { + '677RGS0': [ + {...}, {...} + ] + } + """ + + groups = {} + + for result in results: + groups.setdefault(result['mail_id'], []).append(result) + + return groups + + +def aggregate(groups: GroupedResultType) -> AggregateResultType: + """Aggregate all results per mail id into one item + + Args: + groups (GroupedResultType): dict of lists grouped by mail id + + Returns: + AggregateResultType: dictionary with one dict keyed by mail id + """ + + aggregates = {} + + for mail_id, records in groups.items(): + for record in records: + aggregates.setdefault(mail_id, AggregateResult(mail_id)).update(record) + + return [aggregate.to_dict() for aggregate in aggregates.values()] + + +@dataclass +class AggregateResult: + mail_id: str + + from_address: str = '' + to_addresses: List[str] = field(default_factory=list) + + size: str = '0' + + def to_dict(self) -> ParseResultType: + return asdict(self) + + def update(self, record: ParseResultType) -> None: + if not self.mail_id: + self.mail_id = record["mail_id"] + elif record["mail_id"] != self.mail_id: + raise ValueError("Trying to aggregate different mail ids!") + + if "from_address" in record: + self.from_address = record["from_address"] + + if "size" in record: + self.size = record["size"] + + if "to_address" in record: + self.to_addresses.append(record["to_address"]) diff --git a/maillogger/cli.py b/maillogger/cli.py index 1e30204..c8ca45a 100644 --- a/maillogger/cli.py +++ b/maillogger/cli.py @@ -33,6 +33,18 @@ def setup_options(parser: argparse.ArgumentParser) -> None: help='File data format to write the parsed maillog (Default: csv)' ) + parser.add_argument( + '-g', '--group', + action='store_true', + help='Group results by mail id (only available in JSON format)' + ) + + parser.add_argument( + '-a', '--aggregate', + action='store_true', + help='Aggregate results by mail id' + ) + parser.add_argument( '-c', '--compress', action='store_true', @@ -50,4 +62,9 @@ def setup_options(parser: argparse.ArgumentParser) -> None: def parse_options(args: Optional[List[str]] = None) -> argparse.Namespace: parser = get_parser() setup_options(parser) - return parser.parse_args(args=args) + + options = parser.parse_args(args=args) + if options.group and options.fmt in ("csv", "tsv"): + parser.error("Grouping by mail id can only be used with JSON output.") + + return options diff --git a/maillogger/file/writer.py b/maillogger/file/writer.py index 36d0919..464c1d3 100644 --- a/maillogger/file/writer.py +++ b/maillogger/file/writer.py @@ -1,12 +1,15 @@ import csv import json from dataclasses import dataclass -from typing import Any, ClassVar, List, Type +from typing import Any, ClassVar, Dict, List, Type, Union +from maillogger.analyze import AggregateResultType, GroupedResultType from maillogger.exceptions import UnsupportedDataFormatError from maillogger.file.base import FileHandler from maillogger.parser import ParseResultType +OutputResultType = Union[GroupedResultType, List[ParseResultType], AggregateResultType] + @dataclass class FileWriter(FileHandler): @@ -32,7 +35,7 @@ def add_file_ext(self) -> None: ext = f'{self.ext}.{self.gz_ext}' self.filepath = f'{self.filepath}.{ext}' - def handle(self, records: List[ParseResultType]) -> None: + def handle(self, records: OutputResultType) -> None: if not records: return @@ -51,7 +54,7 @@ class CsvWriter(FileWriter): newline: str = '' - def write(self, records: List[ParseResultType]) -> None: + def write(self, records: OutputResultType) -> None: writer = csv.DictWriter(self.fd, fieldnames=list(records[0].keys())) writer.writeheader() writer.writerows(records) @@ -64,7 +67,7 @@ class JsonWriter(FileWriter): ensure_ascii: bool = False indent: int = 2 - def write(self, records: List[ParseResultType]) -> None: + def write(self, records: OutputResultType) -> None: json.dump( records, self.fd, # type: ignore @@ -76,7 +79,7 @@ def write(self, records: List[ParseResultType]) -> None: class TsvWriter(FileWriter): ext = 'tsv' - def write(self, records: List[ParseResultType]) -> None: + def write(self, records: OutputResultType) -> None: header = '\t'.join(records[0].keys()) self.fd.write(f'{header}\n') # type: ignore @@ -101,7 +104,7 @@ def get_writer(filepath: str, fmt: str, **kwargs: Any) -> Type[FileWriter]: def write( filepath: str, - records: List[ParseResultType], + records: OutputResultType, fmt: str, **kwargs: Any) -> None: fmt = fmt.lower() diff --git a/maillogger/main.py b/maillogger/main.py index 5ec960d..f9c21b9 100644 --- a/maillogger/main.py +++ b/maillogger/main.py @@ -1,3 +1,4 @@ +from maillogger.analyze import aggregate, group_by_mail_id from maillogger.cli import parse_options from maillogger.file.loader import Loader from maillogger.file.writer import write @@ -10,12 +11,23 @@ def main() -> None: loader = Loader(options.source_file) contents = loader.handle() + parse_to = True + parse_from = True + if options.fmt in ("csv", "tsv"): + parse_from = False + parsed_contents = [] for c in contents: - result = parse(c) + result = parse(c, parse_to, parse_from) if result: parsed_contents.append(result) + if options.group or options.aggregate: + parsed_contents = group_by_mail_id(parsed_contents) + + if options.aggregate: + parsed_contents = aggregate(parsed_contents) + write( filepath=options.target_file, records=parsed_contents, fmt=options.fmt, compress=options.compress) diff --git a/maillogger/parser.py b/maillogger/parser.py index f423b12..040b70a 100644 --- a/maillogger/parser.py +++ b/maillogger/parser.py @@ -3,26 +3,34 @@ from datetime import datetime from typing import Dict, Optional +REGEX_PREFIX = r'(?P[A-Z][a-z]{2}) +(?P[0-9]{,2}) ' \ + + r'(?P