From bc0b3cd6c889ca257c875222f10ecd92de74dc8b Mon Sep 17 00:00:00 2001 From: Shiplu Mokaddim Date: Thu, 2 Apr 2020 10:58:47 +0200 Subject: [PATCH] multi url and xpath support together --- webquery.py | 90 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/webquery.py b/webquery.py index 935ffee..7ed8150 100644 --- a/webquery.py +++ b/webquery.py @@ -1,7 +1,9 @@ import os import sys import hashlib -import urllib.request, urllib.parse, urllib.error +import urllib.request +import urllib.parse +import urllib.error import argparse from datetime import datetime, timedelta from lxml import etree @@ -50,62 +52,88 @@ def urlcontent(url): if (current_dt - file_creation_dt) > timedelta(hours=3): req = urllib.request.Request(url, headers={'User-Agent': config.USER_AGENT}) content = urllib.request.urlopen(req).read().decode(encoding='utf-8') - with open(cachefile, 'w') as f: + with open(cachefile, 'wt') as f: f.write(content) else: - with open(cachefile, 'r') as f: + with open(cachefile, 'rt') as f: content = f.read() return content +def parse(urls, xpaths): + """parse all the urls with xpaths + + Args: + urls (iterator): an iterator of urls + xpaths (iterator): an iterator of xpaths + + Returns: + dict: result dictionary. Format looks like + {URL: {XPATH: [match]}} + """ + result = {} + for url in urls: + result[url] = {} + root = etree.HTML(urlcontent(url), base_url=url) + for xpath in xpaths: + result[url][xpath] = [] + for match in root.xpath(xpath): + result[url][xpath].append(match) + return result + + def main(): cmd_args = cmdline_args() - root = etree.HTML(urlcontent(cmd_args.url), base_url=cmd_args.url) - if cmd_args.table: - tbl = prettytable.PrettyTable() - tbl.valign = 't' - tbl.align = 'l' + for url in cmd_args.url: + root = etree.HTML(urlcontent(url), base_url=url) + tbl = prettytable.PrettyTable() + tbl.valign = 't' + tbl.align = 'l' - cols = [] - for xpath in cmd_args.xpath: - cols.append((xpath, [str(el).strip() for el in root.xpath(xpath)])) + cols = [] + for xpath in cmd_args.xpath: + cols.append((xpath, [str(el).strip() for el in root.xpath(xpath)])) - log("cols: %s" % cols) - max_row_len = max([len(matches) for col, matches in cols]) - log("max len = %s" % max_row_len) - tbl.add_column("SL", list(range(1, max_row_len + 1))) + log("cols: %s" % cols) + max_row_len = max([len(matches) for col, matches in cols]) + log("max len = %s" % max_row_len) + tbl.add_column("SL", list(range(1, max_row_len + 1))) - xpathmap = dict([(xpath, "%s-%02d" % (NAME, idx)) for (idx, xpath) in enumerate(cmd_args.xpath, 1)]) + xpathmap = dict([(xpath, "%s-%02d" % (NAME, idx)) for (idx, xpath) in enumerate(cmd_args.xpath, 1)]) - for xpath, matches in cols: - if len(matches) < max_row_len: - padded_list = [""] * (max_row_len - len(matches)) - matches.extend(padded_list) - tbl.add_column(xpathmap[xpath], matches, align='l') + for xpath, matches in cols: + if len(matches) < max_row_len: + padded_list = [""] * (max_row_len - len(matches)) + matches.extend(padded_list) + tbl.add_column(xpathmap[xpath], matches, align='l') - print(tbl) + print(tbl) - print(("%s Ref.:" % NAME)) - for xpath, name in sorted(list(xpathmap.items()), key=lambda u: u[1]): - print(("%2s - %s" % (name, xpath))) + print(("%s Ref.:" % NAME)) + for xpath, name in sorted(list(xpathmap.items()), key=lambda u: u[1]): + print(("%2s - %s" % (name, xpath))) else: - for xpath in cmd_args.xpath: - print("Xpath: %s" % xpath) - for position_match in enumerate(root.xpath(xpath), 1): - print("%3s %s" % (position_match)) + for url_pos, (url, values) in enumerate(parse(cmd_args.url, cmd_args.xpath).items(), 1): + print("%3s %s" % (url_pos, url)) + for xpath_pos, (xpath, matches) in enumerate(values.items(), 1): + print("\t%3s %s" % (xpath_pos, xpath)) + for pos_val in enumerate(matches, 1): + print("\t\t%3s %s" % (pos_val)) def cmdline_args(): parser = argparse.ArgumentParser() parser.add_argument('-t', '--table', action='store_true', default=False, help='Shows matches in a table') - parser.add_argument('url', help="url to query") - parser.add_argument('xpath', nargs='+', help='xpath to apply') + parser.add_argument('-u', '--url', nargs='+', required=True, + help="url to query") + parser.add_argument('-x', '--xpath', nargs='+', required=True, + help='xpath to apply') return parser.parse_args()