Add rudimentary keyword search for CCPA policies using HTML file (#5)

objorkman wants to merge 2 commits into blues-lab:master from objorkman's branch
Conversation
polipy/extractors.py
Outdated
def extract_text(url_type, url=None, dynamic_source=None, static_source=None, **kwargs):
    if url_type is None or url_type in ['html', 'other']:
def extract_text(url_type, url=None, dynamic_source=None, static_source=None, html_file=None, **kwargs):
There was a problem hiding this comment.
Looks good! I would do it this way so you can save the extracted keywords separately, as a separate extractor:
def extract(extractor, **kwargs):
    """Dispatch to the requested content extractor.

    Parameters
    ----------
    extractor : str
        Either 'text' (extract the full policy text) or 'keywords'
        (run the CCPA keyword scan over a saved HTML file).
    **kwargs
        Forwarded to the underlying extractor. The 'keywords' extractor
        requires a non-None 'html_file' entry.

    Returns
    -------
    The extracted content, or None when no extractor matched (unknown
    extractor name, or 'keywords' without an 'html_file').
    """
    content = None
    if extractor == 'text':
        content = extract_text(**kwargs)
    # .get() replaces the double lookup `'html_file' in kwargs and
    # kwargs['html_file'] is not None` with a single, equivalent test.
    elif extractor == 'keywords' and kwargs.get('html_file') is not None:
        content = extract_ccpa_info(kwargs['html_file'])
    return content
def extract_text(url_type, url=None, dynamic_source=None, static_source=None, **kwargs):
    if url_type is None or url_type in ['html', 'other']:
        content = extract_html(dynamic_source, url)
    elif url_type == 'pdf':
        content = extract_pdf(static_source)
    elif url_type == 'plain':
        content = dynamic_source
    else:
        content = dynamic_source
    return content

Maybe take a look and see if that achieves the same functionality? (apart from not saving it under the "text" extractor).
polipy/extractors.py
Outdated
# if w in text:
#     substring += w + ','
result += substring + '\n'
print(result)
There was a problem hiding this comment.
Don't forget to remove the print statements at some later point.
polipy/polipy.py
Outdated
from .constants import UTC_DATE, CWD
from .exceptions import NetworkIOException, ParserException
from .logger import get_logger
from bs4 import BeautifulSoup

self.url['url'] = url
self.url = self.url | parse_url(url)
self.url['domain'] = self.url['domain'].strip().strip('.').strip('/')
self.html_file = html_file
There was a problem hiding this comment.
Yeah, I think this is fine for now. Maybe ultimately we could merge self.html_file more cleanly into the Polipy object, as that's basically the exact same attribute as self.source['dynamic_html'] after the scraping part. But otherwise it's good to have the option of either providing a URL (and then scraping) or the already-scraped HTML to the constructor.
Output is currently printed and stored as a string