diff --git a/analyzer/windows/modules/auxiliary/html_scraper.py b/analyzer/windows/modules/auxiliary/html_scraper.py
index c6b2de97..296e1fa6 100644
--- a/analyzer/windows/modules/auxiliary/html_scraper.py
+++ b/analyzer/windows/modules/auxiliary/html_scraper.py
@@ -56,24 +56,23 @@ def upload_to_htmldump_folder(file_name: str, content: bytes):
 
     def scrape_html(self):
         if not HAVE_SELENIUM:
-            log.debug("Selenium not installed on machine, not scraping", self.driver_path)
+            log.warning("Selenium not installed on machine, not scraping")
             return
 
         if not os.path.isfile(self.driver_path):
-            log.debug("Web driver not found in path %s, not scraping", self.driver_path)
+            log.warning("Web driver not found in path %s, not scraping", self.driver_path)
             return
 
-        if not hasattr(self.config, "category") or self.config.category != "file":
-            log.debug("Category is not file, not scraping", self.config.category)
+        if not hasattr(self.config, "category") or self.config.category not in ("file", "url"):
+            log.debug("Category %s is neither 'file' nor 'url', not scraping", self.config.category)
             return
 
-        if not hasattr(self.config, "file_type") or "HTML" not in self.config.file_type:
-            log.debug("File is not html, not scraping", self.config.category)
+        if (self.config.category == "file" and
+                (not hasattr(self.config, "file_type") or "HTML" not in self.config.file_type)):
+            log.debug("File is not html, not scraping (file_type is %s)", getattr(self.config, "file_type", None))
             return
 
         try:
-            file_path = os.path.join(os.environ["TEMP"] + os.sep, str(self.config.file_name))
-
             service = Service(self.driver_path)
 
             # This flag ensures that gecko driver will run without opening a cmd window
@@ -82,29 +81,33 @@ def scrape_html(self):
             firefox_options = webdriver.FirefoxOptions()
             firefox_options.add_argument("--disable-gpu")
             firefox_options.headless = True
-
             self.browser = webdriver.Firefox(options=firefox_options, service=service)
             self.browser.set_page_load_timeout(10)
 
-            sample_url = "file:///{}".format(os.path.abspath(file_path))
+            if self.config.category == "file":
+                file_path = os.path.join(os.environ["TEMP"] + os.sep, str(self.config.file_name))
+                sample_url = "file:///{}".format(os.path.abspath(file_path))
+            else:
+                sample_url = self.config.target
+
+            log.debug("html_scraper trying to scrape: %s", sample_url)
             try:
                 self.browser.get(sample_url)
                 time.sleep(self.browser_runtime)
             except TimeoutException:
                 log.warning("Page load timed out")
 
-            log.debug("Starting upload")
             self.upload_to_htmldump_folder("html_dump.dump", self.browser.page_source.encode())
 
             if not self.browser.current_url.startswith("file://"):
                 self.upload_to_htmldump_folder("last_url.dump", self.browser.current_url.encode())
 
-            log.debug("HTML scraped successfully")
         except Exception as e:
             log.error(e, exc_info=True)
 
     def run(self):
         if not self.enabled:
+            log.debug("html_scraper run rejected because it is disabled in config")
             return False
 
         self.scrape_html()
diff --git a/modules/processing/html_scraper.py b/modules/processing/html_scraper.py
index a30fea5e..ef8f57b2 100644
--- a/modules/processing/html_scraper.py
+++ b/modules/processing/html_scraper.py
@@ -6,6 +6,7 @@ from typing import Optional
 
 from lib.cuckoo.common.abstracts import Processing
+from lib.cuckoo.common.exceptions import CuckooDependencyError
 
 from data.scraper_safe_url_list import safe_url_list
 
 
@@ -46,13 +47,14 @@ def force_decode(text: str, max_decode_depth: int) -> Optional[str]:
 
 
 class HtmlScraper(Processing):
-    def run(self):
+    def __init__(self, *args, **kwargs):
+        self.key = "html_scraper"
         if not HAVE_URLEXTRACT:
-            print("Missed optional dependency: poetry run pip install -r extra/optional_dependencies.txt")
-            return
+            raise CuckooDependencyError("Missing dependency 'URLExtract'")
+        super().__init__(*args, **kwargs)
 
+    def run(self):
         log.debug("Started html dump processing")
-        self.key = "html_scraper"
 
         html_dump_path = os.path.join(self.analysis_path, "htmldump", "html_dump.dump")
         last_url_path = os.path.join(self.analysis_path, "htmldump", "last_url.dump")
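
Note for reviewers: the behavioural core of the analyzer-side change is the branch that now derives the URL to load from the task category. Below is a minimal standalone sketch of that logic under the patch's assumptions; the build_sample_url helper and the SimpleNamespace stand-in for self.config are illustrative, not part of the patch:

    import os
    from types import SimpleNamespace

    def build_sample_url(config) -> str:
        # Mirrors the patched scrape_html(): "file" tasks load the dropped
        # sample from %TEMP% via a file:// URL; "url" tasks navigate to the
        # task target directly.
        if config.category == "file":
            file_path = os.path.join(os.environ["TEMP"], str(config.file_name))
            return "file:///{}".format(os.path.abspath(file_path))
        return config.target

    # Example: a "url" task scrapes the original target as-is.
    cfg = SimpleNamespace(category="url", target="http://example.com/landing")
    assert build_sample_url(cfg) == "http://example.com/landing"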
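
The processing-side change moves the URLExtract check from run() into __init__, so a missing optional dependency now surfaces as a CuckooDependencyError when the module is instantiated rather than as a print() during every analysis. A sketch of the resulting behaviour, assuming minimal stubs in place of the real Processing base class and HAVE_URLEXTRACT flag:

    class CuckooDependencyError(Exception):
        """Stand-in for lib.cuckoo.common.exceptions.CuckooDependencyError."""

    HAVE_URLEXTRACT = False  # simulate the optional dependency being absent

    class Processing:
        # Minimal stub of lib.cuckoo.common.abstracts.Processing.
        def __init__(self, *args, **kwargs):
            pass

    class HtmlScraper(Processing):
        def __init__(self, *args, **kwargs):
            self.key = "html_scraper"
            if not HAVE_URLEXTRACT:
                raise CuckooDependencyError("Missing dependency 'URLExtract'")
            super().__init__(*args, **kwargs)

    try:
        HtmlScraper()
    except CuckooDependencyError as exc:
        print("html_scraper disabled:", exc)  # fails fast at instantiation, not mid-analysis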