analyzer/windows/modules/auxiliary/html_scraper.py (27 changes: 15 additions & 12 deletions)
@@ -56,24 +56,23 @@ def upload_to_htmldump_folder(file_name: str, content: bytes):

def scrape_html(self):
if not HAVE_SELENIUM:
log.debug("Selenium not installed on machine, not scraping", self.driver_path)
log.warning(f"Selenium not installed on machine, not scraping: {self.driver_path}")
return

if not os.path.isfile(self.driver_path):
log.debug("Web driver not found in path %s, not scraping", self.driver_path)
log.warning(f"Web driver not found in path %s, not scraping: {self.driver_path}")
return

if not hasattr(self.config, "category") or self.config.category != "file":
log.debug("Category is not file, not scraping", self.config.category)
if not hasattr(self.config, "category") or self.config.category not in ("file", "url"):
log.debug(f"Category is neither 'file' nor 'url', not scraping. (Category is {self.config.category})")
return

if not hasattr(self.config, "file_type") or "HTML" not in self.config.file_type:
log.debug("File is not html, not scraping", self.config.category)
if (self.config.category == "file" and
(not hasattr(self.config, "file_type") or "HTML" not in self.config.file_type)):
log.debug(f"File is not html, not scraping (file_type is {self.config.file_type}")
return

try:
file_path = os.path.join(os.environ["TEMP"] + os.sep, str(self.config.file_name))

service = Service(self.driver_path)

# This flag ensures that gecko driver will run without opening a cmd window
@@ -82,29 +81,33 @@ def scrape_html(self):
firefox_options = webdriver.FirefoxOptions()
firefox_options.add_argument("--disable-gpu")
firefox_options.headless = True

self.browser = webdriver.Firefox(options=firefox_options, service=service)
self.browser.set_page_load_timeout(10)

sample_url = "file:///{}".format(os.path.abspath(file_path))
if self.config.category == "file":
file_path = os.path.join(os.environ["TEMP"] + os.sep, str(self.config.file_name))
sample_url = "file:///{}".format(os.path.abspath(file_path))
else:
sample_url = self.config.target

log.debug(f"html_scraper try to scrape {sample_url}")
try:
self.browser.get(sample_url)
time.sleep(self.browser_runtime)
except TimeoutException:
log.warning("Page load timed out")

log.debug("Starting upload")
self.upload_to_htmldump_folder("html_dump.dump", self.browser.page_source.encode())

if not self.browser.current_url.startswith("file://"):
self.upload_to_htmldump_folder("last_url.dump", self.browser.current_url.encode())

log.debug("HTML scraped successfully")
except Exception as e:
log.error(e, exc_info=True)

def run(self):
if not self.enabled:
log.debug("html_scraper RUN rejected because is disabled in config")
return False

self.scrape_html()
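For context: the auxiliary module drives a headless Firefox through geckodriver, loads either the dropped file (as a file:// URL) or the task's target URL, waits for the configured browser runtime, then uploads the rendered page source as html_dump.dump and, for remote targets, the final URL as last_url.dump. Below is a minimal standalone sketch of that Selenium flow; the driver path, target URL, and sleep value are illustrative assumptions, not values read from CAPE's configuration.

import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.service import Service

DRIVER_PATH = "C:\\tools\\geckodriver.exe"   # assumed geckodriver location
SAMPLE_URL = "https://example.com/"          # or "file:///C:/path/to/sample.html"

options = webdriver.FirefoxOptions()
options.add_argument("--disable-gpu")
options.add_argument("--headless")           # run without opening a browser window

browser = webdriver.Firefox(options=options, service=Service(DRIVER_PATH))
browser.set_page_load_timeout(10)            # same timeout the module uses
try:
    browser.get(SAMPLE_URL)
    time.sleep(5)                            # let scripts and redirects settle
except TimeoutException:
    print("Page load timed out")

page_source = browser.page_source            # the module writes this to html_dump.dump
final_url = browser.current_url              # the module writes this to last_url.dump
browser.quit()

One note on the options: recent Selenium releases deprecate the headless attribute in favour of the --headless argument, which is why the sketch uses add_argument; the diff's firefox_options.headless = True relies on the older attribute still being available in the Selenium version the analyzer pins.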
modules/processing/html_scraper.py (10 changes: 6 additions & 4 deletions)
@@ -6,6 +6,7 @@
from typing import Optional

from lib.cuckoo.common.abstracts import Processing
from lib.cuckoo.common.exceptions import CuckooDependencyError

from data.scraper_safe_url_list import safe_url_list

@@ -46,13 +47,14 @@ def force_decode(text: str, max_decode_depth: int) -> Optional[str]:


class HtmlScraper(Processing):
def run(self):
def __init__(self, *args, **kwargs):
self.key = "html_scraper"
if not HAVE_URLEXTRACT:
print("Missed optional dependency: poetry run pip install -r extra/optional_dependencies.txt")
return
raise CuckooDependencyError("Missing dependency 'URLExtract'")
super().__init__(*args, **kwargs)

def run(self):
log.debug("Started html dump processing")
self.key = "html_scraper"

html_dump_path = os.path.join(self.analysis_path, "htmldump", "html_dump.dump")
last_url_path = os.path.join(self.analysis_path, "htmldump", "last_url.dump")
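The processing-side change moves the optional-dependency check out of run() and into the constructor, so a missing URLExtract now raises CuckooDependencyError instead of printing an install hint and silently producing no result. A minimal sketch of that fail-fast pattern together with basic URLExtract usage follows; the DependencyError and HtmlDumpUrls names are hypothetical stand-ins, not CAPE's Processing API.

# Fail fast in __init__ when an optional dependency is missing, rather than
# returning quietly from run(). URLExtract is the real optional package used
# by the processing module; everything else here is an illustrative stand-in.
try:
    from urlextract import URLExtract
    HAVE_URLEXTRACT = True
except ImportError:
    HAVE_URLEXTRACT = False


class DependencyError(Exception):
    """Raised when a module's optional dependency is not installed."""


class HtmlDumpUrls:
    def __init__(self):
        if not HAVE_URLEXTRACT:
            raise DependencyError("Missing dependency 'URLExtract'")
        self.extractor = URLExtract()

    def run(self, html_text: str) -> list:
        # Pull every URL out of the scraped HTML dump text.
        return self.extractor.find_urls(html_text)


if __name__ == "__main__":
    urls = HtmlDumpUrls().run('<a href="https://example.com/payload">click</a>')
    print(urls)  # e.g. ['https://example.com/payload']

Raising from __init__ makes the misconfiguration surface once, loudly, when the module is instantiated, instead of being easy to miss across per-task logs.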