From 27467eb9e7a601ad95a751f3a80704506d60419b Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 15:02:36 +0100 Subject: [PATCH 01/14] files deduplication --- lib/cuckoo/common/cleaners_utils.py | 71 ++++++++++++++++++- .../common/integrations/file_extra_info.py | 52 ++++++++++---- modules/processing/CAPE.py | 27 ++++++- 3 files changed, 132 insertions(+), 18 deletions(-) diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py index 14953df9bf6..ed45c07d0f4 100644 --- a/lib/cuckoo/common/cleaners_utils.py +++ b/lib/cuckoo/common/cleaners_utils.py @@ -125,6 +125,8 @@ def free_space_monitor(path=False, return_value=False, processing=False, analysi cleanup_dict["delete_older_than"] = config.cleaner.analysis if config.cleaner.unused_files_in_mongodb: cleanup_dict["delete_unused_file_data_in_mongo"] = 1 + if config.cleaner.get("files"): + cleanup_dict["delete_files_items_older_than"] = config.cleaner.get("files") need_space, space_available = False, 0 # Calculate the free disk space in megabytes. @@ -683,6 +685,68 @@ def cape_clean_tlp(): delete_bulk_tasks_n_folders(tlp_tasks, False) +def files_clean_before(timerange: str): + """ + Clean up files in storage/files that are not referenced by any analysis + and are older than the specified time range. + """ + older_than = convert_into_time(timerange) + files_folder = os.path.join(CUCKOO_ROOT, "storage", "files") + analyses_folder = os.path.join(CUCKOO_ROOT, "storage", "analyses") + + if not path_exists(files_folder): + return + + # 1. Build set of referenced hashes + referenced = set() + used_mongo = False + + if is_reporting_db_connected() and repconf.mongodb.enabled and "mongo_find" in globals(): + try: + # Query all _id (SHA256) from files collection + cursor = mongo_find("files", {}, {"_id": 1}) + for doc in cursor: + referenced.add(doc["_id"]) + used_mongo = True + log.info("Loaded %d referenced files from MongoDB", len(referenced)) + except Exception as e: + log.error("Failed to query MongoDB for files: %s. Falling back to filesystem scan.", e) + + if not used_mongo and path_exists(analyses_folder): + log.info("Scanning analysis folders for file references...") + with os.scandir(analyses_folder) as it: + for entry in it: + if not entry.is_dir(): + continue + selfextracted = os.path.join(entry.path, "selfextracted") + if path_exists(selfextracted): + with os.scandir(selfextracted) as se_it: + for se_entry in se_it: + if se_entry.is_symlink(): + try: + target = os.readlink(se_entry.path) + # Check if it points to storage/files + if os.path.abspath(target).startswith(os.path.abspath(files_folder)): + referenced.add(os.path.basename(target)) + except OSError: + pass + + # 2. Iterate storage/files and clean + for _, _, filenames in os.walk(files_folder): + for sha256 in filenames: + if sha256 in referenced: + continue + + file_path = os.path.join(files_folder, sha256) + try: + st_ctime = path_get_date(file_path) + # Correct logic: delete if OLDER than limit (<) + if datetime.fromtimestamp(st_ctime) < older_than: + path_delete(file_path) + except Exception as e: + log.warning("Error checking/deleting file %s: %s", file_path, e) + + def binaries_clean_before(timerange: str): # In case if "delete_bin_copy = off" we might need to clean binaries # find storage/binaries/ -name "*" -type f -mtime 5 -delete @@ -783,11 +847,14 @@ def execute_cleanup(args: dict, init_log=True): if args.get("delete_tmp_items_older_than"): tmp_clean_before(args["delete_tmp_items_older_than"]) + if args.get("delete_unused_file_data_in_mongo"): + delete_unused_file_data_in_mongo() + if args.get("delete_binaries_items_older_than"): binaries_clean_before(args["delete_binaries_items_older_than"]) - if args.get("delete_unused_file_data_in_mongo"): - delete_unused_file_data_in_mongo() + if args.get("delete_files_items_older_than"): + files_clean_before(args["delete_files_items_older_than"]) if args.get("cleanup_files_collection_by_id"): cleanup_files_collection_by_id(args["cleanup_files_collection_by_id"]) diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index 4d164c1ecf5..c04a323998f 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -385,22 +385,44 @@ def _extracted_files_metadata( file_info["path"] = dest_path file_info["guest_paths"] = [file_info["name"]] file_info["name"] = os.path.basename(dest_path) + # Define the new central storage for all files (extracted, dropped, etc.) + files_storage_dir = os.path.join(CUCKOO_ROOT, "storage", "files") + master_file_path = os.path.join(files_storage_dir, file_info["sha256"]) + + # 1. Ensure file is in central storage + if not path_exists(master_file_path): + path_mkdir(files_storage_dir, exist_ok=True) + shutil.move(full_path, master_file_path) + elif path_exists(full_path): + # We already have it, delete the temp duplicate + path_delete(full_path) + + # 2. Create symlink in analysis folder (or copy if link fails) if not path_exists(dest_path): - shutil.move(full_path, dest_path) - print( - json.dumps( - { - "path": os.path.join("files", file_info["sha256"]), - "filepath": file_info["name"], - "pids": [], - "ppids": [], - "metadata": "", - "category": "files", - }, - ensure_ascii=False, - ), - file=f, - ) + try: + if hasattr(os, "symlink"): + os.symlink(master_file_path, dest_path) + else: + shutil.copy(master_file_path, dest_path) + except OSError: + # Fallback to copy on error + shutil.copy(master_file_path, dest_path) + + # Update files.json for UI/Reporting to correctly reference the symlinked file + print( + json.dumps( + { + "path": os.path.join("selfextracted", file_info["sha256"]), + "filepath": file_info["name"], + "pids": [], + "ppids": [], + "metadata": "", + "category": "selfextracted", + }, + ensure_ascii=False, + ), + file=f, + ) file_info["data"] = is_text_file(file_info, destination_folder, processing_conf.CAPE.buffer) metadata.append(file_info) diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py index cc4671eed89..648777fcb18 100644 --- a/modules/processing/CAPE.py +++ b/modules/processing/CAPE.py @@ -16,6 +16,7 @@ import json import logging import os +import shutil import timeit from contextlib import suppress from pathlib import Path @@ -23,9 +24,10 @@ from lib.cuckoo.common.abstracts import Processing from lib.cuckoo.common.cape_utils import cape_name_from_yara, is_duplicated_binary, pe_map, static_config_parsers from lib.cuckoo.common.config import Config +from lib.cuckoo.common.constants import CUCKOO_ROOT from lib.cuckoo.common.integrations.file_extra_info import DuplicatesType, static_file_info from lib.cuckoo.common.objects import File -from lib.cuckoo.common.path_utils import path_exists +from lib.cuckoo.common.path_utils import path_exists, path_mkdir from lib.cuckoo.common.replace_patterns_utils import _clean_path from lib.cuckoo.common.utils import ( add_family_detection, @@ -175,6 +177,29 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, f = File(file_path, metadata.get("metadata", "")) sha256 = f.get_sha256() + # Deduplicate dropped, procdump, CAPE, and package files to storage/files + if category in ("dropped", "procdump", "CAPE", "package", "procmemory") and not os.path.islink(file_path): + try: + files_storage_dir = os.path.join(CUCKOO_ROOT, "storage", "files") + master_path = os.path.join(files_storage_dir, sha256) + + if not path_exists(master_path): + path_mkdir(files_storage_dir, exist_ok=True) + # Move file + shutil.move(file_path, master_path) + else: + # Already exists, delete duplicate + os.remove(file_path) + + # Link back + if hasattr(os, "symlink"): + os.symlink(master_path, file_path) + else: + shutil.copy(master_path, file_path) + + except Exception as e: + log.error("Deduplication failed for %s: %s", file_path, e) + if sha256 in duplicated["sha256"]: log.debug("Skipping file that has already been processed: %s", sha256) return From 655677fbc0808bc4c9422e3588151c36d821875e Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 15:11:09 +0100 Subject: [PATCH 02/14] Update modules/processing/CAPE.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- modules/processing/CAPE.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py index 648777fcb18..4ddfe6c2872 100644 --- a/modules/processing/CAPE.py +++ b/modules/processing/CAPE.py @@ -192,9 +192,9 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, os.remove(file_path) # Link back - if hasattr(os, "symlink"): + try: os.symlink(master_path, file_path) - else: + except (OSError, AttributeError): shutil.copy(master_path, file_path) except Exception as e: From 170b0042641998064141cd6582a81d43843f956d Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 15:11:22 +0100 Subject: [PATCH 03/14] Update lib/cuckoo/common/cleaners_utils.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- lib/cuckoo/common/cleaners_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py index ed45c07d0f4..a132b670c13 100644 --- a/lib/cuckoo/common/cleaners_utils.py +++ b/lib/cuckoo/common/cleaners_utils.py @@ -732,12 +732,12 @@ def files_clean_before(timerange: str): pass # 2. Iterate storage/files and clean - for _, _, filenames in os.walk(files_folder): + for root, _, filenames in os.walk(files_folder): for sha256 in filenames: if sha256 in referenced: continue - file_path = os.path.join(files_folder, sha256) + file_path = os.path.join(root, sha256) try: st_ctime = path_get_date(file_path) # Correct logic: delete if OLDER than limit (<) From 8eb704ef2606f1923f7f44717d863369eae3b70f Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 15:48:41 +0100 Subject: [PATCH 04/14] Update file_extra_info.py --- lib/cuckoo/common/integrations/file_extra_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index c04a323998f..67b1b7bfad1 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -407,7 +407,7 @@ def _extracted_files_metadata( except OSError: # Fallback to copy on error shutil.copy(master_file_path, dest_path) - + # Update files.json for UI/Reporting to correctly reference the symlinked file print( json.dumps( From dc0b4f123c48d30d044bdf8051fd3fc51f507345 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 16:18:16 +0100 Subject: [PATCH 05/14] Implement sharded storage for files by SHA256 Introduced get_files_storage_path to shard files in storage/files by SHA256 prefix, reducing the number of files per directory. Updated file deduplication and cleaning logic in CAPE processing, file_extra_info integration, and cleaners_utils to use the new sharded path structure and to remove empty directories after file deletion. --- lib/cuckoo/common/cleaners_utils.py | 10 +++++++++- lib/cuckoo/common/integrations/file_extra_info.py | 6 +++--- lib/cuckoo/common/utils.py | 12 ++++++++++++ modules/processing/CAPE.py | 5 +++-- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py index a132b670c13..ef0c73546cc 100644 --- a/lib/cuckoo/common/cleaners_utils.py +++ b/lib/cuckoo/common/cleaners_utils.py @@ -732,7 +732,7 @@ def files_clean_before(timerange: str): pass # 2. Iterate storage/files and clean - for root, _, filenames in os.walk(files_folder): + for root, _, filenames in os.walk(files_folder, topdown=False): for sha256 in filenames: if sha256 in referenced: continue @@ -745,6 +745,14 @@ def files_clean_before(timerange: str): path_delete(file_path) except Exception as e: log.warning("Error checking/deleting file %s: %s", file_path, e) + + # Try to remove empty directories (except the root files_folder) + if root != files_folder: + try: + os.rmdir(root) + except OSError: + # Directory not empty or other error + pass def binaries_clean_before(timerange: str): diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index 67b1b7bfad1..c690eb4b071 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -45,7 +45,7 @@ path_read_file, path_write_file, ) -from lib.cuckoo.common.utils import get_options, is_text_file +from lib.cuckoo.common.utils import get_files_storage_path, get_options, is_text_file try: from sflock import unpack @@ -386,8 +386,8 @@ def _extracted_files_metadata( file_info["guest_paths"] = [file_info["name"]] file_info["name"] = os.path.basename(dest_path) # Define the new central storage for all files (extracted, dropped, etc.) - files_storage_dir = os.path.join(CUCKOO_ROOT, "storage", "files") - master_file_path = os.path.join(files_storage_dir, file_info["sha256"]) + master_file_path = get_files_storage_path(file_info["sha256"]) + files_storage_dir = os.path.dirname(master_file_path) # 1. Ensure file is in central storage if not path_exists(master_file_path): diff --git a/lib/cuckoo/common/utils.py b/lib/cuckoo/common/utils.py index 3f2e67dd1bb..9747832e907 100644 --- a/lib/cuckoo/common/utils.py +++ b/lib/cuckoo/common/utils.py @@ -185,6 +185,18 @@ def get_memdump_path(memdump_id, analysis_folder=False): ) +def get_files_storage_path(sha256: str) -> str: + """ + Get the path to the storage/files directory for a given SHA256. + Uses sharding (e.g., storage/files/ab/cd/abcdef...) to avoid + too many files in a single directory. + """ + if not sha256 or len(sha256) < 4: + return os.path.join(CUCKOO_ROOT, "storage", "files", sha256) + + return os.path.join(CUCKOO_ROOT, "storage", "files", sha256[:2], sha256[2:4], sha256) + + def validate_referrer(url): if not url: return None diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py index 4ddfe6c2872..fa580d31962 100644 --- a/modules/processing/CAPE.py +++ b/modules/processing/CAPE.py @@ -33,6 +33,7 @@ add_family_detection, convert_to_printable_and_truncate, get_clamav_consensus, + get_files_storage_path, make_bytes, texttypes, wide2str, @@ -180,8 +181,8 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, # Deduplicate dropped, procdump, CAPE, and package files to storage/files if category in ("dropped", "procdump", "CAPE", "package", "procmemory") and not os.path.islink(file_path): try: - files_storage_dir = os.path.join(CUCKOO_ROOT, "storage", "files") - master_path = os.path.join(files_storage_dir, sha256) + master_path = get_files_storage_path(sha256) + files_storage_dir = os.path.dirname(master_path) if not path_exists(master_path): path_mkdir(files_storage_dir, exist_ok=True) From 9034dddf715fb5e1c30313a78ffd33234234a9c4 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 16:29:53 +0100 Subject: [PATCH 06/14] fix --- lib/cuckoo/common/integrations/file_extra_info.py | 5 +---- modules/processing/CAPE.py | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index c690eb4b071..c8f6d22421b 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -400,10 +400,7 @@ def _extracted_files_metadata( # 2. Create symlink in analysis folder (or copy if link fails) if not path_exists(dest_path): try: - if hasattr(os, "symlink"): - os.symlink(master_file_path, dest_path) - else: - shutil.copy(master_file_path, dest_path) + os.symlink(master_file_path, dest_path) except OSError: # Fallback to copy on error shutil.copy(master_file_path, dest_path) diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py index fa580d31962..354490c5e4e 100644 --- a/modules/processing/CAPE.py +++ b/modules/processing/CAPE.py @@ -24,7 +24,6 @@ from lib.cuckoo.common.abstracts import Processing from lib.cuckoo.common.cape_utils import cape_name_from_yara, is_duplicated_binary, pe_map, static_config_parsers from lib.cuckoo.common.config import Config -from lib.cuckoo.common.constants import CUCKOO_ROOT from lib.cuckoo.common.integrations.file_extra_info import DuplicatesType, static_file_info from lib.cuckoo.common.objects import File from lib.cuckoo.common.path_utils import path_exists, path_mkdir From 0d0ce869aae288a832eb48bc585899ea0d2f0a1f Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 16:32:40 +0100 Subject: [PATCH 07/14] Update cleaners_utils.py --- lib/cuckoo/common/cleaners_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py index ef0c73546cc..0472bf018a8 100644 --- a/lib/cuckoo/common/cleaners_utils.py +++ b/lib/cuckoo/common/cleaners_utils.py @@ -745,7 +745,7 @@ def files_clean_before(timerange: str): path_delete(file_path) except Exception as e: log.warning("Error checking/deleting file %s: %s", file_path, e) - + # Try to remove empty directories (except the root files_folder) if root != files_folder: try: From bfa215e1cbc2e07ce2019596dcecba3e7b99ccc0 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 18:27:30 +0100 Subject: [PATCH 08/14] Update dist.py --- utils/dist.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/utils/dist.py b/utils/dist.py index f319bdc2055..8212708dcdc 100644 --- a/utils/dist.py +++ b/utils/dist.py @@ -41,7 +41,7 @@ from lib.cuckoo.common.dist_db import ExitNodes, Machine, Node, Task, create_session from lib.cuckoo.common.path_utils import path_delete, path_exists, path_get_size, path_mkdir, path_mount_point, path_write_file from lib.cuckoo.common.socket_utils import send_socket_command -from lib.cuckoo.common.utils import get_options +from lib.cuckoo.common.utils import get_files_storage_path, get_options from lib.cuckoo.core.database import ( TASK_BANNED, TASK_DISTRIBUTED, @@ -304,6 +304,72 @@ def node_get_report_nfs(task_id, worker_name, main_task_id) -> bool: return True +def sync_sharded_files_nfs(worker_name, main_task_id): + """ + Synchronize deduplicated files from worker to master using sharded storage. + """ + analysis_path = os.path.join(CUCKOO_ROOT, "storage", "analyses", str(main_task_id)) + files_json_path = os.path.join(analysis_path, "files.json") + + if not path_exists(files_json_path): + return + + try: + with open(files_json_path, "r") as f: + for line in f: + try: + entry = json.loads(line) + rel_path = entry.get("path") + if not rel_path or "selfextracted" not in rel_path: + continue + + # Extract SHA256 from path (e.g. selfextracted/SHA256) + sha256 = os.path.basename(rel_path) + if len(sha256) != 64: + continue + + # Master destination (sharded) + master_dest = get_files_storage_path(sha256) + + # If missing on master, fetch from worker + if not path_exists(master_dest): + worker_mount = os.path.join(CUCKOO_ROOT, dist_conf.NFS.mount_folder, str(worker_name)) + # Construct worker source path (sharded) manually relative to mount + shard_rel = os.path.join("storage", "files", sha256[:2], sha256[2:4], sha256) + worker_src = os.path.join(worker_mount, shard_rel) + + if path_exists(worker_src): + path_mkdir(os.path.dirname(master_dest), exist_ok=True) + shutil.copy2(worker_src, master_dest) + else: + # Fallback check for flat structure on worker (migration support) + flat_worker_src = os.path.join(worker_mount, "storage", "files", sha256) + if path_exists(flat_worker_src): + path_mkdir(os.path.dirname(master_dest), exist_ok=True) + shutil.copy2(flat_worker_src, master_dest) + + # Ensure symlink in analysis folder is correct + link_path = os.path.join(analysis_path, rel_path) + + # If it's a broken link or doesn't exist or is a full file (we want link) + if path_exists(master_dest): + if os.path.islink(link_path): + # Check if it points to the right place? + # For now, simpler to re-link if we want to enforce local storage path + os.remove(link_path) + elif path_exists(link_path): + # It's a file, replace with link to save space + path_delete(link_path) + + path_mkdir(os.path.dirname(link_path), exist_ok=True) + os.symlink(master_dest, link_path) + + except (json.JSONDecodeError, OSError) as e: + log.error("Error syncing file for task %s: %s", main_task_id, e) + except Exception as e: + log.exception("Failed to sync sharded files for task %s: %s", main_task_id, e) + + def _delete_many(node, ids, nodes, db): """ Deletes multiple tasks from a specified node if the node is not the main server. @@ -955,6 +1021,8 @@ def fetch_latest_reports_nfs(self): t.main_task_id, ) + sync_sharded_files_nfs(node.name, t.main_task_id) + # this doesn't exist for some reason if path_exists(t.path): sample_sha256 = None From 8d8d511ca9e53d0b7824ca6a7fa3ec4f9b34fcb4 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 18:58:37 +0100 Subject: [PATCH 09/14] Update cleaners_utils.py --- lib/cuckoo/common/cleaners_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py index 0472bf018a8..6e5b8022ef7 100644 --- a/lib/cuckoo/common/cleaners_utils.py +++ b/lib/cuckoo/common/cleaners_utils.py @@ -351,7 +351,6 @@ def cuckoo_clean_failed_tasks(): # This need to init a console logger handler, because the standard # logger (init_logging()) logs to a file which will be deleted. create_structure() - # ToDo multi status tasks_list = db.list_tasks(status=f"{TASK_FAILED_ANALYSIS}|{TASK_FAILED_PROCESSING}|{TASK_FAILED_REPORTING}|{TASK_RECOVERED}") # ToDo rewrite for bulk delete From ac8e792536d1cf3d71a6e09dca64d08b253bd7be Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 19:03:49 +0100 Subject: [PATCH 10/14] Update dist.py --- utils/dist.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/utils/dist.py b/utils/dist.py index 8212708dcdc..0d734ae9a68 100644 --- a/utils/dist.py +++ b/utils/dist.py @@ -341,29 +341,21 @@ def sync_sharded_files_nfs(worker_name, main_task_id): if path_exists(worker_src): path_mkdir(os.path.dirname(master_dest), exist_ok=True) shutil.copy2(worker_src, master_dest) - else: - # Fallback check for flat structure on worker (migration support) - flat_worker_src = os.path.join(worker_mount, "storage", "files", sha256) - if path_exists(flat_worker_src): - path_mkdir(os.path.dirname(master_dest), exist_ok=True) - shutil.copy2(flat_worker_src, master_dest) # Ensure symlink in analysis folder is correct link_path = os.path.join(analysis_path, rel_path) - + # If it's a broken link or doesn't exist or is a full file (we want link) if path_exists(master_dest): if os.path.islink(link_path): - # Check if it points to the right place? + # Check if it points to the right place? # For now, simpler to re-link if we want to enforce local storage path os.remove(link_path) elif path_exists(link_path): # It's a file, replace with link to save space path_delete(link_path) - path_mkdir(os.path.dirname(link_path), exist_ok=True) os.symlink(master_dest, link_path) - except (json.JSONDecodeError, OSError) as e: log.error("Error syncing file for task %s: %s", main_task_id, e) except Exception as e: From f8ac429d58c14ebbaefa7e800bd4999f79a358bb Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 19:27:34 +0100 Subject: [PATCH 11/14] Update cuckoo.conf.default --- conf/default/cuckoo.conf.default | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conf/default/cuckoo.conf.default b/conf/default/cuckoo.conf.default index a0da4fb527b..4ee1d3a8938 100644 --- a/conf/default/cuckoo.conf.default +++ b/conf/default/cuckoo.conf.default @@ -233,3 +233,5 @@ analysis = 0 mongo = no # Clean orphan files in mongodb unused_files_in_mongodb = no +# Deduplicated files +files = no From 2a1b78ac1c15fedfce0c0fa8b72267642b276594 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 19:46:43 +0100 Subject: [PATCH 12/14] fix fetch file --- lib/cuckoo/common/web_utils.py | 22 ++++++++++++++++++++-- web/analysis/views.py | 10 +++++++++- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/lib/cuckoo/common/web_utils.py b/lib/cuckoo/common/web_utils.py index a8ab2cd0961..7bf652a112c 100644 --- a/lib/cuckoo/common/web_utils.py +++ b/lib/cuckoo/common/web_utils.py @@ -25,6 +25,7 @@ from lib.cuckoo.common.path_utils import path_exists, path_mkdir, path_write_file from lib.cuckoo.common.utils import ( generate_fake_name, + get_files_storage_path, get_ip_address, get_options, get_user_filename, @@ -1118,10 +1119,27 @@ def category_all_files(task_id: str, category: str, base_path: str): # analysis = es.search(index=get_analysis_index(), query=get_query_by_info_id(task_id))["hits"]["hits"][0]["_source"] if analysis: + files = [] if query_category == "CAPE": - return [os.path.join(base_path, block["sha256"]) for block in analysis.get(query_category, {}).get("payloads", [])] + for block in analysis.get(query_category, {}).get("payloads", []): + p = os.path.join(base_path, block["sha256"]) + if path_exists(p): + files.append(p) + else: + p = get_files_storage_path(block["sha256"]) + if path_exists(p): + files.append(p) else: - return [os.path.join(base_path, block["sha256"]) for block in analysis.get(category, [])] + for block in analysis.get(category, []): + p = os.path.join(base_path, block["sha256"]) + if path_exists(p): + files.append(p) + else: + p = get_files_storage_path(block["sha256"]) + if path_exists(p): + files.append(p) + + return files def validate_task(tid, status=TASK_REPORTED): diff --git a/web/analysis/views.py b/web/analysis/views.py index 3db18dd3250..d2bf914c8a7 100644 --- a/web/analysis/views.py +++ b/web/analysis/views.py @@ -33,7 +33,7 @@ from lib.cuckoo.common.config import Config from lib.cuckoo.common.constants import ANALYSIS_BASE_PATH, CUCKOO_ROOT from lib.cuckoo.common.path_utils import path_exists, path_get_size, path_mkdir, path_read_file, path_safe -from lib.cuckoo.common.utils import delete_folder, yara_detected +from lib.cuckoo.common.utils import delete_folder, get_files_storage_path, yara_detected from lib.cuckoo.common.web_utils import category_all_files, my_rate_minutes, my_rate_seconds, perform_search, rateblock, statistics from lib.cuckoo.core.database import TASK_PENDING, Database, Task from modules.reporting.report_doc import CHUNK_CALL_SIZE @@ -1826,6 +1826,10 @@ def file(request, category, task_id, dlfile): # Self Extracted support folder if not path_exists(path): path = os.path.join(CUCKOO_ROOT, "storage", "analyses", str(task_id), "selfextracted", file_name) + + if not path_exists(path) and len(file_name) == 64: + path = get_files_storage_path(file_name) + elif category in ("droppedzipall", "procdumpzipall", "CAPEzipall"): if web_cfg.zipped_download.download_all: sub_cat = category.replace("zipall", "") @@ -1842,6 +1846,10 @@ def file(request, category, task_id, dlfile): path = buf if not path_exists(path): path = os.path.join(CUCKOO_ROOT, "storage", "analyses", str(task_id), "selfextracted", file_name) + + if not path_exists(path) and len(file_name) == 64: + path = get_files_storage_path(file_name) + elif category == "networkzip": buf = os.path.join(CUCKOO_ROOT, "storage", "analyses", task_id, "network", file_name) path = buf From 5c8b9bfbb3328d0c71b2a9866eee0710b335e892 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 20:31:06 +0100 Subject: [PATCH 13/14] sync --- .../common/integrations/file_extra_info.py | 2 +- lib/cuckoo/common/web_utils.py | 18 ++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index c8f6d22421b..6af17f5a46f 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -409,7 +409,7 @@ def _extracted_files_metadata( print( json.dumps( { - "path": os.path.join("selfextracted", file_info["sha256"]), + "path": file_info["sha256"], # Store just the SHA256 "filepath": file_info["name"], "pids": [], "ppids": [], diff --git a/lib/cuckoo/common/web_utils.py b/lib/cuckoo/common/web_utils.py index 7bf652a112c..99ddecc6795 100644 --- a/lib/cuckoo/common/web_utils.py +++ b/lib/cuckoo/common/web_utils.py @@ -1122,20 +1122,18 @@ def category_all_files(task_id: str, category: str, base_path: str): files = [] if query_category == "CAPE": for block in analysis.get(query_category, {}).get("payloads", []): - p = os.path.join(base_path, block["sha256"]) - if path_exists(p): - files.append(p) - else: - p = get_files_storage_path(block["sha256"]) + # Path in files.json now stores only the SHA256, not a relative path + sha256 = block.get("path") or block.get("sha256") + if sha256: + p = get_files_storage_path(sha256) if path_exists(p): files.append(p) else: for block in analysis.get(category, []): - p = os.path.join(base_path, block["sha256"]) - if path_exists(p): - files.append(p) - else: - p = get_files_storage_path(block["sha256"]) + # Path in files.json now stores only the SHA256, not a relative path + sha256 = block.get("path") or block.get("sha256") + if sha256: + p = get_files_storage_path(sha256) if path_exists(p): files.append(p) From 283e7471424c8607ca6dcd860a5e13b559ab9a19 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Sat, 13 Dec 2025 21:23:54 +0100 Subject: [PATCH 14/14] Update cleaners_utils.py --- lib/cuckoo/common/cleaners_utils.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py index 6e5b8022ef7..f15a7f894a1 100644 --- a/lib/cuckoo/common/cleaners_utils.py +++ b/lib/cuckoo/common/cleaners_utils.py @@ -717,18 +717,19 @@ def files_clean_before(timerange: str): for entry in it: if not entry.is_dir(): continue - selfextracted = os.path.join(entry.path, "selfextracted") - if path_exists(selfextracted): - with os.scandir(selfextracted) as se_it: - for se_entry in se_it: - if se_entry.is_symlink(): - try: - target = os.readlink(se_entry.path) - # Check if it points to storage/files - if os.path.abspath(target).startswith(os.path.abspath(files_folder)): - referenced.add(os.path.basename(target)) - except OSError: - pass + for subdir in ("selfextracted", "files", "CAPE", "procdump"): + check_dir = os.path.join(entry.path, subdir) + if path_exists(check_dir): + with os.scandir(check_dir) as se_it: + for se_entry in se_it: + if se_entry.is_symlink(): + try: + target = os.readlink(se_entry.path) + # Check if it points to storage/files + if os.path.abspath(target).startswith(os.path.abspath(files_folder)): + referenced.add(os.path.basename(target)) + except OSError: + pass # 2. Iterate storage/files and clean for root, _, filenames in os.walk(files_folder, topdown=False):