From 27467eb9e7a601ad95a751f3a80704506d60419b Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 15:02:36 +0100
Subject: [PATCH 01/14] files deduplication

---
 lib/cuckoo/common/cleaners_utils.py           | 71 ++++++++++++++++++-
 .../common/integrations/file_extra_info.py    | 52 ++++++++++----
 modules/processing/CAPE.py                    | 27 ++++++-
 3 files changed, 132 insertions(+), 18 deletions(-)

diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py
index 14953df9bf6..ed45c07d0f4 100644
--- a/lib/cuckoo/common/cleaners_utils.py
+++ b/lib/cuckoo/common/cleaners_utils.py
@@ -125,6 +125,8 @@ def free_space_monitor(path=False, return_value=False, processing=False, analysi
         cleanup_dict["delete_older_than"] = config.cleaner.analysis
     if config.cleaner.unused_files_in_mongodb:
         cleanup_dict["delete_unused_file_data_in_mongo"] = 1
+    if config.cleaner.get("files"):
+        cleanup_dict["delete_files_items_older_than"] = config.cleaner.get("files")
 
     need_space, space_available = False, 0
     # Calculate the free disk space in megabytes.
@@ -683,6 +685,68 @@ def cape_clean_tlp():
     delete_bulk_tasks_n_folders(tlp_tasks, False)
 
 
+def files_clean_before(timerange: str):
+    """
+    Clean up files in storage/files that are not referenced by any analysis
+    and are older than the specified time range.
+    """
+    older_than = convert_into_time(timerange)
+    files_folder = os.path.join(CUCKOO_ROOT, "storage", "files")
+    analyses_folder = os.path.join(CUCKOO_ROOT, "storage", "analyses")
+
+    if not path_exists(files_folder):
+        return
+
+    # 1. Build set of referenced hashes
+    referenced = set()
+    used_mongo = False
+
+    if is_reporting_db_connected() and repconf.mongodb.enabled and "mongo_find" in globals():
+        try:
+            # Query all _id (SHA256) from files collection
+            cursor = mongo_find("files", {}, {"_id": 1})
+            for doc in cursor:
+                referenced.add(doc["_id"])
+            used_mongo = True
+            log.info("Loaded %d referenced files from MongoDB", len(referenced))
+        except Exception as e:
+            log.error("Failed to query MongoDB for files: %s. Falling back to filesystem scan.", e)
+
+    if not used_mongo and path_exists(analyses_folder):
+        log.info("Scanning analysis folders for file references...")
+        with os.scandir(analyses_folder) as it:
+            for entry in it:
+                if not entry.is_dir():
+                    continue
+                selfextracted = os.path.join(entry.path, "selfextracted")
+                if path_exists(selfextracted):
+                    with os.scandir(selfextracted) as se_it:
+                        for se_entry in se_it:
+                            if se_entry.is_symlink():
+                                try:
+                                    target = os.readlink(se_entry.path)
+                                    # Check if it points to storage/files
+                                    if os.path.abspath(target).startswith(os.path.abspath(files_folder)):
+                                        referenced.add(os.path.basename(target))
+                                except OSError:
+                                    pass
+
+    # 2. Iterate storage/files and clean
+    for _, _, filenames in os.walk(files_folder):
+        for sha256 in filenames:
+            if sha256 in referenced:
+                continue
+
+            file_path = os.path.join(files_folder, sha256)
+            try:
+                st_ctime = path_get_date(file_path)
+                # Correct logic: delete if OLDER than limit (<)
+                if datetime.fromtimestamp(st_ctime) < older_than:
+                    path_delete(file_path)
+            except Exception as e:
+                log.warning("Error checking/deleting file %s: %s", file_path, e)
+
+
 def binaries_clean_before(timerange: str):
     # In case if "delete_bin_copy = off" we might need to clean binaries
     # find storage/binaries/ -name "*" -type f -mtime 5 -delete
@@ -783,11 +847,14 @@ def execute_cleanup(args: dict, init_log=True):
     if args.get("delete_tmp_items_older_than"):
         tmp_clean_before(args["delete_tmp_items_older_than"])
 
+    if args.get("delete_unused_file_data_in_mongo"):
+        delete_unused_file_data_in_mongo()
+
     if args.get("delete_binaries_items_older_than"):
         binaries_clean_before(args["delete_binaries_items_older_than"])
 
-    if args.get("delete_unused_file_data_in_mongo"):
-        delete_unused_file_data_in_mongo()
+    if args.get("delete_files_items_older_than"):
+        files_clean_before(args["delete_files_items_older_than"])
 
     if args.get("cleanup_files_collection_by_id"):
         cleanup_files_collection_by_id(args["cleanup_files_collection_by_id"])
diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py
index 4d164c1ecf5..c04a323998f 100644
--- a/lib/cuckoo/common/integrations/file_extra_info.py
+++ b/lib/cuckoo/common/integrations/file_extra_info.py
@@ -385,22 +385,44 @@ def _extracted_files_metadata(
             file_info["path"] = dest_path
             file_info["guest_paths"] = [file_info["name"]]
             file_info["name"] = os.path.basename(dest_path)
+            # Define the new central storage for all files (extracted, dropped, etc.)
+            files_storage_dir = os.path.join(CUCKOO_ROOT, "storage", "files")
+            master_file_path = os.path.join(files_storage_dir, file_info["sha256"])
+
+            # 1. Ensure file is in central storage
+            if not path_exists(master_file_path):
+                path_mkdir(files_storage_dir, exist_ok=True)
+                shutil.move(full_path, master_file_path)
+            elif path_exists(full_path):
+                # We already have it, delete the temp duplicate
+                path_delete(full_path)
+
+            # 2. Create symlink in analysis folder (or copy if link fails)
             if not path_exists(dest_path):
-                shutil.move(full_path, dest_path)
-                print(
-                    json.dumps(
-                        {
-                            "path": os.path.join("files", file_info["sha256"]),
-                            "filepath": file_info["name"],
-                            "pids": [],
-                            "ppids": [],
-                            "metadata": "",
-                            "category": "files",
-                        },
-                        ensure_ascii=False,
-                    ),
-                    file=f,
-                )
+                try:
+                    if hasattr(os, "symlink"):
+                        os.symlink(master_file_path, dest_path)
+                    else:
+                        shutil.copy(master_file_path, dest_path)
+                except OSError:
+                    # Fallback to copy on error
+                    shutil.copy(master_file_path, dest_path)
+            
+            # Update files.json for UI/Reporting to correctly reference the symlinked file
+            print(
+                json.dumps(
+                    {
+                        "path": os.path.join("selfextracted", file_info["sha256"]),
+                        "filepath": file_info["name"],
+                        "pids": [],
+                        "ppids": [],
+                        "metadata": "",
+                        "category": "selfextracted",
+                    },
+                    ensure_ascii=False,
+                ),
+                file=f,
+            )
             file_info["data"] = is_text_file(file_info, destination_folder, processing_conf.CAPE.buffer)
             metadata.append(file_info)
 
diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py
index cc4671eed89..648777fcb18 100644
--- a/modules/processing/CAPE.py
+++ b/modules/processing/CAPE.py
@@ -16,6 +16,7 @@
 import json
 import logging
 import os
+import shutil
 import timeit
 from contextlib import suppress
 from pathlib import Path
@@ -23,9 +24,10 @@
 from lib.cuckoo.common.abstracts import Processing
 from lib.cuckoo.common.cape_utils import cape_name_from_yara, is_duplicated_binary, pe_map, static_config_parsers
 from lib.cuckoo.common.config import Config
+from lib.cuckoo.common.constants import CUCKOO_ROOT
 from lib.cuckoo.common.integrations.file_extra_info import DuplicatesType, static_file_info
 from lib.cuckoo.common.objects import File
-from lib.cuckoo.common.path_utils import path_exists
+from lib.cuckoo.common.path_utils import path_exists, path_mkdir
 from lib.cuckoo.common.replace_patterns_utils import _clean_path
 from lib.cuckoo.common.utils import (
     add_family_detection,
@@ -175,6 +177,29 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str,
         f = File(file_path, metadata.get("metadata", ""))
         sha256 = f.get_sha256()
 
+        # Deduplicate dropped, procdump, CAPE, and package files to storage/files
+        if category in ("dropped", "procdump", "CAPE", "package", "procmemory") and not os.path.islink(file_path):
+            try:
+                files_storage_dir = os.path.join(CUCKOO_ROOT, "storage", "files")
+                master_path = os.path.join(files_storage_dir, sha256)
+
+                if not path_exists(master_path):
+                    path_mkdir(files_storage_dir, exist_ok=True)
+                    # Move file
+                    shutil.move(file_path, master_path)
+                else:
+                    # Already exists, delete duplicate
+                    os.remove(file_path)
+
+                # Link back
+                if hasattr(os, "symlink"):
+                    os.symlink(master_path, file_path)
+                else:
+                    shutil.copy(master_path, file_path)
+
+            except Exception as e:
+                log.error("Deduplication failed for %s: %s", file_path, e)
+
         if sha256 in duplicated["sha256"]:
             log.debug("Skipping file that has already been processed: %s", sha256)
             return

From 655677fbc0808bc4c9422e3588151c36d821875e Mon Sep 17 00:00:00 2001
From: doomedraven <abrukhovetskyy@google.com>
Date: Sat, 13 Dec 2025 15:11:09 +0100
Subject: [PATCH 02/14] Update modules/processing/CAPE.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 modules/processing/CAPE.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py
index 648777fcb18..4ddfe6c2872 100644
--- a/modules/processing/CAPE.py
+++ b/modules/processing/CAPE.py
@@ -192,9 +192,9 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str,
                     os.remove(file_path)
 
                 # Link back
-                if hasattr(os, "symlink"):
+                try:
                     os.symlink(master_path, file_path)
-                else:
+                except (OSError, AttributeError):
                     shutil.copy(master_path, file_path)
 
             except Exception as e:

From 170b0042641998064141cd6582a81d43843f956d Mon Sep 17 00:00:00 2001
From: doomedraven <abrukhovetskyy@google.com>
Date: Sat, 13 Dec 2025 15:11:22 +0100
Subject: [PATCH 03/14] Update lib/cuckoo/common/cleaners_utils.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 lib/cuckoo/common/cleaners_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py
index ed45c07d0f4..a132b670c13 100644
--- a/lib/cuckoo/common/cleaners_utils.py
+++ b/lib/cuckoo/common/cleaners_utils.py
@@ -732,12 +732,12 @@ def files_clean_before(timerange: str):
                                     pass
 
     # 2. Iterate storage/files and clean
-    for _, _, filenames in os.walk(files_folder):
+    for root, _, filenames in os.walk(files_folder):
         for sha256 in filenames:
             if sha256 in referenced:
                 continue
 
-            file_path = os.path.join(files_folder, sha256)
+            file_path = os.path.join(root, sha256)
             try:
                 st_ctime = path_get_date(file_path)
                 # Correct logic: delete if OLDER than limit (<)

From 8eb704ef2606f1923f7f44717d863369eae3b70f Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 15:48:41 +0100
Subject: [PATCH 04/14] Update file_extra_info.py

---
 lib/cuckoo/common/integrations/file_extra_info.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py
index c04a323998f..67b1b7bfad1 100644
--- a/lib/cuckoo/common/integrations/file_extra_info.py
+++ b/lib/cuckoo/common/integrations/file_extra_info.py
@@ -407,7 +407,7 @@ def _extracted_files_metadata(
                 except OSError:
                     # Fallback to copy on error
                     shutil.copy(master_file_path, dest_path)
-            
+
             # Update files.json for UI/Reporting to correctly reference the symlinked file
             print(
                 json.dumps(

From dc0b4f123c48d30d044bdf8051fd3fc51f507345 Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 16:18:16 +0100
Subject: [PATCH 05/14] Implement sharded storage for files by SHA256

Introduced get_files_storage_path to shard files in storage/files by SHA256 prefix, reducing the number of files per directory. Updated file deduplication and cleaning logic in CAPE processing, file_extra_info integration, and cleaners_utils to use the new sharded path structure and to remove empty directories after file deletion.
---
 lib/cuckoo/common/cleaners_utils.py               | 10 +++++++++-
 lib/cuckoo/common/integrations/file_extra_info.py |  6 +++---
 lib/cuckoo/common/utils.py                        | 12 ++++++++++++
 modules/processing/CAPE.py                        |  5 +++--
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py
index a132b670c13..ef0c73546cc 100644
--- a/lib/cuckoo/common/cleaners_utils.py
+++ b/lib/cuckoo/common/cleaners_utils.py
@@ -732,7 +732,7 @@ def files_clean_before(timerange: str):
                                     pass
 
     # 2. Iterate storage/files and clean
-    for root, _, filenames in os.walk(files_folder):
+    for root, _, filenames in os.walk(files_folder, topdown=False):
         for sha256 in filenames:
             if sha256 in referenced:
                 continue
@@ -745,6 +745,14 @@ def files_clean_before(timerange: str):
                     path_delete(file_path)
             except Exception as e:
                 log.warning("Error checking/deleting file %s: %s", file_path, e)
+        
+        # Try to remove empty directories (except the root files_folder)
+        if root != files_folder:
+            try:
+                os.rmdir(root)
+            except OSError:
+                # Directory not empty or other error
+                pass
 
 
 def binaries_clean_before(timerange: str):
diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py
index 67b1b7bfad1..c690eb4b071 100644
--- a/lib/cuckoo/common/integrations/file_extra_info.py
+++ b/lib/cuckoo/common/integrations/file_extra_info.py
@@ -45,7 +45,7 @@
     path_read_file,
     path_write_file,
 )
-from lib.cuckoo.common.utils import get_options, is_text_file
+from lib.cuckoo.common.utils import get_files_storage_path, get_options, is_text_file
 
 try:
     from sflock import unpack
@@ -386,8 +386,8 @@ def _extracted_files_metadata(
             file_info["guest_paths"] = [file_info["name"]]
             file_info["name"] = os.path.basename(dest_path)
             # Define the new central storage for all files (extracted, dropped, etc.)
-            files_storage_dir = os.path.join(CUCKOO_ROOT, "storage", "files")
-            master_file_path = os.path.join(files_storage_dir, file_info["sha256"])
+            master_file_path = get_files_storage_path(file_info["sha256"])
+            files_storage_dir = os.path.dirname(master_file_path)
 
             # 1. Ensure file is in central storage
             if not path_exists(master_file_path):
diff --git a/lib/cuckoo/common/utils.py b/lib/cuckoo/common/utils.py
index 3f2e67dd1bb..9747832e907 100644
--- a/lib/cuckoo/common/utils.py
+++ b/lib/cuckoo/common/utils.py
@@ -185,6 +185,18 @@ def get_memdump_path(memdump_id, analysis_folder=False):
     )
 
 
+def get_files_storage_path(sha256: str) -> str:
+    """
+    Get the path of the stored file under storage/files for a given SHA256.
+    Uses sharding (e.g., storage/files/ab/cd/abcdef...) to avoid
+    too many files in a single directory.
+    """
+    if not sha256 or len(sha256) < 4:
+        return os.path.join(CUCKOO_ROOT, "storage", "files", sha256)
+
+    return os.path.join(CUCKOO_ROOT, "storage", "files", sha256[:2], sha256[2:4], sha256)
+
+
 def validate_referrer(url):
     if not url:
         return None
diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py
index 4ddfe6c2872..fa580d31962 100644
--- a/modules/processing/CAPE.py
+++ b/modules/processing/CAPE.py
@@ -33,6 +33,7 @@
     add_family_detection,
     convert_to_printable_and_truncate,
     get_clamav_consensus,
+    get_files_storage_path,
     make_bytes,
     texttypes,
     wide2str,
@@ -180,8 +181,8 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str,
         # Deduplicate dropped, procdump, CAPE, and package files to storage/files
         if category in ("dropped", "procdump", "CAPE", "package", "procmemory") and not os.path.islink(file_path):
             try:
-                files_storage_dir = os.path.join(CUCKOO_ROOT, "storage", "files")
-                master_path = os.path.join(files_storage_dir, sha256)
+                master_path = get_files_storage_path(sha256)
+                files_storage_dir = os.path.dirname(master_path)
 
                 if not path_exists(master_path):
                     path_mkdir(files_storage_dir, exist_ok=True)

From 9034dddf715fb5e1c30313a78ffd33234234a9c4 Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 16:29:53 +0100
Subject: [PATCH 06/14] Drop redundant hasattr(os, "symlink") check and unused CUCKOO_ROOT import

---
 lib/cuckoo/common/integrations/file_extra_info.py | 5 +----
 modules/processing/CAPE.py                        | 1 -
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py
index c690eb4b071..c8f6d22421b 100644
--- a/lib/cuckoo/common/integrations/file_extra_info.py
+++ b/lib/cuckoo/common/integrations/file_extra_info.py
@@ -400,10 +400,7 @@ def _extracted_files_metadata(
             # 2. Create symlink in analysis folder (or copy if link fails)
             if not path_exists(dest_path):
                 try:
-                    if hasattr(os, "symlink"):
-                        os.symlink(master_file_path, dest_path)
-                    else:
-                        shutil.copy(master_file_path, dest_path)
+                    os.symlink(master_file_path, dest_path)
                 except OSError:
                     # Fallback to copy on error
                     shutil.copy(master_file_path, dest_path)
diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py
index fa580d31962..354490c5e4e 100644
--- a/modules/processing/CAPE.py
+++ b/modules/processing/CAPE.py
@@ -24,7 +24,6 @@
 from lib.cuckoo.common.abstracts import Processing
 from lib.cuckoo.common.cape_utils import cape_name_from_yara, is_duplicated_binary, pe_map, static_config_parsers
 from lib.cuckoo.common.config import Config
-from lib.cuckoo.common.constants import CUCKOO_ROOT
 from lib.cuckoo.common.integrations.file_extra_info import DuplicatesType, static_file_info
 from lib.cuckoo.common.objects import File
 from lib.cuckoo.common.path_utils import path_exists, path_mkdir

From 0d0ce869aae288a832eb48bc585899ea0d2f0a1f Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 16:32:40 +0100
Subject: [PATCH 07/14] Update cleaners_utils.py

---
 lib/cuckoo/common/cleaners_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py
index ef0c73546cc..0472bf018a8 100644
--- a/lib/cuckoo/common/cleaners_utils.py
+++ b/lib/cuckoo/common/cleaners_utils.py
@@ -745,7 +745,7 @@ def files_clean_before(timerange: str):
                     path_delete(file_path)
             except Exception as e:
                 log.warning("Error checking/deleting file %s: %s", file_path, e)
-        
+
         # Try to remove empty directories (except the root files_folder)
         if root != files_folder:
             try:

From bfa215e1cbc2e07ce2019596dcecba3e7b99ccc0 Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 18:27:30 +0100
Subject: [PATCH 08/14] dist: sync deduplicated sharded files from workers over NFS

---
 utils/dist.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/utils/dist.py b/utils/dist.py
index f319bdc2055..8212708dcdc 100644
--- a/utils/dist.py
+++ b/utils/dist.py
@@ -41,7 +41,7 @@
 from lib.cuckoo.common.dist_db import ExitNodes, Machine, Node, Task, create_session
 from lib.cuckoo.common.path_utils import path_delete, path_exists, path_get_size, path_mkdir, path_mount_point, path_write_file
 from lib.cuckoo.common.socket_utils import send_socket_command
-from lib.cuckoo.common.utils import get_options
+from lib.cuckoo.common.utils import get_files_storage_path, get_options
 from lib.cuckoo.core.database import (
     TASK_BANNED,
     TASK_DISTRIBUTED,
@@ -304,6 +304,72 @@ def node_get_report_nfs(task_id, worker_name, main_task_id) -> bool:
     return True
 
 
+def sync_sharded_files_nfs(worker_name, main_task_id):
+    """
+    Synchronize deduplicated files from worker to master using sharded storage.
+    """
+    analysis_path = os.path.join(CUCKOO_ROOT, "storage", "analyses", str(main_task_id))
+    files_json_path = os.path.join(analysis_path, "files.json")
+
+    if not path_exists(files_json_path):
+        return
+
+    try:
+        with open(files_json_path, "r") as f:
+            for line in f:
+                try:
+                    entry = json.loads(line)
+                    rel_path = entry.get("path")
+                    if not rel_path or "selfextracted" not in rel_path:
+                        continue
+
+                    # Extract SHA256 from path (e.g. selfextracted/SHA256)
+                    sha256 = os.path.basename(rel_path)
+                    if len(sha256) != 64:
+                        continue
+
+                    # Master destination (sharded)
+                    master_dest = get_files_storage_path(sha256)
+
+                    # If missing on master, fetch from worker
+                    if not path_exists(master_dest):
+                        worker_mount = os.path.join(CUCKOO_ROOT, dist_conf.NFS.mount_folder, str(worker_name))
+                        # Construct worker source path (sharded) manually relative to mount
+                        shard_rel = os.path.join("storage", "files", sha256[:2], sha256[2:4], sha256)
+                        worker_src = os.path.join(worker_mount, shard_rel)
+
+                        if path_exists(worker_src):
+                            path_mkdir(os.path.dirname(master_dest), exist_ok=True)
+                            shutil.copy2(worker_src, master_dest)
+                        else:
+                            # Fallback check for flat structure on worker (migration support)
+                            flat_worker_src = os.path.join(worker_mount, "storage", "files", sha256)
+                            if path_exists(flat_worker_src):
+                                path_mkdir(os.path.dirname(master_dest), exist_ok=True)
+                                shutil.copy2(flat_worker_src, master_dest)
+
+                    # Ensure symlink in analysis folder is correct
+                    link_path = os.path.join(analysis_path, rel_path)
+                    
+                    # If it's a broken link or doesn't exist or is a full file (we want link)
+                    if path_exists(master_dest):
+                        if os.path.islink(link_path):
+                            # Check if it points to the right place? 
+                            # For now, simpler to re-link if we want to enforce local storage path
+                            os.remove(link_path)
+                        elif path_exists(link_path):
+                            # It's a file, replace with link to save space
+                            path_delete(link_path)
+                        
+                        path_mkdir(os.path.dirname(link_path), exist_ok=True)
+                        os.symlink(master_dest, link_path)
+
+                except (json.JSONDecodeError, OSError) as e:
+                    log.error("Error syncing file for task %s: %s", main_task_id, e)
+    except Exception as e:
+        log.exception("Failed to sync sharded files for task %s: %s", main_task_id, e)
+
+
 def _delete_many(node, ids, nodes, db):
     """
     Deletes multiple tasks from a specified node if the node is not the main server.
@@ -955,6 +1021,8 @@ def fetch_latest_reports_nfs(self):
                         t.main_task_id,
                     )
 
+                    sync_sharded_files_nfs(node.name, t.main_task_id)
+
                     # this doesn't exist for some reason
                     if path_exists(t.path):
                         sample_sha256 = None

From 8d8d511ca9e53d0b7824ca6a7fa3ec4f9b34fcb4 Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 18:58:37 +0100
Subject: [PATCH 09/14] Update cleaners_utils.py

---
 lib/cuckoo/common/cleaners_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py
index 0472bf018a8..6e5b8022ef7 100644
--- a/lib/cuckoo/common/cleaners_utils.py
+++ b/lib/cuckoo/common/cleaners_utils.py
@@ -351,7 +351,6 @@ def cuckoo_clean_failed_tasks():
     # This need to init a console logger handler, because the standard
     # logger (init_logging()) logs to a file which will be deleted.
     create_structure()
-
     # ToDo multi status
     tasks_list = db.list_tasks(status=f"{TASK_FAILED_ANALYSIS}|{TASK_FAILED_PROCESSING}|{TASK_FAILED_REPORTING}|{TASK_RECOVERED}")
     # ToDo rewrite for bulk delete

From ac8e792536d1cf3d71a6e09dca64d08b253bd7be Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 19:03:49 +0100
Subject: [PATCH 10/14] dist: drop flat-layout fallback in sharded file sync, tidy whitespace

---
 utils/dist.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/utils/dist.py b/utils/dist.py
index 8212708dcdc..0d734ae9a68 100644
--- a/utils/dist.py
+++ b/utils/dist.py
@@ -341,29 +341,21 @@ def sync_sharded_files_nfs(worker_name, main_task_id):
                         if path_exists(worker_src):
                             path_mkdir(os.path.dirname(master_dest), exist_ok=True)
                             shutil.copy2(worker_src, master_dest)
-                        else:
-                            # Fallback check for flat structure on worker (migration support)
-                            flat_worker_src = os.path.join(worker_mount, "storage", "files", sha256)
-                            if path_exists(flat_worker_src):
-                                path_mkdir(os.path.dirname(master_dest), exist_ok=True)
-                                shutil.copy2(flat_worker_src, master_dest)
 
                     # Ensure symlink in analysis folder is correct
                     link_path = os.path.join(analysis_path, rel_path)
-                    
+
                     # If it's a broken link or doesn't exist or is a full file (we want link)
                     if path_exists(master_dest):
                         if os.path.islink(link_path):
-                            # Check if it points to the right place? 
+                            # Check if it points to the right place?
                             # For now, simpler to re-link if we want to enforce local storage path
                             os.remove(link_path)
                         elif path_exists(link_path):
                             # It's a file, replace with link to save space
                             path_delete(link_path)
-                        
                         path_mkdir(os.path.dirname(link_path), exist_ok=True)
                         os.symlink(master_dest, link_path)
-
                 except (json.JSONDecodeError, OSError) as e:
                     log.error("Error syncing file for task %s: %s", main_task_id, e)
     except Exception as e:

From f8ac429d58c14ebbaefa7e800bd4999f79a358bb Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 19:27:34 +0100
Subject: [PATCH 11/14] Update cuckoo.conf.default

---
 conf/default/cuckoo.conf.default | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/conf/default/cuckoo.conf.default b/conf/default/cuckoo.conf.default
index a0da4fb527b..4ee1d3a8938 100644
--- a/conf/default/cuckoo.conf.default
+++ b/conf/default/cuckoo.conf.default
@@ -233,3 +233,5 @@ analysis = 0
 mongo = no
 # Clean orphan files in mongodb
 unused_files_in_mongodb = no
+# Delete unreferenced files in storage/files older than this time range (no = disabled)
+files = no

From 2a1b78ac1c15fedfce0c0fa8b72267642b276594 Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 19:46:43 +0100
Subject: [PATCH 12/14] Fall back to sharded storage/files when fetching analysis files

---
 lib/cuckoo/common/web_utils.py | 22 ++++++++++++++++++++--
 web/analysis/views.py          | 10 +++++++++-
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/lib/cuckoo/common/web_utils.py b/lib/cuckoo/common/web_utils.py
index a8ab2cd0961..7bf652a112c 100644
--- a/lib/cuckoo/common/web_utils.py
+++ b/lib/cuckoo/common/web_utils.py
@@ -25,6 +25,7 @@
 from lib.cuckoo.common.path_utils import path_exists, path_mkdir, path_write_file
 from lib.cuckoo.common.utils import (
     generate_fake_name,
+    get_files_storage_path,
     get_ip_address,
     get_options,
     get_user_filename,
@@ -1118,10 +1119,27 @@ def category_all_files(task_id: str, category: str, base_path: str):
     #    analysis = es.search(index=get_analysis_index(), query=get_query_by_info_id(task_id))["hits"]["hits"][0]["_source"]
 
     if analysis:
+        files = []
         if query_category == "CAPE":
-            return [os.path.join(base_path, block["sha256"]) for block in analysis.get(query_category, {}).get("payloads", [])]
+            for block in analysis.get(query_category, {}).get("payloads", []):
+                p = os.path.join(base_path, block["sha256"])
+                if path_exists(p):
+                    files.append(p)
+                else:
+                    p = get_files_storage_path(block["sha256"])
+                    if path_exists(p):
+                        files.append(p)
         else:
-            return [os.path.join(base_path, block["sha256"]) for block in analysis.get(category, [])]
+            for block in analysis.get(category, []):
+                p = os.path.join(base_path, block["sha256"])
+                if path_exists(p):
+                    files.append(p)
+                else:
+                    p = get_files_storage_path(block["sha256"])
+                    if path_exists(p):
+                        files.append(p)
+
+        return files
 
 
 def validate_task(tid, status=TASK_REPORTED):
diff --git a/web/analysis/views.py b/web/analysis/views.py
index 3db18dd3250..d2bf914c8a7 100644
--- a/web/analysis/views.py
+++ b/web/analysis/views.py
@@ -33,7 +33,7 @@
 from lib.cuckoo.common.config import Config
 from lib.cuckoo.common.constants import ANALYSIS_BASE_PATH, CUCKOO_ROOT
 from lib.cuckoo.common.path_utils import path_exists, path_get_size, path_mkdir, path_read_file, path_safe
-from lib.cuckoo.common.utils import delete_folder, yara_detected
+from lib.cuckoo.common.utils import delete_folder, get_files_storage_path, yara_detected
 from lib.cuckoo.common.web_utils import category_all_files, my_rate_minutes, my_rate_seconds, perform_search, rateblock, statistics
 from lib.cuckoo.core.database import TASK_PENDING, Database, Task
 from modules.reporting.report_doc import CHUNK_CALL_SIZE
@@ -1826,6 +1826,10 @@ def file(request, category, task_id, dlfile):
         # Self Extracted support folder
         if not path_exists(path):
             path = os.path.join(CUCKOO_ROOT, "storage", "analyses", str(task_id), "selfextracted", file_name)
+
+        if not path_exists(path) and len(file_name) == 64:
+            path = get_files_storage_path(file_name)
+
     elif category in ("droppedzipall", "procdumpzipall", "CAPEzipall"):
         if web_cfg.zipped_download.download_all:
             sub_cat = category.replace("zipall", "")
@@ -1842,6 +1846,10 @@ def file(request, category, task_id, dlfile):
             path = buf
             if not path_exists(path):
                 path = os.path.join(CUCKOO_ROOT, "storage", "analyses", str(task_id), "selfextracted", file_name)
+
+            if not path_exists(path) and len(file_name) == 64:
+                path = get_files_storage_path(file_name)
+
     elif category == "networkzip":
         buf = os.path.join(CUCKOO_ROOT, "storage", "analyses", task_id, "network", file_name)
         path = buf

From 5c8b9bfbb3328d0c71b2a9866eee0710b335e892 Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 20:31:06 +0100
Subject: [PATCH 13/14] sync

---
 .../common/integrations/file_extra_info.py     |  2 +-
 lib/cuckoo/common/web_utils.py                 | 18 ++++++++----------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py
index c8f6d22421b..6af17f5a46f 100644
--- a/lib/cuckoo/common/integrations/file_extra_info.py
+++ b/lib/cuckoo/common/integrations/file_extra_info.py
@@ -409,7 +409,7 @@ def _extracted_files_metadata(
             print(
                 json.dumps(
                     {
-                        "path": os.path.join("selfextracted", file_info["sha256"]),
+                        "path": file_info["sha256"],  # Store just the SHA256
                         "filepath": file_info["name"],
                         "pids": [],
                         "ppids": [],
diff --git a/lib/cuckoo/common/web_utils.py b/lib/cuckoo/common/web_utils.py
index 7bf652a112c..99ddecc6795 100644
--- a/lib/cuckoo/common/web_utils.py
+++ b/lib/cuckoo/common/web_utils.py
@@ -1122,20 +1122,18 @@ def category_all_files(task_id: str, category: str, base_path: str):
         files = []
         if query_category == "CAPE":
             for block in analysis.get(query_category, {}).get("payloads", []):
-                p = os.path.join(base_path, block["sha256"])
-                if path_exists(p):
-                    files.append(p)
-                else:
-                    p = get_files_storage_path(block["sha256"])
+                # "path" (as written to files.json) now holds just the SHA256, not a relative path; use "sha256" if unset
+                sha256 = block.get("path") or block.get("sha256")
+                if sha256:
+                    p = get_files_storage_path(sha256)
                     if path_exists(p):
                         files.append(p)
         else:
             for block in analysis.get(category, []):
-                p = os.path.join(base_path, block["sha256"])
-                if path_exists(p):
-                    files.append(p)
-                else:
-                    p = get_files_storage_path(block["sha256"])
+                # "path" (as written to files.json) now holds just the SHA256, not a relative path; use "sha256" if unset
+                sha256 = block.get("path") or block.get("sha256")
+                if sha256:
+                    p = get_files_storage_path(sha256)
                     if path_exists(p):
                         files.append(p)
 

From 283e7471424c8607ca6dcd860a5e13b559ab9a19 Mon Sep 17 00:00:00 2001
From: doomedraven <doommedraven@gmail.com>
Date: Sat, 13 Dec 2025 21:23:54 +0100
Subject: [PATCH 14/14] Update cleaners_utils.py

---
 lib/cuckoo/common/cleaners_utils.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/lib/cuckoo/common/cleaners_utils.py b/lib/cuckoo/common/cleaners_utils.py
index 6e5b8022ef7..f15a7f894a1 100644
--- a/lib/cuckoo/common/cleaners_utils.py
+++ b/lib/cuckoo/common/cleaners_utils.py
@@ -717,18 +717,19 @@ def files_clean_before(timerange: str):
             for entry in it:
                 if not entry.is_dir():
                     continue
-                selfextracted = os.path.join(entry.path, "selfextracted")
-                if path_exists(selfextracted):
-                    with os.scandir(selfextracted) as se_it:
-                        for se_entry in se_it:
-                            if se_entry.is_symlink():
-                                try:
-                                    target = os.readlink(se_entry.path)
-                                    # Check if it points to storage/files
-                                    if os.path.abspath(target).startswith(os.path.abspath(files_folder)):
-                                        referenced.add(os.path.basename(target))
-                                except OSError:
-                                    pass
+                for subdir in ("selfextracted", "files", "CAPE", "procdump"):
+                    check_dir = os.path.join(entry.path, subdir)
+                    if path_exists(check_dir):
+                        with os.scandir(check_dir) as se_it:
+                            for se_entry in se_it:
+                                if se_entry.is_symlink():
+                                    try:
+                                        target = os.readlink(se_entry.path)
+                                        # Check if it points to storage/files
+                                        if os.path.abspath(target).startswith(os.path.abspath(files_folder)):
+                                            referenced.add(os.path.basename(target))
+                                    except OSError:
+                                        pass
 
     # 2. Iterate storage/files and clean
     for root, _, filenames in os.walk(files_folder, topdown=False):