feat: deduplicate arguments

KSXGitHub · KSXGitHub · commit c1976aa48f20 · 2025-07-24T12:57:47.000+07:00
diff --git a/src/app.rs b/src/app.rs
@@ -16,7 +16,7 @@ use crate::{
 use clap::Parser;
 use hdd::any_path_is_in_hdd;
 use pipe_trait::Pipe;
-use std::{io::stdin, time::Duration};
+use std::{fs::canonicalize, io::stdin, time::Duration};
 use sub::JsonOutputParam;
 use sysinfo::Disks;
 
@@ -38,7 +38,7 @@ impl App {
     }
 
     /// Run the application.
-    pub fn run(self) -> Result<(), RuntimeError> {
+    pub fn run(mut self) -> Result<(), RuntimeError> {
         // DYNAMIC DISPATCH POLICY:
         //
         // Errors rarely occur, therefore, using dynamic dispatch to report errors have an acceptable
@@ -132,6 +132,17 @@ impl App {
                 .unwrap_or_else(|_| eprintln!("warning: Failed to set thread limit to {threads}"));
         }
 
+        if cfg!(unix) && self.args.deduplicate_hardlinks && self.args.files.len() > 1 {
+            // Hardlinks deduplication doesn't work properly if more than one path points to the
+            // same tree or if a path points to a subtree of another path. Therefore, we must find
+            // and remove such duplications before they cause problems.
+            deduplicate_arguments::deduplicate_arguments(
+                &mut self.args.files,
+                |path| canonicalize(path),
+                |a, b| a.starts_with(b),
+            );
+        }
+
         let report_error = if self.args.silent_errors {
             ErrorReport::SILENT
         } else {
@@ -291,5 +302,6 @@ impl App {
     }
 }
 
+mod deduplicate_arguments;
 mod hdd;
 mod mount_point;
diff --git a/src/app/deduplicate_arguments.rs b/src/app/deduplicate_arguments.rs
@@ -0,0 +1,94 @@
+use pipe_trait::Pipe;
+use std::{collections::HashSet, mem::take};
+
+/// Remove from `arguments` every entry that duplicates, or is nested inside, another entry.
+/// The entries to drop are chosen by [`find_argument_duplications_to_remove`]; such overlapping
+/// arguments would otherwise keep hardlinks deduplication from working properly.
+pub fn deduplicate_arguments<'a, Argument, Canonicalize, StartsWith, RealPath, CanonicalizeError>(
+    arguments: &'a mut Vec<Argument>,
+    canonicalize: Canonicalize,
+    starts_with: StartsWith,
+) where
+    Canonicalize: for<'r> FnMut(&Argument) -> Result<RealPath, CanonicalizeError>,
+    StartsWith: for<'r> FnMut(&'r RealPath, &'r RealPath) -> bool,
+    RealPath: Eq,
+{
+    let indices = find_argument_duplications_to_remove(arguments, canonicalize, starts_with);
+    remove_items_from_vec_by_indices(arguments, &indices);
+}
+
+/// Compute the indices of the arguments that duplicate, or are nested inside, another argument.
+///
+/// Arguments are compared by their canonicalized paths. When two arguments resolve to the same
+/// path, the later one is reported. When one resolves into the tree of another (as decided by
+/// `starts_with`), the nested one is reported. Arguments that fail to canonicalize are never
+/// reported.
+pub fn find_argument_duplications_to_remove<
+    Argument,
+    Canonicalize,
+    StartsWith,
+    RealPath,
+    CanonicalizeError,
+>(
+    arguments: &[Argument],
+    canonicalize: Canonicalize,
+    mut starts_with: StartsWith,
+) -> HashSet<usize>
+where
+    Canonicalize: for<'r> FnMut(&Argument) -> Result<RealPath, CanonicalizeError>,
+    StartsWith: for<'r> FnMut(&'r RealPath, &'r RealPath) -> bool,
+    RealPath: Eq,
+{
+    // Resolve every argument up front; failures stay in place so that indices keep matching.
+    let resolved: Vec<_> = arguments.iter().map(canonicalize).collect();
+    let mut redundant = HashSet::new();
+
+    for (outer_index, outer) in resolved.iter().enumerate() {
+        for (inner_index, inner) in resolved.iter().enumerate().skip(outer_index + 1) {
+            // Arguments that failed to canonicalize are never compared with anything.
+            let (outer, inner) = match (outer, inner) {
+                (Ok(outer), Ok(inner)) => (outer, inner),
+                _ => continue,
+            };
+
+            let loser = if outer == inner {
+                // Same path twice: the earlier argument stays.
+                Some(inner_index)
+            } else if starts_with(outer, inner) {
+                // `outer` lies inside `inner`: the containing tree stays.
+                Some(outer_index)
+            } else if starts_with(inner, outer) {
+                // `inner` lies inside `outer`: the containing tree stays.
+                Some(inner_index)
+            } else {
+                None
+            };
+            redundant.extend(loser);
+        }
+    }
+    redundant
+}
+
+/// Remove the elements at the given `indices` from `vec`, preserving the order of the rest.
+/// Indices that are out of bounds are ignored, no matter how many indices are given.
+pub fn remove_items_from_vec_by_indices<Item>(vec: &mut Vec<Item>, indices: &HashSet<usize>) {
+    // Optimization: If there is no element to remove then there is nothing to do.
+    if indices.is_empty() {
+        return;
+    }
+    // Optimization: Removing a single element by shifting beats rebuilding the whole vector.
+    if let (1, Some(&index)) = (indices.len(), indices.iter().next()) {
+        if index < vec.len() {
+            vec.remove(index); // would panic on an out-of-bounds index without the guard
+        }
+        return;
+    }
+    // Default: With more than one index, rebuild the vector without the listed elements.
+    *vec = vec
+        .pipe(take)
+        .into_iter()
+        .enumerate()
+        .filter(|(index, _)| !indices.contains(index))
+        .map(|(_, item)| item)
+        .collect();
+}