Skip to content

Commit c1976aa

Browse files
committed
feat: deduplicate arguments
1 parent 5503aa2 commit c1976aa

File tree

2 files changed

+108
-2
lines changed

2 files changed

+108
-2
lines changed

src/app.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use crate::{
1616
use clap::Parser;
1717
use hdd::any_path_is_in_hdd;
1818
use pipe_trait::Pipe;
19-
use std::{io::stdin, time::Duration};
19+
use std::{fs::canonicalize, io::stdin, time::Duration};
2020
use sub::JsonOutputParam;
2121
use sysinfo::Disks;
2222

@@ -38,7 +38,7 @@ impl App {
3838
}
3939

4040
/// Run the application.
41-
pub fn run(self) -> Result<(), RuntimeError> {
41+
pub fn run(mut self) -> Result<(), RuntimeError> {
4242
// DYNAMIC DISPATCH POLICY:
4343
//
4444
// Errors rarely occur, therefore, using dynamic dispatch to report errors have an acceptable
@@ -132,6 +132,17 @@ impl App {
132132
.unwrap_or_else(|_| eprintln!("warning: Failed to set thread limit to {threads}"));
133133
}
134134

135+
if cfg!(unix) && self.args.deduplicate_hardlinks && self.args.files.len() > 1 {
136+
// Hardlink deduplication doesn't work properly if more than one path points to
137+
// the same tree or if a path points to a subtree of another path. Therefore, we must find
138+
// and remove such duplicates before they cause problems.
139+
deduplicate_arguments::deduplicate_arguments(
140+
&mut self.args.files,
141+
|path| canonicalize(path),
142+
|a, b| a.starts_with(b),
143+
);
144+
}
145+
135146
let report_error = if self.args.silent_errors {
136147
ErrorReport::SILENT
137148
} else {
@@ -291,5 +302,6 @@ impl App {
291302
}
292303
}
293304

305+
mod deduplicate_arguments;
294306
mod hdd;
295307
mod mount_point;

src/app/deduplicate_arguments.rs

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
use pipe_trait::Pipe;
2+
use std::{collections::HashSet, mem::take};
3+
4+
/// Hardlinks deduplication doesn't work properly if there are more than 1 paths pointing to
5+
/// the same tree or if a path points to a subtree of another path. Therefore, we must find
6+
/// and remove such duplications before they cause problem.
7+
pub fn deduplicate_arguments<'a, Argument, Canonicalize, StartsWith, RealPath, CanonicalizeError>(
8+
arguments: &'a mut Vec<Argument>,
9+
canonicalize: Canonicalize,
10+
starts_with: StartsWith,
11+
) where
12+
Canonicalize: for<'r> FnMut(&Argument) -> Result<RealPath, CanonicalizeError>,
13+
StartsWith: for<'r> FnMut(&'r RealPath, &'r RealPath) -> bool,
14+
RealPath: Eq,
15+
{
16+
let to_remove = find_argument_duplications_to_remove(arguments, canonicalize, starts_with);
17+
remove_items_from_vec_by_indices(arguments, &to_remove);
18+
}
19+
20+
/// Compute the indices of the arguments that duplicate, or are nested inside, another argument.
///
/// Every argument is resolved once through `canonicalize`, then every pair of successfully
/// resolved paths is compared:
///
/// * When two paths are equal, the later one is marked for removal (the first instance wins).
/// * When one path is inside the other according to `starts_with`, the inner one is marked for
///   removal (the containing tree wins).
///
/// Arguments whose canonicalization failed take part in no comparison and are never marked.
pub fn find_argument_duplications_to_remove<
    Argument,
    Canonicalize,
    StartsWith,
    RealPath,
    CanonicalizeError,
>(
    arguments: &[Argument],
    canonicalize: Canonicalize,
    mut starts_with: StartsWith,
) -> HashSet<usize>
where
    Canonicalize: for<'r> FnMut(&Argument) -> Result<RealPath, CanonicalizeError>,
    StartsWith: for<'r> FnMut(&'r RealPath, &'r RealPath) -> bool,
    RealPath: Eq,
{
    let resolved: Vec<_> = arguments.iter().map(canonicalize).collect();
    let mut redundant = HashSet::new();

    for (earlier_index, earlier) in resolved.iter().enumerate() {
        // An unresolvable path cannot be compared against anything.
        let earlier = match earlier {
            Ok(path) => path,
            Err(_) => continue,
        };

        // Only look at the entries after `earlier_index` so that each pair is visited once.
        for (later_index, later) in resolved.iter().enumerate().skip(earlier_index + 1) {
            let later = match later {
                Ok(path) => path,
                Err(_) => continue,
            };

            if earlier == later {
                // Same path twice: keep the first occurrence.
                redundant.insert(later_index);
            } else if starts_with(earlier, later) {
                // `earlier` lives inside `later`: keep the containing tree.
                redundant.insert(earlier_index);
            } else if starts_with(later, earlier) {
                // `later` lives inside `earlier`: keep the containing tree.
                redundant.insert(later_index);
            }
        }
    }

    redundant
}
71+
72+
/// Remove elements from a vector by indices.
73+
pub fn remove_items_from_vec_by_indices<Item>(vec: &mut Vec<Item>, indices: &HashSet<usize>) {
74+
// Optimization: If there is no element to remove then there is nothing to do.
75+
if indices.is_empty() {
76+
return;
77+
}
78+
79+
// Optimization: If there is only 1 element to remove, shifting elements would be cheaper than reallocate a whole array.
80+
if indices.len() == 1 {
81+
let index = *indices.iter().next().unwrap();
82+
vec.remove(index);
83+
return;
84+
}
85+
86+
// Default: If there are more than 1 elements to remove, just copy the whole array without them.
87+
*vec = vec
88+
.pipe(take)
89+
.into_iter()
90+
.enumerate()
91+
.filter(|(index, _)| !indices.contains(index))
92+
.map(|(_, item)| item)
93+
.collect();
94+
}

0 commit comments

Comments
 (0)