Skip to content

Commit 64f8ca0

Browse files
authored
Add --remove-invalid option for validation (#19)
1 parent 32149ed commit 64f8ca0

File tree

7 files changed

+111
-18
lines changed

7 files changed

+111
-18
lines changed

README.md

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ but has few shortcomings that can appear quite fast after using it with existing
3737

3838
## Tech stack
3939

40-
Project is implemented in `Python 3.11` and is using `poetry` as package manager. Main libraries used:
40+
The project is implemented in `Python 3.11` and uses [`uv`](https://github.com/astral-sh/uv) as its package and project manager.
41+
Main libraries used:
4142
- [`simple-salesforce`](https://github.com/simple-salesforce/simple-salesforce) - handling Salesforce API
4243
- [`click`](https://github.com/pallets/click) - working with CLI
4344
- [`PyYaml`](https://github.com/yaml/pyyaml/) - config parsing
@@ -224,9 +225,19 @@ When validation is complete, show statistics.
224225

225226
## HOWTOs
226227

228+
### Download and validate with one command
229+
230+
```shell
231+
docker run --interactive --tty \
232+
--volume /path/to/data/directory:/archivist/data \
233+
--volume /path/to/your.config.yaml:/archivist/config.yaml \
234+
ghcr.io/piotrekkr/salesforce-archivist:latest \
235+
download --validate
236+
```
237+
227238
### Re-download content version list and document link list
228239

229-
You can remove CSV files from disk, and next download will download full lists again from Salesforce.
240+
You can remove CSV files from disk, and the next download will fetch full lists again from Salesforce.
230241
```shell
231242
# for chosen type
232243
rm -rf {data_dir}/{object_type}/*.csv
@@ -240,6 +251,28 @@ rm -rf {data_dir}/*/*.csv
240251
Already calculated checksums for downloaded files are kept in `{data_dir}/validated_versions.csv`.
241252
You can remove this file or selected lines from inside this file. This will trigger full validation again.
242253

254+
### How to remove invalid files and redownload them
255+
256+
Validation can show that the checksums or sizes of some downloaded files do not match the values from Salesforce.
257+
To remove them and download again, you can use the `--remove-invalid` flag. This flag will remove invalid files from
258+
the validated files list and from disk.
259+
260+
```shell
261+
# download, validate and remove invalid files in one command
262+
docker run --interactive --tty \
263+
--volume /path/to/data/directory:/archivist/data \
264+
--volume /path/to/your.config.yaml:/archivist/config.yaml \
265+
ghcr.io/piotrekkr/salesforce-archivist:latest \
266+
download --validate --remove-invalid
267+
268+
# validate already downloaded files and remove the invalid ones
269+
docker run --interactive --tty \
270+
--volume /path/to/data/directory:/archivist/data \
271+
--volume /path/to/your.config.yaml:/archivist/config.yaml \
272+
ghcr.io/piotrekkr/salesforce-archivist:latest \
273+
validate --remove-invalid
274+
```
275+
243276
## Contributing
244277

245278
// TODO

src/salesforce_archivist/archivist.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,11 @@ def _download_attachments(
179179
global_stats.combine(stats)
180180

181181
def _validate_content_versions_download(
182-
self, archivist_obj: ArchivistObject, validated_list: ValidatedList, global_stats: ValidationStats
182+
self,
183+
archivist_obj: ArchivistObject,
184+
validated_list: ValidatedList,
185+
global_stats: ValidationStats,
186+
remove_invalid: bool = False,
183187
) -> bool:
184188
salesforce = Salesforce(
185189
archivist_obj=archivist_obj,
@@ -199,12 +203,17 @@ def _validate_content_versions_download(
199203
download_list=download_list,
200204
validated_list=validated_list,
201205
max_workers=self._max_workers,
206+
remove_invalid=remove_invalid,
202207
)
203208
global_stats.combine(stats)
204209
return stats.invalid == 0
205210

206211
def _validate_attachments_download(
207-
self, archivist_obj: ArchivistObject, validated_list: ValidatedList, global_stats: ValidationStats
212+
self,
213+
archivist_obj: ArchivistObject,
214+
validated_list: ValidatedList,
215+
global_stats: ValidationStats,
216+
remove_invalid: bool = False,
208217
) -> bool:
209218
salesforce = Salesforce(
210219
archivist_obj=archivist_obj,
@@ -220,20 +229,21 @@ def _validate_attachments_download(
220229
download_list=download_list,
221230
validated_list=validated_list,
222231
max_workers=self._max_workers,
232+
remove_invalid=remove_invalid,
223233
)
224234
global_stats.combine(stats)
225235
return stats.invalid == 0
226236

227-
def validate(self) -> bool:
237+
def validate(self, remove_invalid: bool = False) -> bool:
228238
validated_list = ValidatedList(self._data_dir)
229239
if validated_list.data_file_exist():
230240
validated_list.load_data_from_file()
231241
global_stats = ValidationStats()
232242
for archivist_obj in self._objects.values():
233243
if archivist_obj.obj_type == "Attachment":
234-
self._validate_attachments_download(archivist_obj, validated_list, global_stats)
244+
self._validate_attachments_download(archivist_obj, validated_list, global_stats, remove_invalid)
235245
else:
236-
self._validate_content_versions_download(archivist_obj, validated_list, global_stats)
246+
self._validate_content_versions_download(archivist_obj, validated_list, global_stats, remove_invalid)
237247
status = "SUCCESS" if global_stats.invalid == 0 else "FAILED"
238248
color = "green" if global_stats.invalid == 0 else "red"
239249
click.secho(

src/salesforce_archivist/cli.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ def cli(ctx: Context) -> None:
2828

2929
@cli.command()
3030
@click.option("--validate", is_flag=True, default=False, help="Trigger validation after download.")
31+
@click.option("--remove-invalid", is_flag=True, default=False, help="Remove invalid files after validation.")
3132
@click.pass_context
32-
def download(ctx: Context, validate: bool) -> None:
33+
def download(ctx: Context, validate: bool, remove_invalid: bool) -> None:
3334
config: ArchivistConfig = ctx.obj["config"]
3435
sf_client = SalesforceClient(
3536
instance_url=config.auth.instance_url,
@@ -45,13 +46,14 @@ def download(ctx: Context, validate: bool) -> None:
4546
max_api_usage_percent=config.max_api_usage_percent,
4647
max_workers=config.max_workers,
4748
)
48-
if not archivist.download() or validate and not archivist.validate():
49+
if not archivist.download() or validate and not archivist.validate(remove_invalid=remove_invalid):
4950
ctx.exit(code=1)
5051

5152

5253
@cli.command()
54+
@click.option("--remove-invalid", is_flag=True, default=False, help="Remove invalid files after validation.")
5355
@click.pass_context
54-
def validate(ctx: Context) -> None:
56+
def validate(ctx: Context, remove_invalid: bool) -> None:
5557
config: ArchivistConfig = ctx.obj["config"]
5658
sf_client = SalesforceClient(
5759
instance_url=config.auth.instance_url,
@@ -67,7 +69,7 @@ def validate(ctx: Context) -> None:
6769
max_api_usage_percent=config.max_api_usage_percent,
6870
max_workers=config.max_workers,
6971
)
70-
if not archivist.validate():
72+
if not archivist.validate(remove_invalid=remove_invalid):
7173
ctx.exit(code=1)
7274

7375

src/salesforce_archivist/salesforce/salesforce.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,11 +250,11 @@ def validate_download(
250250
download_list: Union[DownloadContentVersionList, DownloadAttachmentList],
251251
validated_list: ValidatedList,
252252
max_workers: int | None = None,
253+
remove_invalid: bool = False,
253254
) -> ValidationStats:
254255
try:
255256
validator = DownloadValidator(
256-
validated_list=validated_list,
257-
max_workers=max_workers,
257+
validated_list=validated_list, max_workers=max_workers, remove_invalid=remove_invalid
258258
)
259259
return validator.validate(download_list=download_list)
260260
finally:

src/salesforce_archivist/salesforce/validation.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ def is_validated(self, path: str) -> bool:
7878
def get(self, path: str) -> ValidatedFile | None:
7979
return self._data.get(path)
8080

81+
def remove(self, path: str) -> bool:
82+
return self._data.pop(path, None) is not None
83+
8184
@property
8285
def path(self) -> str:
8386
return self._path
@@ -122,11 +125,12 @@ def combine(self, other: Self) -> None:
122125

123126

124127
class DownloadValidator:
125-
def __init__(self, validated_list: ValidatedList, max_workers: int | None = None):
128+
def __init__(self, validated_list: ValidatedList, max_workers: int | None = None, remove_invalid: bool = False):
126129
self._validated_list = validated_list
127130
self._stats = ValidationStats()
128131
self._lock = threading.Lock()
129132
self._max_workers = max_workers
133+
self._remove_invalid = remove_invalid
130134

131135
def _print_validated_msg(self, msg: str, invalid: bool = False) -> None:
132136
percent = self._stats.processed / self._stats.total * 100 if self._stats.total > 0 else 0.0
@@ -173,6 +177,9 @@ def _validate_version(self, version: ContentVersion, download_path: str) -> bool
173177
self._validated_list.add(
174178
ValidatedFile(path=download_path, checksum=checksum, content_size=version.content_size)
175179
)
180+
if not valid and self._remove_invalid:
181+
self._validated_list.remove(download_path)
182+
os.remove(download_path)
176183
except Exception as e:
177184
msg = "[ KO ] {id} => Exception: {e}".format(id=version.id, e=e)
178185
valid = False

test/salesforce/test_validation.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def test_validated_list_save():
9999
assert validated_file == loaded_list.get(path=validated_file.path)
100100

101101

102-
def test_validated_list_add_get_version():
102+
def test_validated_list_add_get():
103103
validated_list = ValidatedList(data_dir="/fake/dir")
104104
file_1 = ValidatedFile(checksum="checksum1", path="data/path/file_1.txt", content_size=None)
105105
file_2 = ValidatedFile(checksum=None, path="data/path/file_2.txt", content_size=10)
@@ -108,6 +108,15 @@ def test_validated_list_add_get_version():
108108
assert validated_list.get(path=file_2.path) is None
109109

110110

111+
def test_validated_list_remove():
112+
validated_list = ValidatedList(data_dir="/fake/dir")
113+
file_1 = ValidatedFile(checksum="checksum1", path="data/path/file_1.txt", content_size=None)
114+
validated_list.add(validated_file=file_1)
115+
assert validated_list.remove(path=file_1.path)
116+
assert not validated_list.remove(path="fake/path")
117+
assert len(validated_list) == 0
118+
119+
111120
def test_validated_list_is_downloaded():
112121
validated_list = ValidatedList(data_dir="/fake/dir")
113122
validated_file_1 = ValidatedFile(path="path/file.txt", checksum="checksum", content_size=None)
@@ -339,6 +348,38 @@ def test_download_validator_validate_object_will_calculate_checksum_and_check_ve
339348
assert len(validated_list) == 1
340349

341350

351+
@pytest.mark.parametrize(
352+
"file_data, checksum, should_remove",
353+
[
354+
("test", hashlib.md5("test".encode("utf-8")).hexdigest(), False),
355+
("test1", hashlib.md5("test".encode("utf-8")).hexdigest(), True),
356+
],
357+
)
358+
def test_download_validator_validate_object_will_remove_invalid_version_file_from_disk_and_validated_list(
359+
file_data: str, checksum: str, should_remove
360+
):
361+
with tempfile.TemporaryDirectory() as tmp_dir:
362+
archivist_obj = ArchivistObject(data_dir=tmp_dir, obj_type="User")
363+
download_path = os.path.join(tmp_dir, "file.txt")
364+
with open(download_path, "wb") as file:
365+
file.write(file_data.encode("utf-8"))
366+
367+
version = ContentVersion(
368+
version_id="VID1",
369+
document_id="DID",
370+
checksum=checksum,
371+
extension="ext1",
372+
title="version1",
373+
version_number=1,
374+
content_size=10,
375+
)
376+
validated_list = ValidatedList(data_dir=archivist_obj.obj_dir)
377+
validator = DownloadValidator(validated_list=validated_list, remove_invalid=True)
378+
validator.validate_object(obj=version, download_path=download_path)
379+
assert os.path.exists(download_path) == (not should_remove)
380+
assert len(validated_list) == (0 if should_remove else 1)
381+
382+
342383
@pytest.mark.parametrize(
343384
"file_data, size, should_match",
344385
[

test/test_archivist.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ def test_archivist_validate_will_load_lists_and_call_validate_method(
346346
assert load_version_list_mock.call_count == 2
347347
assert load_attachment_list_mock.call_count == 1
348348
assert validate_mock.mock_calls == [
349-
call(download_list=ANY, validated_list=ANY, max_workers=max_workers),
350-
call(download_list=ANY, validated_list=ANY, max_workers=max_workers),
351-
call(download_list=ANY, validated_list=ANY, max_workers=max_workers),
349+
call(download_list=ANY, validated_list=ANY, max_workers=max_workers, remove_invalid=False),
350+
call(download_list=ANY, validated_list=ANY, max_workers=max_workers, remove_invalid=False),
351+
call(download_list=ANY, validated_list=ANY, max_workers=max_workers, remove_invalid=False),
352352
]

0 commit comments

Comments
 (0)