
Commit 43302a9

XuanYang-cn authored and alwayslove2013 committed

enhance: Remove etag checks

Etag checks are costly; checking the file size, file name, and file count is enough.

Signed-off-by: yangxuan <[email protected]>
1 parent 22123d0 commit 43302a9
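
The rationale can be illustrated with a rough comparison (a minimal sketch, not part of this commit): validating an S3/OSS etag forces the whole local file to be read and hashed, while the size check that remains only needs a local stat() plus one metadata request.

import hashlib
import os

def size_check(local_path: str, remote_size: int) -> bool:
    # cheap: a single os.stat() on the local file
    return os.path.getsize(local_path) == remote_size

def etag_check(local_path: str, remote_etag: str) -> bool:
    # costly: reads and hashes the entire file (single-part upload case)
    digest = hashlib.md5()
    with open(local_path, "rb") as f:
        for chunk in iter(lambda: f.read(8 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == remote_etag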

File tree

4 files changed: +11 -67 lines changed


tests/test_data_source.py

Lines changed: 2 additions & 2 deletions
@@ -19,10 +19,10 @@ def per_case_test(self, type_case):
         log.info(f"test case: {t.name}, {ca.name}")

         filters = ca.filter_rate
-        ca.dataset.prepare(source=DatasetSource.AliyunOSS, check=False, filters=filters)
+        ca.dataset.prepare(source=DatasetSource.AliyunOSS, filters=filters)
         ali_trains = ca.dataset.train_files

-        ca.dataset.prepare(check=False, filters=filters)
+        ca.dataset.prepare(filters=filters)
         s3_trains = ca.dataset.train_files

         assert ali_trains == s3_trains

tests/test_dataset.py

Lines changed: 2 additions & 4 deletions
@@ -26,7 +26,7 @@ def test_cohere_error(self):

     def test_iter_cohere(self):
         cohere_10m = Dataset.COHERE.manager(10_000_000)
-        cohere_10m.prepare(check=False)
+        cohere_10m.prepare()

         import time
         before = time.time()
@@ -40,7 +40,7 @@ def test_iter_cohere(self):
     def test_iter_laion(self):
         laion_100m = Dataset.LAION.manager(100_000_000)
         from vectordb_bench.backend.data_source import DatasetSource
-        laion_100m.prepare(source=DatasetSource.AliyunOSS, check=False)
+        laion_100m.prepare(source=DatasetSource.AliyunOSS)

         import time
         before = time.time()
@@ -66,14 +66,12 @@ def test_download_small(self):
             openai_50k.data.dir_name.lower(),
             files=files,
             local_ds_root=openai_50k.data_dir,
-            check_etag=False,
         )

         os.remove(file_path)
         DatasetSource.AliyunOSS.reader().read(
             openai_50k.data.dir_name.lower(),
             files=files,
             local_ds_root=openai_50k.data_dir,
-            check_etag=False,
         )

vectordb_bench/backend/data_source.py

Lines changed: 7 additions & 58 deletions
@@ -3,7 +3,6 @@
 import typing
 from enum import Enum
 from tqdm import tqdm
-from hashlib import md5
 import os
 from abc import ABC, abstractmethod

@@ -32,14 +31,13 @@ class DatasetReader(ABC):
     remote_root: str

     @abstractmethod
-    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, check_etag: bool = True):
+    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path):
         """read dataset files from remote_root to local_ds_root,

         Args:
             dataset(str): for instance "sift_small_500k"
             files(list[str]): all filenames of the dataset
             local_ds_root(pathlib.Path): whether to write the remote data.
-            check_etag(bool): whether to check the etag
         """
         pass

@@ -56,7 +54,7 @@ def __init__(self):
         import oss2
         self.bucket = oss2.Bucket(oss2.AnonymousAuth(), self.remote_root, "benchmark", True)

-    def validate_file(self, remote: pathlib.Path, local: pathlib.Path, check_etag: bool) -> bool:
+    def validate_file(self, remote: pathlib.Path, local: pathlib.Path) -> bool:
         info = self.bucket.get_object_meta(remote.as_posix())

         # check size equal
@@ -65,13 +63,9 @@ def validate_file(self, remote: pathlib.Path, local: pathlib.Path, check_etag: b
             log.info(f"local file: {local} size[{local_size}] not match with remote size[{remote_size}]")
             return False

-        # check etag equal
-        if check_etag:
-            return match_etag(info.etag.strip('"').lower(), local)
-
         return True

-    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, check_etag: bool = False):
+    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path):
         downloads = []
         if not local_ds_root.exists():
             log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
@@ -83,8 +77,7 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
             remote_file = pathlib.PurePosixPath("benchmark", dataset, file)
             local_file = local_ds_root.joinpath(file)

-            # Don't check etags for Dataset from Aliyun OSS
-            if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, False)):
+            if (not local_file.exists()) or (not self.validate_file(remote_file, local_file)):
                 log.info(f"local file: {local_file} not match with remote: {remote_file}; add to downloading list")
                 downloads.append((remote_file, local_file))

@@ -120,7 +113,7 @@ def ls_all(self, dataset: str):
         return names


-    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, check_etag: bool = True):
+    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path):
         downloads = []
         if not local_ds_root.exists():
             log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
@@ -132,7 +125,7 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
             remote_file = pathlib.PurePosixPath(self.remote_root, dataset, file)
             local_file = local_ds_root.joinpath(file)

-            if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)):
+            if (not local_file.exists()) or (not self.validate_file(remote_file, local_file)):
                 log.info(f"local file: {local_file} not match with remote: {remote_file}; add to downloading list")
                 downloads.append(remote_file)

@@ -147,7 +140,7 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
         log.info(f"Succeed to download all files, downloaded file count = {len(downloads)}")


-    def validate_file(self, remote: pathlib.Path, local: pathlib.Path, check_etag: bool) -> bool:
+    def validate_file(self, remote: pathlib.Path, local: pathlib.Path) -> bool:
         # info() uses ls() inside, maybe we only need to ls once
         info = self.fs.info(remote)

@@ -157,48 +150,4 @@ def validate_file(self, remote: pathlib.Path, local: pathlib.Path, check_etag: b
             log.info(f"local file: {local} size[{local_size}] not match with remote size[{remote_size}]")
             return False

-        # check etag equal
-        if check_etag:
-            return match_etag(info.get('ETag', "").strip('"'), local)
-
         return True
-
-
-def match_etag(expected_etag: str, local_file) -> bool:
-    """Check if local files' etag match with S3"""
-    def factor_of_1MB(filesize, num_parts):
-        x = filesize / int(num_parts)
-        y = x % 1048576
-        return int(x + 1048576 - y)
-
-    def calc_etag(inputfile, partsize):
-        md5_digests = []
-        with open(inputfile, 'rb') as f:
-            for chunk in iter(lambda: f.read(partsize), b''):
-                md5_digests.append(md5(chunk).digest())
-        return md5(b''.join(md5_digests)).hexdigest() + '-' + str(len(md5_digests))
-
-    def possible_partsizes(filesize, num_parts):
-        return lambda partsize: partsize < filesize and (float(filesize) / float(partsize)) <= num_parts
-
-    filesize = os.path.getsize(local_file)
-    le = ""
-    if '-' not in expected_etag: # no spliting uploading
-        with open(local_file, 'rb') as f:
-            le = md5(f.read()).hexdigest()
-        log.debug(f"calculated local etag {le}, expected etag: {expected_etag}")
-        return expected_etag == le
-    else:
-        num_parts = int(expected_etag.split('-')[-1])
-        partsizes = [ ## Default Partsizes Map
-            8388608, # aws_cli/boto3
-            15728640, # s3cmd
-            factor_of_1MB(filesize, num_parts) # Used by many clients to upload large files
-        ]
-
-        for partsize in filter(possible_partsizes(filesize, num_parts), partsizes):
-            le = calc_etag(local_file, partsize)
-            log.debug(f"calculated local etag {le}, expected etag: {expected_etag}")
-            if expected_etag == le:
-                return True
-        return False
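
For reference, a hypothetical call to the simplified reader interface after this change; the dataset name, file list, and local root below are illustrative, not taken from the commit.

import pathlib
from vectordb_bench.backend.data_source import DatasetSource

reader = DatasetSource.AliyunOSS.reader()
reader.read(
    dataset="openai_small_50k",                       # assumed dataset directory name
    files=["train.parquet", "test.parquet"],          # assumed file names
    local_ds_root=pathlib.Path("/tmp/vectordb_bench/dataset"),  # assumed local root
)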

vectordb_bench/backend/dataset.py

Lines changed: 0 additions & 3 deletions
@@ -162,15 +162,13 @@ def __iter__(self):
     # TODO passing use_shuffle from outside
     def prepare(self,
                 source: DatasetSource=DatasetSource.S3,
-                check: bool=True,
                 filters: int | float | str | None = None,
                 ) -> bool:
         """Download the dataset from DatasetSource
         url = f"{source}/{self.data.dir_name}"

         Args:
             source(DatasetSource): S3 or AliyunOSS, default as S3
-            check(bool): Whether to do etags check, default as ture
             filters(Optional[int | float | str]): combined with dataset's with_gt to
                 compose the correct ground_truth file

@@ -192,7 +190,6 @@ def prepare(self,
             dataset=self.data.dir_name.lower(),
             files=all_files,
             local_ds_root=self.data_dir,
-            check_etag=check,
         )

         if gt_file is not None and test_file is not None:
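
A minimal sketch of the caller-side change, mirroring the tests above (import paths are assumed from the repository layout): the check keyword disappears from prepare(), and downloads are still skipped when the local size, file name, and file count already match the remote.

from vectordb_bench.backend.dataset import Dataset
from vectordb_bench.backend.data_source import DatasetSource

cohere_10m = Dataset.COHERE.manager(10_000_000)
cohere_10m.prepare()                                # defaults to DatasetSource.S3
cohere_10m.prepare(source=DatasetSource.AliyunOSS)  # or fetch from Aliyun OSS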
