 import typing
 from enum import Enum
 from tqdm import tqdm
-from hashlib import md5
 import os
 from abc import ABC, abstractmethod

@@ -32,14 +31,13 @@ class DatasetReader(ABC):
     remote_root: str

     @abstractmethod
-    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, check_etag: bool = True):
+    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path):
         """read dataset files from remote_root to local_ds_root,

         Args:
             dataset(str): for instance "sift_small_500k"
             files(list[str]): all filenames of the dataset
             local_ds_root(pathlib.Path): local directory to write the remote data to.
-            check_etag(bool): whether to check the etag
         """
         pass

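For orientation, a minimal usage sketch of the trimmed-down read() contract; the subclass name and file list below are illustrative assumptions, only the dataset name comes from the docstring above:

import pathlib

reader = AliyunOSSReader()  # hypothetical name for the OSS-backed reader shown below
reader.read(
    dataset="sift_small_500k",                # example name from the docstring
    files=["train.parquet", "test.parquet"],  # hypothetical dataset files
    local_ds_root=pathlib.Path("/tmp/datasets/sift_small_500k"),
)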
@@ -56,7 +54,7 @@ def __init__(self):
         import oss2
         self.bucket = oss2.Bucket(oss2.AnonymousAuth(), self.remote_root, "benchmark", True)

-    def validate_file(self, remote: pathlib.Path, local: pathlib.Path, check_etag: bool) -> bool:
+    def validate_file(self, remote: pathlib.Path, local: pathlib.Path) -> bool:
         info = self.bucket.get_object_meta(remote.as_posix())

         # check size equal
@@ -65,13 +63,9 @@ def validate_file(self, remote: pathlib.Path, local: pathlib.Path, check_etag: b
             log.info(f"local file: {local} size[{local_size}] not match with remote size[{remote_size}]")
             return False

-        # check etag equal
-        if check_etag:
-            return match_etag(info.etag.strip('"').lower(), local)
-
         return True

-    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, check_etag: bool = False):
+    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path):
         downloads = []
         if not local_ds_root.exists():
             log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
@@ -83,8 +77,7 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
             remote_file = pathlib.PurePosixPath("benchmark", dataset, file)
             local_file = local_ds_root.joinpath(file)

-            # Don't check etags for Dataset from Aliyun OSS
-            if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, False)):
+            if (not local_file.exists()) or (not self.validate_file(remote_file, local_file)):
                 log.info(f"local file: {local_file} not match with remote: {remote_file}; add to downloading list")
                 downloads.append((remote_file, local_file))

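The loop that drains downloads sits outside this hunk; a hedged sketch of what consuming the queued (remote_file, local_file) pairs with the anonymous bucket from __init__ might look like, where the endpoint, key, and paths are placeholders and get_object_to_file is the standard oss2 download-to-disk call:

import pathlib
import oss2
from tqdm import tqdm

remote_root = "assets.example.com"  # placeholder; the real endpoint is the reader's class attribute
bucket = oss2.Bucket(oss2.AnonymousAuth(), remote_root, "benchmark", True)
downloads = [(pathlib.PurePosixPath("benchmark/sift_small_500k/train.parquet"),  # hypothetical pair,
              pathlib.Path("/tmp/datasets/sift_small_500k/train.parquet"))]      # mirrors the tuples queued above
for remote_file, local_file in tqdm(downloads):
    # fetch each missing or size-mismatched file to its local destination
    bucket.get_object_to_file(remote_file.as_posix(), local_file.as_posix())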
@@ -120,7 +113,7 @@ def ls_all(self, dataset: str):
         return names


-    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, check_etag: bool = True):
+    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path):
         downloads = []
         if not local_ds_root.exists():
             log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
@@ -132,7 +125,7 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
             remote_file = pathlib.PurePosixPath(self.remote_root, dataset, file)
             local_file = local_ds_root.joinpath(file)

-            if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)):
+            if (not local_file.exists()) or (not self.validate_file(remote_file, local_file)):
                 log.info(f"local file: {local_file} not match with remote: {remote_file}; add to downloading list")
                 downloads.append(remote_file)

@@ -147,7 +140,7 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
         log.info(f"Succeed to download all files, downloaded file count = {len(downloads)}")


-    def validate_file(self, remote: pathlib.Path, local: pathlib.Path, check_etag: bool) -> bool:
+    def validate_file(self, remote: pathlib.Path, local: pathlib.Path) -> bool:
         # info() uses ls() inside, maybe we only need to ls once
         info = self.fs.info(remote)
153146
@@ -157,48 +150,4 @@ def validate_file(self, remote: pathlib.Path, local: pathlib.Path, check_etag: b
             log.info(f"local file: {local} size[{local_size}] not match with remote size[{remote_size}]")
             return False

-        # check etag equal
-        if check_etag:
-            return match_etag(info.get('ETag', "").strip('"'), local)
-
         return True
-
-
-def match_etag(expected_etag: str, local_file) -> bool:
168- """Check if local files' etag match with S3"""
-    def factor_of_1MB(filesize, num_parts):
-        x = filesize / int(num_parts)
-        y = x % 1048576
-        return int(x + 1048576 - y)
-
-    def calc_etag(inputfile, partsize):
-        md5_digests = []
-        with open(inputfile, 'rb') as f:
-            for chunk in iter(lambda: f.read(partsize), b''):
-                md5_digests.append(md5(chunk).digest())
-        return md5(b''.join(md5_digests)).hexdigest() + '-' + str(len(md5_digests))
-
-    def possible_partsizes(filesize, num_parts):
-        return lambda partsize: partsize < filesize and (float(filesize) / float(partsize)) <= num_parts
-
-    filesize = os.path.getsize(local_file)
-    le = ""
-    if '-' not in expected_etag:  # not a multipart upload
-        with open(local_file, 'rb') as f:
-            le = md5(f.read()).hexdigest()
-        log.debug(f"calculated local etag {le}, expected etag: {expected_etag}")
-        return expected_etag == le
-    else:
-        num_parts = int(expected_etag.split('-')[-1])
-        partsizes = [  # Default Partsizes Map
-            8388608,   # aws_cli/boto3
-            15728640,  # s3cmd
-            factor_of_1MB(filesize, num_parts)  # Used by many clients to upload large files
-        ]
-
-        for partsize in filter(possible_partsizes(filesize, num_parts), partsizes):
-            le = calc_etag(local_file, partsize)
-            log.debug(f"calculated local etag {le}, expected etag: {expected_etag}")
-            if expected_etag == le:
-                return True
-        return False
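For context on the helper removed above: an S3/OSS multipart ETag is the md5 of the concatenated per-part md5 digests with a part-count suffix, so a whole-file md5 only matches single-part uploads. A minimal sketch of that calculation for a fixed 8 MiB part size (the file path is hypothetical):

from hashlib import md5

def multipart_etag(path: str, partsize: int = 8 * 1024 * 1024) -> str:
    """md5 of the parts' md5 digests, suffixed with the part count, e.g. "<hex>-3"."""
    digests = []
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(partsize), b""):
            digests.append(md5(chunk).digest())
    return md5(b"".join(digests)).hexdigest() + "-" + str(len(digests))

print(multipart_etag("/tmp/datasets/sift_small_500k/train.parquet"))  # hypothetical path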