Skip to content

Commit 08f3e03

Browse files
committed
Allow user to specify encoding
The code always hardcoded UTF-8 as the file-name encoding, which produced wrong results for Shift-JIS (Japanese) file names. I have therefore added an optional encoding argument: by default it keeps the previous behavior (UTF-8 for ISO 9660 and Rock Ridge paths, UTF-16BE for Joliet paths), but it can be set to any other value when the file names on the ISO are not encoded in UTF-8.
1 parent 2732b6b commit 08f3e03

File tree

1 file changed

+48
-36
lines changed

1 file changed

+48
-36
lines changed

pycdlib/pycdlib.py

Lines changed: 48 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -482,7 +482,6 @@ def _find_dr_record_by_name(vd, path, encoding):
482482
return root_dir_record
483483

484484
splitpath = utils.split_path(path)
485-
486485
currpath = splitpath.pop(0).decode('utf-8').encode(encoding)
487486

488487
entry = root_dir_record
@@ -505,7 +504,6 @@ def _find_dr_record_by_name(vd, path, encoding):
505504
index = lo
506505
if index != len(thelist) and thelist[index].file_ident == currpath:
507506
child = thelist[index]
508-
509507
if child is None:
510508
# We failed to find this component of the path, so break out of the
511509
# loop and fail.
@@ -520,7 +518,6 @@ def _find_dr_record_by_name(vd, path, encoding):
520518
# We found the last child we are looking for; return it.
521519
if not splitpath:
522520
return child
523-
524521
if not child.is_dir():
525522
break
526523
entry = child
@@ -705,8 +702,8 @@ def _seek_to_extent(self, extent):
705702
self._cdfp.seek(extent * self.logical_block_size)
706703

707704
@functools.lru_cache(maxsize=256)
708-
def _find_iso_record(self, iso_path):
709-
# type: (bytes) -> dr.DirectoryRecord
705+
def _find_iso_record(self, iso_path, encoding='utf-8'):
706+
# type: (bytes, str) -> dr.DirectoryRecord
710707
"""
711708
An internal method to find a directory record on the ISO given an ISO
712709
path. If the entry is found, it returns the directory record object
@@ -718,11 +715,11 @@ def _find_iso_record(self, iso_path):
718715
Returns:
719716
The directory record entry representing the entry on the ISO.
720717
"""
721-
return _find_dr_record_by_name(self.pvd, iso_path, 'utf-8')
718+
return _find_dr_record_by_name(self.pvd, iso_path, encoding)
722719

723720
@functools.lru_cache(maxsize=256)
724-
def _find_rr_record(self, rr_path):
725-
# type: (bytes) -> dr.DirectoryRecord
721+
def _find_rr_record(self, rr_path, encoding='utf-8'):
722+
# type: (bytes, str) -> dr.DirectoryRecord
726723
"""
727724
An internal method to find a directory record on the ISO given a Rock
728725
Ridge path. If the entry is found, it returns the directory record
@@ -742,7 +739,7 @@ def _find_rr_record(self, rr_path):
742739

743740
splitpath = utils.split_path(rr_path)
744741

745-
currpath = splitpath.pop(0).decode('utf-8').encode('utf-8')
742+
currpath = splitpath.pop(0).decode('utf-8').encode(encoding)
746743

747744
entry = root_dir_record
748745

@@ -793,13 +790,13 @@ def _find_rr_record(self, rr_path):
793790
if not child.is_dir():
794791
break
795792
entry = child
796-
currpath = splitpath.pop(0).decode('utf-8').encode('utf-8')
793+
currpath = splitpath.pop(0).decode('utf-8').encode(encoding)
797794

798795
raise pycdlibexception.PyCdlibInvalidInput('Could not find path')
799796

800797
@functools.lru_cache(maxsize=256)
801-
def _find_joliet_record(self, joliet_path):
802-
# type: (bytes) -> dr.DirectoryRecord
798+
def _find_joliet_record(self, joliet_path, encoding='utf-16_be'):
799+
# type: (bytes, str) -> dr.DirectoryRecord
803800
"""
804801
An internal method to find a directory record on the ISO given a Joliet
805802
path. If the entry is found, it returns the directory record object
@@ -813,7 +810,7 @@ def _find_joliet_record(self, joliet_path):
813810
"""
814811
if self.joliet_vd is None:
815812
raise pycdlibexception.PyCdlibInternalError('Joliet path requested on non-Joliet ISO')
816-
return _find_dr_record_by_name(self.joliet_vd, joliet_path, 'utf-16_be')
813+
return _find_dr_record_by_name(self.joliet_vd, joliet_path, encoding)
817814

818815
@functools.lru_cache(maxsize=256)
819816
def _find_udf_record(self, udf_path):
@@ -2412,8 +2409,8 @@ def _udf_get_file_from_iso_fp(self, outfp, blocksize, udf_path):
24122409
utils.copy_data(data_len, blocksize, data_fp, outfp)
24132410

24142411
def _get_file_from_iso_fp(self, outfp, blocksize, iso_path, rr_path,
2415-
joliet_path):
2416-
# type: (BinaryIO, int, Optional[bytes], Optional[bytes], Optional[bytes]) -> None
2412+
joliet_path, encoding=None):
2413+
# type: (BinaryIO, int, Optional[bytes], Optional[bytes], Optional[bytes], str) -> None
24172414
"""
24182415
An internal method to fetch a single file from the ISO and write it out
24192416
to the file object.
@@ -2433,13 +2430,16 @@ def _get_file_from_iso_fp(self, outfp, blocksize, iso_path, rr_path,
24332430
if joliet_path is not None:
24342431
if self.joliet_vd is None:
24352432
raise pycdlibexception.PyCdlibInvalidInput('Cannot fetch a joliet_path from a non-Joliet ISO')
2436-
found_record = self._find_joliet_record(joliet_path)
2433+
encoding = encoding or 'utf-16_be'
2434+
found_record = self._find_joliet_record(joliet_path, encoding)
24372435
elif rr_path is not None:
24382436
if not self.rock_ridge:
24392437
raise pycdlibexception.PyCdlibInvalidInput('Cannot fetch a rr_path from a non-Rock Ridge ISO')
2440-
found_record = self._find_rr_record(rr_path)
2438+
encoding = encoding or 'utf-8'
2439+
found_record = self._find_rr_record(rr_path, encoding)
24412440
elif iso_path is not None:
2442-
found_record = self._find_iso_record(iso_path)
2441+
encoding = encoding or 'utf-8'
2442+
found_record = self._find_iso_record(iso_path, encoding)
24432443
else:
24442444
raise pycdlibexception.PyCdlibInternalError('Invalid path passed to get_file_from_iso_fp')
24452445

@@ -3471,8 +3471,8 @@ def _rm_joliet_dir(self, joliet_path):
34713471

34723472
return num_bytes_to_remove
34733473

3474-
def _get_iso_entry(self, iso_path):
3475-
# type: (bytes) -> dr.DirectoryRecord
3474+
def _get_iso_entry(self, iso_path, encoding='utf-8'):
3475+
# type: (bytes, str) -> dr.DirectoryRecord
34763476
"""
34773477
Internal method to get the directory record for an ISO path.
34783478
@@ -3484,10 +3484,10 @@ def _get_iso_entry(self, iso_path):
34843484
if self._needs_reshuffle:
34853485
self._reshuffle_extents()
34863486

3487-
return self._find_iso_record(iso_path)
3487+
return self._find_iso_record(iso_path, encoding)
34883488

3489-
def _get_rr_entry(self, rr_path):
3490-
# type: (bytes) -> dr.DirectoryRecord
3489+
def _get_rr_entry(self, rr_path, encoding='utf-8'):
3490+
# type: (bytes, str) -> dr.DirectoryRecord
34913491
"""
34923492
Internal method to get the directory record for a Rock Ridge path.
34933493
@@ -3500,10 +3500,10 @@ def _get_rr_entry(self, rr_path):
35003500
if self._needs_reshuffle:
35013501
self._reshuffle_extents()
35023502

3503-
return self._find_rr_record(rr_path)
3503+
return self._find_rr_record(rr_path, encoding)
35043504

3505-
def _get_joliet_entry(self, joliet_path):
3506-
# type: (bytes) -> dr.DirectoryRecord
3505+
def _get_joliet_entry(self, joliet_path, encoding='utf-16_be'):
3506+
# type: (bytes, str) -> dr.DirectoryRecord
35073507
"""
35083508
Internal method to get the directory record for a Joliet path.
35093509
@@ -3516,7 +3516,7 @@ def _get_joliet_entry(self, joliet_path):
35163516
if self._needs_reshuffle:
35173517
self._reshuffle_extents()
35183518

3519-
return self._find_joliet_record(joliet_path)
3519+
return self._find_joliet_record(joliet_path, encoding)
35203520

35213521
def _get_udf_entry(self, udf_path):
35223522
# type: (str) -> udfmod.UDFFileEntry
@@ -4183,6 +4183,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
41834183
iso_path = None
41844184
rr_path = None
41854185
udf_path = None
4186+
encoding = None
41864187
num_paths = 0
41874188
for key, value in kwargs.items():
41884189
if key == 'blocksize':
@@ -4213,6 +4214,8 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
42134214
num_paths += 1
42144215
elif value is not None:
42154216
raise pycdlibexception.PyCdlibInvalidInput('udf_path must be a string')
4217+
elif key == 'encoding':
4218+
encoding = value
42164219
else:
42174220
raise pycdlibexception.PyCdlibInvalidInput('Unknown keyword %s' % (key))
42184221

@@ -4223,7 +4226,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
42234226
self._udf_get_file_from_iso_fp(outfp, blocksize, udf_path)
42244227
else:
42254228
self._get_file_from_iso_fp(outfp, blocksize, iso_path, rr_path,
4226-
joliet_path)
4229+
joliet_path, encoding)
42274230

42284231
def get_and_write(self, iso_path, local_path, blocksize=8192):
42294232
# type: (str, str, int) -> None
@@ -5459,6 +5462,8 @@ def list_children(self, **kwargs):
54595462
if key in ('joliet_path', 'rr_path', 'iso_path', 'udf_path'):
54605463
if value is not None:
54615464
num_paths += 1
5465+
elif key in ('encoding'):
5466+
continue
54625467
else:
54635468
raise pycdlibexception.PyCdlibInvalidInput("Invalid keyword, must be one of 'iso_path', 'rr_path', 'joliet_path', or 'udf_path'")
54645469

@@ -5476,12 +5481,15 @@ def list_children(self, **kwargs):
54765481
else:
54775482
use_rr = False
54785483
if 'joliet_path' in kwargs:
5479-
rec = self._get_joliet_entry(self._normalize_joliet_path(kwargs['joliet_path']))
5484+
kwargs['encoding'] = kwargs.get('encoding', None) or 'utf-16_be'
5485+
rec = self._get_joliet_entry(self._normalize_joliet_path(kwargs['joliet_path']), kwargs['encoding'])
54805486
elif 'rr_path' in kwargs:
5481-
rec = self._get_rr_entry(utils.normpath(kwargs['rr_path']))
5487+
kwargs['encoding'] = kwargs.get('encoding', None) or 'utf-8'
5488+
rec = self._get_rr_entry(utils.normpath(kwargs['rr_path']), kwargs['encoding'])
54825489
use_rr = True
54835490
else:
5484-
rec = self._get_iso_entry(utils.normpath(kwargs['iso_path']))
5491+
kwargs['encoding'] = kwargs.get('encoding', None) or 'utf-8'
5492+
rec = self._get_iso_entry(utils.normpath(kwargs['iso_path']), kwargs['encoding'])
54855493

54865494
for c in _yield_children(rec, use_rr):
54875495
yield c
@@ -5626,8 +5634,8 @@ def rm_isohybrid(self):
56265634

56275635
self.isohybrid_mbr = None
56285636

5629-
def full_path_from_dirrecord(self, rec, rockridge=False):
5630-
# type: (Union[dr.DirectoryRecord, udfmod.UDFFileEntry], bool) -> str
5637+
def full_path_from_dirrecord(self, rec, rockridge=False, user_encoding=None):
5638+
# type: (Union[dr.DirectoryRecord, udfmod.UDFFileEntry], bool, str) -> str
56315639
"""
56325640
Get the absolute path of a directory record.
56335641
@@ -5646,6 +5654,8 @@ def full_path_from_dirrecord(self, rec, rockridge=False):
56465654
if self.joliet_vd is not None and id(rec.vd) == id(self.joliet_vd):
56475655
encoding = 'utf-16_be'
56485656

5657+
if user_encoding:
5658+
encoding = user_encoding
56495659
# A root entry has no Rock Ridge entry, even on a Rock Ridge ISO.
56505660
# Always return / here.
56515661
if rec.is_root:
@@ -5685,6 +5695,8 @@ def full_path_from_dirrecord(self, rec, rockridge=False):
56855695
encoding = rec.file_ident.encoding
56865696
else:
56875697
encoding = 'utf-8'
5698+
if user_encoding:
5699+
encoding = user_encoding
56885700
udf_rec = rec # type: Optional[udfmod.UDFFileEntry]
56895701
while udf_rec is not None:
56905702
ident = udf_rec.file_identifier()
@@ -5893,13 +5905,13 @@ def walk(self, **kwargs):
58935905
while dirs:
58945906
dir_record = dirs.popleft()
58955907

5896-
relpath = self.full_path_from_dirrecord(dir_record,
5897-
rockridge=path_type == 'rr_path')
5908+
relpath = self.full_path_from_dirrecord(dir_record, rockridge=path_type == 'rr_path',
5909+
user_encoding=user_encoding)
58985910
dirlist = []
58995911
filelist = []
59005912
dirdict = {}
59015913

5902-
for child in reversed(list(self.list_children(**{path_type: relpath}))):
5914+
for child in reversed(list(self.list_children(**{path_type: relpath, 'encoding': kwargs.get('encoding', None)}))):
59035915
if child is None or child.is_dot() or child.is_dotdot():
59045916
continue
59055917

0 commit comments

Comments
 (0)