Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 87 additions & 17 deletions Lib/test/test_zipfile/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1440,15 +1440,7 @@ class ZstdWriterTests(AbstractWriterTests, unittest.TestCase):

def comparable_zinfo(zinfo):
"""Return a dict of public ZipInfo attributes for assertEqual comparison."""
attrs = {k: getattr(zinfo, k) for k in _ZINFO_PUBLIC_KEYS}

# Since patch gh-84353, the _MASK_UTF_FILENAME (0x800) bit may be
# changed when writing to the end record depending on whether filename
# can be encoded with ascii or cp437. Skip checking this bit by
# pretending it's always set.
attrs['flag_bits'] |= 0x800

return attrs
return {k: getattr(zinfo, k) for k in _ZINFO_PUBLIC_KEYS}

_struct_pack = struct.pack

Expand Down Expand Up @@ -5710,15 +5702,27 @@ def setUp(self):
with open(TESTFN, "wb") as tf:
tf.write(data)

def _test_read(self, zipfp, expected_names, expected_content):
def _test_read(self, zipfp, expected_names, expected_content,
expected_comments=None, expected_efs_flags=None):
# Check the namelist
names = zipfp.namelist()
self.assertEqual(sorted(names), sorted(expected_names))
self.assertEqual(names, expected_names)

# Check infolist
infos = zipfp.infolist()
names = [zi.filename for zi in infos]
self.assertEqual(sorted(names), sorted(expected_names))
self.assertEqual(names, expected_names)

if expected_comments is not None:
comments = [zi.comment for zi in infos]
self.assertEqual(comments, expected_comments)

if expected_efs_flags is not None:
efs_flags = [
bool(zi.flag_bits & zipfile._MASK_UTF_FILENAME)
for zi in infos
]
self.assertEqual(efs_flags, expected_efs_flags)

# check getinfo
for name, content in zip(expected_names, expected_content):
Expand All @@ -5731,20 +5735,26 @@ def test_read_with_metadata_encoding(self):
# Read the ZIP archive with correct metadata_encoding
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
self._test_read(zipfp, self.file_names, self.file_content)
with zipfile.ZipFile(TESTFN, "a", metadata_encoding='shift_jis') as zipfp:
self._test_read(zipfp, self.file_names, self.file_content)

def test_read_without_metadata_encoding(self):
    """Open the archive with no metadata_encoding and check the names read."""
    # The first two names are expected as their Shift-JIS bytes
    # reinterpreted as cp437; the remaining names are expected unchanged.
    mojibake_names = [
        name.encode('shift_jis').decode('cp437')
        for name in self.file_names[:2]
    ]
    expected_names = mojibake_names + self.file_names[2:]

    # Read mode first, then append mode: both must list the same entries.
    for mode in ("r", "a"):
        with zipfile.ZipFile(TESTFN, mode) as zipfp:
            self._test_read(zipfp, expected_names, self.file_content)

def test_read_with_incorrect_metadata_encoding(self):
    """Open the archive with a wrong metadata_encoding and check the names."""
    # The first two names are expected as their Shift-JIS bytes
    # reinterpreted as koi8-u; the remaining names are expected unchanged.
    misdecoded_names = [
        name.encode('shift_jis').decode('koi8-u')
        for name in self.file_names[:2]
    ]
    expected_names = misdecoded_names + self.file_names[2:]

    # Read mode first, then append mode: both must list the same entries.
    for mode in ("r", "a"):
        with zipfile.ZipFile(TESTFN, mode, metadata_encoding='koi8-u') as zipfp:
            self._test_read(zipfp, expected_names, self.file_content)

def test_read_with_unsuitable_metadata_encoding(self):
# Read the ZIP archive with metadata_encoding unsuitable for
Expand All @@ -5753,6 +5763,10 @@ def test_read_with_unsuitable_metadata_encoding(self):
zipfile.ZipFile(TESTFN, "r", metadata_encoding='ascii')
with self.assertRaises(UnicodeDecodeError):
zipfile.ZipFile(TESTFN, "r", metadata_encoding='utf-8')
with self.assertRaises(UnicodeDecodeError):
zipfile.ZipFile(TESTFN, "a", metadata_encoding='ascii')
with self.assertRaises(UnicodeDecodeError):
zipfile.ZipFile(TESTFN, "a", metadata_encoding='utf-8')

def test_read_after_append(self):
newname = '\u56db' # Han 'four'
Expand All @@ -5766,20 +5780,76 @@ def test_read_after_append(self):
with zipfile.ZipFile(TESTFN, "a") as zipfp:
zipfp.writestr(newname, "newcontent")
zipfp.writestr(newname2, "newcontent2")
self.assertEqual(sorted(zipfp.namelist()), sorted(mojibake_expected_names))
self.assertEqual(zipfp.namelist(), mojibake_expected_names)

with zipfile.ZipFile(TESTFN, "r") as zipfp:
self._test_read(zipfp, mojibake_expected_names, expected_content)
with zipfile.ZipFile(TESTFN, "a") as zipfp:
self._test_read(zipfp, mojibake_expected_names, expected_content)

with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
self._test_read(zipfp, expected_names, expected_content)
with zipfile.ZipFile(TESTFN, "a", metadata_encoding='shift_jis') as zipfp:
self._test_read(zipfp, expected_names, expected_content)

def test_append_keep_efs_flag(self):
    """Files loaded from an archive should keep original EFS flags when
    rewritten to central directory in append mode."""
    names = ['file1', 'file2', 'file3', 'file4']
    contents = [b'content1', b'content2', b'content3', b'content4']
    comments = ['\u4e00'.encode('utf-8'), b'foo', '\u4e8c'.encode('shift_jis'), b'bar']
    efs_flags = [True, True, False, False]
    # Look the wanted flag up by entry name so the mock does not depend on
    # loop variables of the enclosing scope at the moment it is called.
    efs_by_name = dict(zip(names, efs_flags))

    def mock_encode(self):
        # Stand-in for ZipInfo._encodeFilenameFlags: set the EFS bit on the
        # entry being encoded according to the table above, and always
        # emit the (ASCII) filename as-is.
        if efs_by_name[self.filename]:
            self.flag_bits |= zipfile._MASK_UTF_FILENAME
        return (self.filename.encode('ascii'), self.flag_bits)

    # Build the fixture archive with the patched encoder so each entry gets
    # the prescribed EFS flag regardless of its filename/comment content.
    with mock.patch('zipfile.ZipInfo._encodeFilenameFlags', mock_encode), \
            zipfile.ZipFile(TESTFN, "w") as zipfp:
        for name, content, comment in zip(names, contents, comments):
            zinfo = zipfile.ZipInfo(name)
            zinfo.comment = comment
            zipfp.writestr(zinfo, content)

    with zipfile.ZipFile(TESTFN, "a") as zipfp:
        # trigger archive rewriting
        zipfp.comment = b'comment'

    # Re-read with the real encoder: comments and EFS flags must match what
    # was originally written.
    with zipfile.ZipFile(TESTFN, "r") as zipfp:
        self.assertEqual(zipfp.comment, b'comment')
        self._test_read(zipfp, names, contents, comments, efs_flags)

def test_write_enforce_efs_flag(self):
    """New files should enforce EFS flag if filename or comment is not ASCII."""
    names = ['\u4e00', '\u4e8c', 'file3', 'file4']
    contents = [b'content1', b'content2', b'content3', b'content4']
    comments = ['\u4e00'.encode('utf-8'), b'foo', '\u4e8c'.encode('utf-8'), b'bar']
    # Only the last entry has both an ASCII filename and an ASCII comment.
    expected_efs_flags = [True, True, True, False]

    with zipfile.ZipFile(TESTFN, "w") as zipfp:
        for name, content, comment in zip(names, contents, comments):
            entry = zipfile.ZipInfo(name)
            entry.comment = comment
            zipfp.writestr(entry, content)
        # Names must be listed in insertion order once all entries exist.
        self.assertEqual(zipfp.namelist(), names)

    with zipfile.ZipFile(TESTFN, "r") as zipfp:
        self._test_read(zipfp, names, contents, comments, expected_efs_flags)

def test_write_with_metadata_encoding(self):
ZF = zipfile.ZipFile
"""metadata_encoding should not affect the encoding of new files."""
names = ['\u4e00', 'file2']
contents = ['\u4e00'.encode('utf-8'), '\u4e8c'.encode('utf-8')]
expected_efs_flags = [True, False]

for mode in ("w", "x", "a"):
with self.assertRaisesRegex(ValueError,
"^metadata_encoding is only"):
ZF("nonesuch.zip", mode, metadata_encoding="shift_jis")
unlink(TESTFN)
with zipfile.ZipFile(TESTFN, mode, metadata_encoding='shift_jis') as zipfp:
for i, name in enumerate(names):
zipfp.writestr(name, contents[i])
with zipfile.ZipFile(TESTFN, 'r') as zipfp:
self._test_read(zipfp, names, contents, None, expected_efs_flags)

def test_add_comment(self):
with zipfile.ZipFile(TESTFN, "r") as zipfp:
Expand Down
26 changes: 15 additions & 11 deletions Lib/zipfile/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ class ZipInfo:
'file_size',
'_raw_time',
'_end_offset',
'_metadata_encoding',
)

def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
Expand Down Expand Up @@ -488,6 +489,7 @@ def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
self.compress_size = 0 # Size of the compressed file
self.file_size = 0 # Size of the uncompressed file
self._end_offset = None # Start of the next local header or central directory
self._metadata_encoding = None # Encoding used when read from the archive
# Other attributes are set by class ZipFile:
# header_offset Byte offset to the file header
# CRC CRC-32 of the uncompressed file
Expand Down Expand Up @@ -575,12 +577,18 @@ def FileHeader(self, zip64=None):

def _encodeFilenameFlags(self):
if self.flag_bits & _MASK_UTF_FILENAME:
encoding = 'ascii'
else:
encoding = 'cp437'
return self.filename.encode('utf-8'), self.flag_bits

# For a file read from the archive, preserve its original encoding.
encoding = self._metadata_encoding
if encoding:
return self.filename.encode(encoding), self.flag_bits

# For a newly added file, enforce EFS if filename or comment is non-ASCII.
try:
return self.filename.encode(encoding), self.flag_bits & ~_MASK_UTF_FILENAME
except UnicodeEncodeError:
self.comment.decode('ascii')
return self.filename.encode('ascii'), self.flag_bits
except (UnicodeEncodeError, UnicodeDecodeError):
return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME

def _decodeExtra(self, filename_crc):
Expand Down Expand Up @@ -1917,11 +1925,6 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
self._strict_timestamps = strict_timestamps
self.metadata_encoding = metadata_encoding

# Check that we don't try to write with nonconforming codecs
if self.metadata_encoding and mode != 'r':
raise ValueError(
"metadata_encoding is only supported for reading files")

# Check if we were passed a file-like object
if isinstance(file, os.PathLike):
file = os.fspath(file)
Expand Down Expand Up @@ -2072,6 +2075,7 @@ def _RealGetContents(self):
t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
x._decodeExtra(orig_filename_crc)
x.header_offset = x.header_offset + concat
x._metadata_encoding = self.metadata_encoding or 'cp437'
self.filelist.append(x)
self.NameToInfo[x.filename] = x

Expand Down Expand Up @@ -2286,7 +2290,7 @@ def _open_to_write(self, zinfo, force_zip64=False):
zinfo.compress_size = 0
zinfo.CRC = 0

zinfo.flag_bits = _MASK_UTF_FILENAME
zinfo.flag_bits = 0x00
if zinfo.compress_type == ZIP_LZMA:
# Compressed data includes an end-of-stream (EOS) marker
zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1
Expand Down
Comment thread
danny0838 marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Fix an issue where the EFS flag is not set when a file with an ASCII
filename and a UTF-8 comment is written through :mod:`zipfile`. Also
preserve the original encoding and EFS flag for a file read from an
archive and rewritten through the ``'a'`` mode. Additionally, allow
the ``metadata_encoding`` parameter in all modes.
Loading