import zlib

# Note: ``LimitReachedError`` is defined elsewhere in pypdf; when copying this
# snippet, import or define an equivalent exception class.

ZLIB_MAX_OUTPUT_LENGTH = 75_000_000


def _decompress_with_limit(data: bytes) -> bytes:
    decompressor = zlib.decompressobj()
    result = decompressor.decompress(data, max_length=ZLIB_MAX_OUTPUT_LENGTH)
    if decompressor.unconsumed_tail:
        raise LimitReachedError(
            f"Limit reached while decompressing. {len(decompressor.unconsumed_tail)} bytes remaining."
        )
    return result


def decompress(data: bytes) -> bytes:
    """
    Decompress the given data using zlib.

    Attempts to decompress the input data using zlib.
    If the decompression fails due to a zlib error, it falls back
    to using a decompression object with a larger window size.

    Please note that the output length is limited to avoid memory
    issues. If you need to process larger content streams, consider
    adapting ``pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH``. In case you
    are only dealing with trusted inputs and/or want to disable these
    limits, set the value to `0`.

    Args:
        data: The input data to be decompressed.

    Returns:
        The decompressed data.

    """
    try:
        return _decompress_with_limit(data)
    except zlib.error:
        # First quick approach: There are known issues with faulty added bytes to the
        # tail of the encoded stream from early Adobe Distiller or Pitstop versions
        # with CR char as the default line separator (assumed by reverse engineering)
        # that breaks the decoding process in the end.
        #
        # Try first to cut off some of the tail byte by byte, but limited to not
        # iterate through too many loops and kill the performance for large streams,
        # to then allow the final fallback to run. Added this intermediate attempt,
        # because starting from the head of the stream byte by byte kills completely
        # the performance for large streams (e.g., 6 MB) with the tail-byte-issue
        # and takes ages. This solution is really fast:
        max_tail_cut_off_bytes: int = 8
        for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))):
            try:
                return _decompress_with_limit(data[:-i])
            except zlib.error:
                pass

        # If still failing, then try with increased window size.
        decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
        result_str = b""
        remaining_limit = ZLIB_MAX_OUTPUT_LENGTH
        data_single_bytes = [data[i : i + 1] for i in range(len(data))]
        for index, b in enumerate(data_single_bytes):
            try:
                decompressed = decompressor.decompress(b, max_length=remaining_limit)
                result_str += decompressed
                remaining_limit -= len(decompressed)
                if remaining_limit <= 0:
                    raise LimitReachedError(
                        f"Limit reached while decompressing. {len(data_single_bytes) - index} bytes remaining."
                    )
            except zlib.error:
                pass
        return result_str
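As the docstring notes, the output cap is configurable. A minimal sketch, assuming pypdf 6.0.0 or later where the snippet above is part of pypdf.filters, of how a caller might raise or disable the limit; only disable it for fully trusted inputs:

    import pypdf.filters

    # Raise the cap for legitimately large content streams ...
    pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH = 150_000_000

    # ... or set it to 0 to disable the limit entirely (this restores the
    # memory-exhaustion risk for untrusted PDFs, so use with care).
    # pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH = 0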
Impact
An attacker can craft a PDF that exhausts the available RAM. Simply reading the file is enough to trigger this when a series of FlateDecode filters is applied to a malicious cross-reference stream; other content streams are affected on explicit access.
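To illustrate the amplification involved (this sketch is not taken from the advisory or from pypdf), deflate compresses highly repetitive data at a ratio of roughly 1000:1, and chaining several FlateDecode filters multiplies that factor:

    import zlib

    # Roughly 100 MB of zeros deflates to about 100 KB. Decompressing the small
    # payload without an output limit allocates the full 100 MB again, which is
    # the effect the ZLIB_MAX_OUTPUT_LENGTH cap above is meant to prevent.
    payload = zlib.compress(b"\x00" * 100_000_000)
    print(f"compressed size:   {len(payload):,} bytes")                   # ~100 KB
    print(f"decompressed size: {len(zlib.decompress(payload)):,} bytes")  # 100,000,000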
Patches
This has been fixed in pypdf==6.0.0.
Workarounds
If you cannot upgrade yet, you might want to implement the workaround for pypdf.filters.decompress yourself; the relevant code (pypdf/pypdf/filters.py, lines 72 to 143 at commit 0dd5773) is shown above.
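One way to apply it, sketched here under the assumption that your pypdf version routes FlateDecode data through the module-level pypdf.filters.decompress function, is to copy the functions above into your own code and monkey-patch them in before any PDFs are parsed. The simplified limiter below drops the error-tolerant fallbacks of the full snippet, so prefer copying the complete code for production use:

    import zlib

    import pypdf.filters

    # Cap taken from the upstream snippet; adjust to your needs.
    ZLIB_MAX_OUTPUT_LENGTH = 75_000_000


    class LimitReachedError(Exception):
        """Raised when the output cap is hit (stand-in for pypdf's own exception)."""


    def limited_decompress(data: bytes) -> bytes:
        """Simplified, output-limited replacement for pypdf.filters.decompress."""
        decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
        result = decompressor.decompress(data, max_length=ZLIB_MAX_OUTPUT_LENGTH)
        if decompressor.unconsumed_tail:
            raise LimitReachedError("Limit reached while decompressing.")
        return result


    # Monkey-patch the vulnerable helper before opening any untrusted PDFs.
    pypdf.filters.decompress = limited_decompress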
References
This issue has been reported in #3429 and fixed in #3430.