diff options
author | ben | 2018-09-18 10:52:38 +0200 |
---|---|---|
committer | ben | 2018-09-18 10:52:38 +0200 |
commit | f57654b84b4cf0ffa1287034fc9f66ba200bb259 (patch) | |
tree | 5ffb371ce5b5008052e425955f45c8b808ba7fa0 /PdfFileTransformer/PyPDF2/filters.py | |
download | truepolyglot-f57654b84b4cf0ffa1287034fc9f66ba200bb259.tar.gz truepolyglot-f57654b84b4cf0ffa1287034fc9f66ba200bb259.tar.bz2 truepolyglot-f57654b84b4cf0ffa1287034fc9f66ba200bb259.tar.xz |
First public commit
Diffstat (limited to 'PdfFileTransformer/PyPDF2/filters.py')
-rw-r--r-- | PdfFileTransformer/PyPDF2/filters.py | 424 |
1 files changed, 424 insertions, 0 deletions
# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


"""
Implementation of stream filters for PDF.

Each filter is a small class exposing a static ``decode(data, decodeParms)``
(and, for Flate, ``encode(data)``).  ``decodeStreamData`` applies the filters
named in a stream's ``/Filter`` entry in order.
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

import math
# struct is needed by CCITTFaxDecode on every Python version, so it must not
# be imported only in the Python 3 branch below.
import struct

from .utils import PdfReadError, ord_, chr_, paethPredictor
from sys import version_info
if version_info < (3, 0):
    from cStringIO import StringIO
else:
    from io import StringIO

try:
    import zlib

    def decompress(data):
        """Inflate zlib-compressed *data*."""
        return zlib.decompress(data)

    def compress(data):
        """Deflate *data* with zlib."""
        return zlib.compress(data)

except ImportError:
    # Unable to import zlib.  Attempt to use the System.IO.Compression
    # library from the .NET framework. (IronPython only)
    import System
    from System import IO, Collections, Array

    def _string_to_bytearr(buf):
        """Copy a Python string into a .NET ``Byte[]``."""
        retval = Array.CreateInstance(System.Byte, len(buf))
        for i, ch in enumerate(buf):
            retval[i] = ord(ch)
        return retval

    def _bytearr_to_string(byte_arr):
        """Convert a .NET ``Byte[]`` back into a Python string."""
        return "".join([chr(byte_arr[i]) for i in range(byte_arr.Length)])

    def _read_bytes(stream):
        """Drain a .NET stream into a ``Byte[]`` using 2048-byte reads."""
        ms = IO.MemoryStream()
        buf = Array.CreateInstance(System.Byte, 2048)
        while True:
            count = stream.Read(buf, 0, buf.Length)
            if count == 0:
                break
            ms.Write(buf, 0, count)
        retval = ms.ToArray()
        ms.Close()
        return retval

    def decompress(data):
        """Inflate *data* through ``System.IO.Compression.DeflateStream``."""
        raw = _string_to_bytearr(data)
        ms = IO.MemoryStream()
        ms.Write(raw, 0, raw.Length)
        ms.Position = 0  # fseek 0
        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress)
        inflated = _read_bytes(gz)
        retval = _bytearr_to_string(inflated)
        gz.Close()
        return retval

    def compress(data):
        """Deflate *data* through ``System.IO.Compression.DeflateStream``."""
        raw = _string_to_bytearr(data)
        ms = IO.MemoryStream()
        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True)
        gz.Write(raw, 0, raw.Length)
        gz.Close()
        ms.Position = 0  # fseek 0
        deflated = ms.ToArray()
        retval = _bytearr_to_string(deflated)
        ms.Close()
        return retval


class FlateDecode(object):
    """``/FlateDecode``: deflate compression with optional PNG predictors."""

    @staticmethod
    def decode(data, decodeParms):
        """Decompress *data* and undo the predictor named in *decodeParms*.

        :param data: the compressed stream contents.
        :param decodeParms: the stream's ``/DecodeParms`` value, or a falsy
            value when there is none.  Only ``/Predictor`` and ``/Columns``
            are read.
        :return: the decoded data.  When a PNG predictor (10-15) is applied
            the result is a text string with one character per byte.
        :raises PdfReadError: for a predictor other than 1 or 10-15, an
            unknown PNG filter byte, or data that is not a whole number of
            rows.
        """
        data = decompress(data)
        predictor = 1
        if decodeParms:
            try:
                predictor = decodeParms.get("/Predictor", 1)
            except AttributeError:
                pass  # usually an array with a null object was read

        # predictor 1 == no predictor
        if predictor == 1:
            return data

        columns = decodeParms["/Columns"]
        if not 10 <= predictor <= 15:
            # unsupported predictor
            raise PdfReadError("Unsupported flatedecode predictor %r" % predictor)

        # PNG prediction can vary from row to row: every row starts with a
        # filter-type byte followed by `columns` data bytes.
        # NOTE(review): this treats one column as one byte; /Colors and
        # /BitsPerComponent are not consulted - confirm that is sufficient.
        output = StringIO()
        rowlength = columns + 1
        if len(data) % rowlength != 0:
            raise PdfReadError("Image data is not rectangular")
        prev_rowdata = (0,) * rowlength
        for row in range(len(data) // rowlength):
            start = row * rowlength
            rowdata = [ord_(x) for x in data[start:start + rowlength]]
            filterByte = rowdata[0]
            if filterByte == 0:
                # None: bytes are stored as-is
                pass
            elif filterByte == 1:
                # Sub: add the byte to the left (index 1 has no left neighbour)
                for i in range(2, rowlength):
                    rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256
            elif filterByte == 2:
                # Up: add the byte directly above
                for i in range(1, rowlength):
                    rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
            elif filterByte == 3:
                # Average: add floor((left + up) / 2)
                for i in range(1, rowlength):
                    left = rowdata[i - 1] if i > 1 else 0
                    average = (left + prev_rowdata[i]) // 2
                    rowdata[i] = (rowdata[i] + average) % 256
            elif filterByte == 4:
                # Paeth
                for i in range(1, rowlength):
                    left = rowdata[i - 1] if i > 1 else 0
                    up = prev_rowdata[i]
                    up_left = prev_rowdata[i - 1] if i > 1 else 0
                    paeth = paethPredictor(left, up, up_left)
                    rowdata[i] = (rowdata[i] + paeth) % 256
            else:
                # unsupported PNG filter
                raise PdfReadError("Unsupported PNG filter %r" % filterByte)
            prev_rowdata = rowdata
            # drop the leading filter byte from the output
            output.write(''.join([chr(x) for x in rowdata[1:]]))
        return output.getvalue()

    @staticmethod
    def encode(data):
        """Compress *data* (no predictor is applied)."""
        return compress(data)


class ASCIIHexDecode(object):
    """``/ASCIIHexDecode``: pairs of hex digits terminated by ``>``."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Decode hexadecimal text into a string of characters.

        Whitespace is ignored.  If an odd number of digits precedes the
        ``>`` end-of-data marker, the last digit is treated as if it were
        followed by ``0``, as the PDF specification requires.

        :param data: hex-encoded text; on Python 3 ``bytes`` are accepted
            and interpreted as latin-1.
        :param decodeParms: unused.
        :return: a text string with one character per decoded byte.
        :raises PdfReadError: when the ``>`` marker is missing.
        """
        if version_info >= (3, 0) and isinstance(data, (bytes, bytearray)):
            # indexing bytes yields ints on Python 3; work on text instead
            data = data.decode("latin-1")
        decoded = []
        char = ""
        x = 0
        while True:
            if x >= len(data):
                raise PdfReadError("Unexpected EOD in ASCIIHexDecode")
            c = data[x]
            if c == ">":
                break
            elif c.isspace():
                x += 1
                continue
            char += c
            if len(char) == 2:
                decoded.append(chr(int(char, base=16)))
                char = ""
            x += 1
        if char:
            # odd number of digits: behave as if a trailing 0 were present
            decoded.append(chr(int(char + "0", base=16)))
        return "".join(decoded)


class LZWDecode(object):
    """Taken from:
    http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm
    """

    class decoder(object):
        """Stateful LZW reader over *data* using 9- to 12-bit codes."""

        def __init__(self, data):
            self.STOP = 257
            self.CLEARDICT = 256
            self.data = data
            self.bytepos = 0
            self.bitpos = 0
            # entries 0-255 are the single characters; 256/257 are the
            # clear-table and stop codes; the rest are filled while decoding
            self.dict = [""] * 4096
            for i in range(256):
                self.dict[i] = chr(i)
            self.resetDict()

        def resetDict(self):
            """Forget all learned entries and return to 9-bit codes."""
            self.dictlen = 258
            self.bitspercode = 9

        def nextCode(self):
            """Read the next ``bitspercode``-bit code, most significant bit
            first, or return -1 when the data runs out."""
            fillbits = self.bitspercode
            value = 0
            while fillbits > 0:
                if self.bytepos >= len(self.data):
                    return -1
                nextbits = ord_(self.data[self.bytepos])
                # take as many bits as remain in this byte, but no more
                # than are still needed for the code
                bitsfromhere = 8 - self.bitpos
                if bitsfromhere > fillbits:
                    bitsfromhere = fillbits
                value |= (((nextbits >> (8 - self.bitpos - bitsfromhere)) &
                           (0xff >> (8 - bitsfromhere))) <<
                          (fillbits - bitsfromhere))
                fillbits -= bitsfromhere
                self.bitpos += bitsfromhere
                if self.bitpos >= 8:
                    self.bitpos = 0
                    self.bytepos = self.bytepos + 1
            return value

        def decode(self):
            """ algorithm derived from:
            http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
            and the PDFReference

            :return: the decoded text.
            :raises PdfReadError: if the data ends before the stop code.
            """
            cW = self.CLEARDICT
            pieces = []  # joined once at the end to avoid quadratic +=
            while True:
                pW = cW
                cW = self.nextCode()
                if cW == -1:
                    raise PdfReadError("Missed the stop code in LZWDecode!")
                if cW == self.STOP:
                    break
                elif cW == self.CLEARDICT:
                    self.resetDict()
                elif pW == self.CLEARDICT:
                    pieces.append(self.dict[cW])
                else:
                    if cW < self.dictlen:
                        # known code: emit it, learn previous + its first char
                        pieces.append(self.dict[cW])
                        p = self.dict[pW] + self.dict[cW][0]
                    else:
                        # code not in the table yet: it must be
                        # previous + first char of previous
                        p = self.dict[pW] + self.dict[pW][0]
                        pieces.append(p)
                    self.dict[self.dictlen] = p
                    self.dictlen += 1
                    if (self.dictlen >= (1 << self.bitspercode) - 1 and
                            self.bitspercode < 12):
                        self.bitspercode += 1
            return "".join(pieces)

    @staticmethod
    def decode(data, decodeParams=None):
        """Decode LZW-compressed *data*; *decodeParams* is ignored."""
        return LZWDecode.decoder(data).decode()


class ASCII85Decode(object):
    """``/ASCII85Decode``: base-85 text terminated by ``~>``."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Decode base-85 text.

        An optional leading ``<~`` is skipped.  ``z`` stands for four zero
        bytes, and a short final group is padded with the highest digit
        (84, i.e. ``u``) before being truncated.

        :param data: the encoded text (``str``, or ``bytes`` on Python 3).
        :param decodeParms: unused.
        :return: ``str`` on Python 2, ``bytes`` on Python 3.
        """
        if version_info < (3, 0):
            retval = ""
            group = []
            x = 0
            hitEod = False
            # remove all whitespace from data
            data = [y for y in data if not (y in ' \n\r\t')]
            while not hitEod:
                c = data[x]
                if len(retval) == 0 and c == "<" and data[x+1] == "~":
                    x += 2
                    continue
                elif c == 'z':
                    assert len(group) == 0
                    retval += '\x00\x00\x00\x00'
                    x += 1
                    continue
                elif c == "~" and data[x+1] == ">":
                    if len(group) != 0:
                        # cannot have a final group of just 1 char
                        assert len(group) > 1
                        cnt = len(group) - 1
                        # pad with the largest valid digit (84); 85 is out
                        # of range for base 85
                        group += [84, 84, 84]
                        hitEod = cnt
                    else:
                        break
                else:
                    c = ord(c) - 33
                    assert c >= 0 and c < 85
                    group += [c]
                if len(group) >= 5:
                    b = group[0] * (85**4) + \
                        group[1] * (85**3) + \
                        group[2] * (85**2) + \
                        group[3] * 85 + \
                        group[4]
                    assert b < (2**32 - 1)
                    c4 = chr((b >> 0) % 256)
                    c3 = chr((b >> 8) % 256)
                    c2 = chr((b >> 16) % 256)
                    c1 = chr(b >> 24)
                    retval += (c1 + c2 + c3 + c4)
                    if hitEod:
                        # keep only the bytes the short group encoded
                        retval = retval[:-4+hitEod]
                    group = []
                x += 1
            return retval
        else:
            if isinstance(data, str):
                data = data.encode('ascii')
            # Skip the optional "<~" start marker; otherwise "<" would be
            # read as a digit and "~" would end decoding immediately.
            data = data.lstrip()
            if data[:2] == b'<~':
                data = data[2:]
            n = b = 0
            out = bytearray()
            for c in data:
                if ord('!') <= c <= ord('u'):
                    n += 1
                    b = b * 85 + (c - 33)
                    if n == 5:
                        out += struct.pack('>L', b)
                        n = b = 0
                elif c == ord('z'):
                    assert n == 0
                    out += b'\0\0\0\0'
                elif c == ord('~'):
                    if n:
                        # pad the short final group, then keep n-1 bytes
                        for _ in range(5 - n):
                            b = b * 85 + 84
                        out += struct.pack('>L', b)[:n-1]
                    break
            return bytes(out)


class DCTDecode(object):
    """``/DCTDecode``: the data is passed through unchanged."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Return *data* as-is."""
        return data


class JPXDecode(object):
    """``/JPXDecode``: the data is passed through unchanged."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Return *data* as-is."""
        return data


class CCITTFaxDecode(object):
    """``/CCITTFaxDecode``: wraps the fax data in a minimal TIFF header."""

    @staticmethod
    def decode(data, decodeParms=None, height=0):
        """Prefix *data* with a little-endian TIFF header describing it.

        The fax data itself is not decoded.

        :param data: the raw CCITT-encoded bytes.
        :param decodeParms: the stream's ``/DecodeParms``; ``/K`` selects
            Group 4 (``-1``) or Group 3 (anything else) and ``/Columns``
            gives the width.  When absent, Group 3 and 1728 columns (the
            PDF defaults) are used.
        :param height: image height in rows.
        :return: TIFF header followed by *data*.
        """
        CCITTgroup = 3
        width = 1728
        if decodeParms:
            if decodeParms.get("/K", 1) == -1:
                CCITTgroup = 4
            width = decodeParms.get("/Columns", 1728)

        imgSize = len(data)
        tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h'
        tiffHeader = struct.pack(
            tiff_header_struct,
            b'II',  # Byte order indication: Little endian
            42,  # Version number (always 42)
            8,  # Offset to first IFD
            8,  # Number of tags in IFD
            256, 4, 1, width,  # ImageWidth, LONG, 1, width
            257, 4, 1, height,  # ImageLength, LONG, 1, length
            258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
            259, 3, 1, CCITTgroup,  # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
            262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
            273, 4, 1, struct.calcsize(tiff_header_struct),  # StripOffsets, LONG, 1, length of header
            278, 4, 1, height,  # RowsPerStrip, LONG, 1, length
            279, 4, 1, imgSize,  # StripByteCounts, LONG, 1, size of image
            0  # last IFD
        )

        return tiffHeader + data


def decodeStreamData(stream):
    """Apply every filter listed in ``stream["/Filter"]`` to ``stream._data``.

    :param stream: a stream object offering ``get()`` and ``_data``.
    :return: the decoded data, or ``stream._data`` untouched when it is
        empty or no filter is set.
    :raises NotImplementedError: for an unknown filter, or a ``/Crypt``
        filter whose parameters carry ``/Name`` or ``/Type``.
    """
    from .generic import NameObject
    filters = stream.get("/Filter", ())

    if len(filters) and not isinstance(filters[0], NameObject):
        # we have a single filter instance
        filters = (filters,)
    data = stream._data
    # If there is no data to decode we should not try to decode the data.
    if not data:
        return data

    for filterType in filters:
        if filterType == "/FlateDecode" or filterType == "/Fl":
            data = FlateDecode.decode(data, stream.get("/DecodeParms"))
        elif filterType == "/ASCIIHexDecode" or filterType == "/AHx":
            data = ASCIIHexDecode.decode(data)
        elif filterType == "/LZWDecode" or filterType == "/LZW":
            data = LZWDecode.decode(data, stream.get("/DecodeParms"))
        elif filterType == "/ASCII85Decode" or filterType == "/A85":
            data = ASCII85Decode.decode(data)
        elif filterType == "/DCTDecode":
            data = DCTDecode.decode(data)
        elif filterType == "/JPXDecode":
            data = JPXDecode.decode(data)
        elif filterType == "/CCITTFaxDecode":
            # default to 0 (not an empty tuple) so the TIFF header can
            # still be packed when /Height is missing
            height = stream.get("/Height", 0)
            data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height)
        elif filterType == "/Crypt":
            # the PDF key is /DecodeParms (no second "a")
            decodeParams = stream.get("/DecodeParms", {})
            if "/Name" not in decodeParams and "/Type" not in decodeParams:
                pass
            else:
                raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet")
        else:
            # unsupported filter
            raise NotImplementedError("unsupported filter %s" % filterType)
    return data