author     ben   2018-09-18 10:52:38 +0200
committer  ben   2018-09-18 10:52:38 +0200
commit     f57654b84b4cf0ffa1287034fc9f66ba200bb259
tree       5ffb371ce5b5008052e425955f45c8b808ba7fa0
First public commit
58 files changed, 9728 insertions(+), 0 deletions(-)
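This commit vendors PyPDF2 1.26.0 under PdfFileTransformer/PyPDF2/. For orientation before the diff itself, here is a minimal sketch of the reader/writer round-trip exposed by the vendored __init__.py; the file names are hypothetical placeholders and this snippet is not part of the commit:

# Minimal sketch, assuming the vendored package is importable from the repo root.
# "input.pdf" and "output.pdf" are hypothetical placeholders.
from PdfFileTransformer.PyPDF2 import PdfFileReader, PdfFileWriter

reader = PdfFileReader(open("input.pdf", "rb"))
writer = PdfFileWriter()
for i in range(reader.getNumPages()):
    writer.addPage(reader.getPage(i))  # copy each page object unchanged
with open("output.pdf", "wb") as out:
    writer.write(out)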
diff --git a/PdfFileTransformer/PyPDF2/__init__.py b/PdfFileTransformer/PyPDF2/__init__.py new file mode 100644 index 0000000..f458c0e --- /dev/null +++ b/PdfFileTransformer/PyPDF2/__init__.py @@ -0,0 +1,5 @@ +from .pdf import PdfFileReader, PdfFileWriter +from .merger import PdfFileMerger +from .pagerange import PageRange, parse_filename_page_ranges +from ._version import __version__ +__all__ = ["pdf", "PdfFileMerger"] diff --git a/PdfFileTransformer/PyPDF2/_version.py b/PdfFileTransformer/PyPDF2/_version.py new file mode 100644 index 0000000..5fc7041 --- /dev/null +++ b/PdfFileTransformer/PyPDF2/_version.py @@ -0,0 +1 @@ +__version__ = '1.26.0' diff --git a/PdfFileTransformer/PyPDF2/filters.py b/PdfFileTransformer/PyPDF2/filters.py new file mode 100644 index 0000000..57446f4 --- /dev/null +++ b/PdfFileTransformer/PyPDF2/filters.py @@ -0,0 +1,424 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +""" +Implementation of stream filters for PDF. +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +import math + +from .utils import PdfReadError, ord_, chr_, paethPredictor +from sys import version_info +if version_info < ( 3, 0 ): + from cStringIO import StringIO +else: + from io import StringIO + import struct + +try: + import zlib + + def decompress(data): + return zlib.decompress(data) + + def compress(data): + return zlib.compress(data) + +except ImportError: + # Unable to import zlib. Attempt to use the System.IO.Compression + # library from the .NET framework. 
(IronPython only) + import System + from System import IO, Collections, Array + + def _string_to_bytearr(buf): + retval = Array.CreateInstance(System.Byte, len(buf)) + for i in range(len(buf)): + retval[i] = ord(buf[i]) + return retval + + def _bytearr_to_string(bytes): + retval = "" + for i in range(bytes.Length): + retval += chr(bytes[i]) + return retval + + def _read_bytes(stream): + ms = IO.MemoryStream() + buf = Array.CreateInstance(System.Byte, 2048) + while True: + bytes = stream.Read(buf, 0, buf.Length) + if bytes == 0: + break + else: + ms.Write(buf, 0, bytes) + retval = ms.ToArray() + ms.Close() + return retval + + def decompress(data): + bytes = _string_to_bytearr(data) + ms = IO.MemoryStream() + ms.Write(bytes, 0, bytes.Length) + ms.Position = 0 # fseek 0 + gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress) + bytes = _read_bytes(gz) + retval = _bytearr_to_string(bytes) + gz.Close() + return retval + + def compress(data): + bytes = _string_to_bytearr(data) + ms = IO.MemoryStream() + gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True) + gz.Write(bytes, 0, bytes.Length) + gz.Close() + ms.Position = 0 # fseek 0 + bytes = ms.ToArray() + retval = _bytearr_to_string(bytes) + ms.Close() + return retval + + +class FlateDecode(object): + def decode(data, decodeParms): + data = decompress(data) + predictor = 1 + if decodeParms: + try: + predictor = decodeParms.get("/Predictor", 1) + except AttributeError: + pass # usually an array with a null object was read + + # predictor 1 == no predictor + if predictor != 1: + columns = decodeParms["/Columns"] + # PNG prediction: + if predictor >= 10 and predictor <= 15: + output = StringIO() + # PNG prediction can vary from row to row + rowlength = columns + 1 + assert len(data) % rowlength == 0 + prev_rowdata = (0,) * rowlength + for row in range(len(data) // rowlength): + rowdata = [ord_(x) for x in data[(row*rowlength):((row+1)*rowlength)]] + filterByte = rowdata[0] + if filterByte == 0: + pass + elif filterByte == 1: + for i in range(2, rowlength): + rowdata[i] = (rowdata[i] + rowdata[i-1]) % 256 + elif filterByte == 2: + for i in range(1, rowlength): + rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256 + elif filterByte == 3: + for i in range(1, rowlength): + left = rowdata[i-1] if i > 1 else 0 + floor = math.floor(left + prev_rowdata[i])/2 + rowdata[i] = (rowdata[i] + int(floor)) % 256 + elif filterByte == 4: + for i in range(1, rowlength): + left = rowdata[i - 1] if i > 1 else 0 + up = prev_rowdata[i] + up_left = prev_rowdata[i - 1] if i > 1 else 0 + paeth = paethPredictor(left, up, up_left) + rowdata[i] = (rowdata[i] + paeth) % 256 + else: + # unsupported PNG filter + raise PdfReadError("Unsupported PNG filter %r" % filterByte) + prev_rowdata = rowdata + output.write(''.join([chr(x) for x in rowdata[1:]])) + data = output.getvalue() + else: + # unsupported predictor + raise PdfReadError("Unsupported flatedecode predictor %r" % predictor) + return data + decode = staticmethod(decode) + + def encode(data): + return compress(data) + encode = staticmethod(encode) + + +class ASCIIHexDecode(object): + def decode(data, decodeParms=None): + retval = "" + char = "" + x = 0 + while True: + c = data[x] + if c == ">": + break + elif c.isspace(): + x += 1 + continue + char += c + if len(char) == 2: + retval += chr(int(char, base=16)) + char = "" + x += 1 + assert char == "" + return retval + decode = staticmethod(decode) + + +class LZWDecode(object): + """Taken from: + 
http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm + """ + class decoder(object): + def __init__(self, data): + self.STOP=257 + self.CLEARDICT=256 + self.data=data + self.bytepos=0 + self.bitpos=0 + self.dict=[""]*4096 + for i in range(256): + self.dict[i]=chr(i) + self.resetDict() + + def resetDict(self): + self.dictlen=258 + self.bitspercode=9 + + def nextCode(self): + fillbits=self.bitspercode + value=0 + while fillbits>0 : + if self.bytepos >= len(self.data): + return -1 + nextbits=ord_(self.data[self.bytepos]) + bitsfromhere=8-self.bitpos + if bitsfromhere>fillbits: + bitsfromhere=fillbits + value |= (((nextbits >> (8-self.bitpos-bitsfromhere)) & + (0xff >> (8-bitsfromhere))) << + (fillbits-bitsfromhere)) + fillbits -= bitsfromhere + self.bitpos += bitsfromhere + if self.bitpos >=8: + self.bitpos=0 + self.bytepos = self.bytepos+1 + return value + + def decode(self): + """ algorithm derived from: + http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html + and the PDFReference + """ + cW = self.CLEARDICT; + baos="" + while True: + pW = cW; + cW = self.nextCode(); + if cW == -1: + raise PdfReadError("Missed the stop code in LZWDecode!") + if cW == self.STOP: + break; + elif cW == self.CLEARDICT: + self.resetDict(); + elif pW == self.CLEARDICT: + baos+=self.dict[cW] + else: + if cW < self.dictlen: + baos += self.dict[cW] + p=self.dict[pW]+self.dict[cW][0] + self.dict[self.dictlen]=p + self.dictlen+=1 + else: + p=self.dict[pW]+self.dict[pW][0] + baos+=p + self.dict[self.dictlen] = p; + self.dictlen+=1 + if (self.dictlen >= (1 << self.bitspercode) - 1 and + self.bitspercode < 12): + self.bitspercode+=1 + return baos + + @staticmethod + def decode(data,decodeParams=None): + return LZWDecode.decoder(data).decode() + + +class ASCII85Decode(object): + def decode(data, decodeParms=None): + if version_info < ( 3, 0 ): + retval = "" + group = [] + x = 0 + hitEod = False + # remove all whitespace from data + data = [y for y in data if not (y in ' \n\r\t')] + while not hitEod: + c = data[x] + if len(retval) == 0 and c == "<" and data[x+1] == "~": + x += 2 + continue + #elif c.isspace(): + # x += 1 + # continue + elif c == 'z': + assert len(group) == 0 + retval += '\x00\x00\x00\x00' + x += 1 + continue + elif c == "~" and data[x+1] == ">": + if len(group) != 0: + # cannot have a final group of just 1 char + assert len(group) > 1 + cnt = len(group) - 1 + group += [ 85, 85, 85 ] + hitEod = cnt + else: + break + else: + c = ord(c) - 33 + assert c >= 0 and c < 85 + group += [ c ] + if len(group) >= 5: + b = group[0] * (85**4) + \ + group[1] * (85**3) + \ + group[2] * (85**2) + \ + group[3] * 85 + \ + group[4] + assert b < (2**32 - 1) + c4 = chr((b >> 0) % 256) + c3 = chr((b >> 8) % 256) + c2 = chr((b >> 16) % 256) + c1 = chr(b >> 24) + retval += (c1 + c2 + c3 + c4) + if hitEod: + retval = retval[:-4+hitEod] + group = [] + x += 1 + return retval + else: + if isinstance(data, str): + data = data.encode('ascii') + n = b = 0 + out = bytearray() + for c in data: + if ord('!') <= c and c <= ord('u'): + n += 1 + b = b*85+(c-33) + if n == 5: + out += struct.pack(b'>L',b) + n = b = 0 + elif c == ord('z'): + assert n == 0 + out += b'\0\0\0\0' + elif c == ord('~'): + if n: + for _ in range(5-n): + b = b*85+84 + out += struct.pack(b'>L',b)[:n-1] + break + return bytes(out) + decode = staticmethod(decode) + +class DCTDecode(object): + def decode(data, decodeParms=None): + return data + decode = staticmethod(decode) + +class JPXDecode(object): + def 
decode(data, decodeParms=None): + return data + decode = staticmethod(decode) + +class CCITTFaxDecode(object): + def decode(data, decodeParms=None, height=0): + if decodeParms: + if decodeParms.get("/K", 1) == -1: + CCITTgroup = 4 + else: + CCITTgroup = 3 + + width = decodeParms["/Columns"] + imgSize = len(data) + tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h' + tiffHeader = struct.pack(tiff_header_struct, + b'II', # Byte order indication: Little endian + 42, # Version number (always 42) + 8, # Offset to first IFD + 8, # Number of tags in IFD + 256, 4, 1, width, # ImageWidth, LONG, 1, width + 257, 4, 1, height, # ImageLength, LONG, 1, length + 258, 3, 1, 1, # BitsPerSample, SHORT, 1, 1 + 259, 3, 1, CCITTgroup, # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding + 262, 3, 1, 0, # Thresholding, SHORT, 1, 0 = WhiteIsZero + 273, 4, 1, struct.calcsize(tiff_header_struct), # StripOffsets, LONG, 1, length of header + 278, 4, 1, height, # RowsPerStrip, LONG, 1, length + 279, 4, 1, imgSize, # StripByteCounts, LONG, 1, size of image + 0 # last IFD + ) + + return tiffHeader + data + + decode = staticmethod(decode) + +def decodeStreamData(stream): + from .generic import NameObject + filters = stream.get("/Filter", ()) + + if len(filters) and not isinstance(filters[0], NameObject): + # we have a single filter instance + filters = (filters,) + data = stream._data + # If there is not data to decode we should not try to decode the data. + if data: + for filterType in filters: + if filterType == "/FlateDecode" or filterType == "/Fl": + data = FlateDecode.decode(data, stream.get("/DecodeParms")) + elif filterType == "/ASCIIHexDecode" or filterType == "/AHx": + data = ASCIIHexDecode.decode(data) + elif filterType == "/LZWDecode" or filterType == "/LZW": + data = LZWDecode.decode(data, stream.get("/DecodeParms")) + elif filterType == "/ASCII85Decode" or filterType == "/A85": + data = ASCII85Decode.decode(data) + elif filterType == "/DCTDecode": + data = DCTDecode.decode(data) + elif filterType == "/JPXDecode": + data = JPXDecode.decode(data) + elif filterType == "/CCITTFaxDecode": + height = stream.get("/Height", ()) + data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height) + elif filterType == "/Crypt": + decodeParams = stream.get("/DecodeParams", {}) + if "/Name" not in decodeParams and "/Type" not in decodeParams: + pass + else: + raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet") + else: + # unsupported filter + raise NotImplementedError("unsupported filter %s" % filterType) + return data diff --git a/PdfFileTransformer/PyPDF2/generic.py b/PdfFileTransformer/PyPDF2/generic.py new file mode 100644 index 0000000..959957d --- /dev/null +++ b/PdfFileTransformer/PyPDF2/generic.py @@ -0,0 +1,1228 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +""" +Implementation of generic PDF objects (dictionary, number, string, and so on) +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +import re +from .utils import readNonWhitespace, RC4_encrypt, skipOverComment +from .utils import b_, u_, chr_, ord_ +from .utils import PdfStreamError +import warnings +from . import filters +from . import utils +import decimal +import codecs +import sys +#import debugging + +ObjectPrefix = b_('/<[tf(n%') +NumberSigns = b_('+-') +IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]")) + + +def readObject(stream, pdf): + tok = stream.read(1) + stream.seek(-1, 1) # reset to start + idx = ObjectPrefix.find(tok) + if idx == 0: + # name object + return NameObject.readFromStream(stream, pdf) + elif idx == 1: + # hexadecimal string OR dictionary + peek = stream.read(2) + stream.seek(-2, 1) # reset to start + if peek == b_('<<'): + return DictionaryObject.readFromStream(stream, pdf) + else: + return readHexStringFromStream(stream) + elif idx == 2: + # array object + return ArrayObject.readFromStream(stream, pdf) + elif idx == 3 or idx == 4: + # boolean object + return BooleanObject.readFromStream(stream) + elif idx == 5: + # string object + return readStringFromStream(stream) + elif idx == 6: + # null object + return NullObject.readFromStream(stream) + elif idx == 7: + # comment + while tok not in (b_('\r'), b_('\n')): + tok = stream.read(1) + # Prevents an infinite loop by raising an error if the stream is at + # the EOF + if len(tok) <= 0: + raise PdfStreamError("File ended unexpectedly.") + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + return readObject(stream, pdf) + else: + # number object OR indirect reference + peek = stream.read(20) + stream.seek(-len(peek), 1) # reset to start + if IndirectPattern.match(peek) != None: + return IndirectObject.readFromStream(stream, pdf) + else: + return NumberObject.readFromStream(stream) + + +class PdfObject(object): + def getObject(self): + """Resolves indirect references.""" + return self + + +class NullObject(PdfObject): + def writeToStream(self, stream, encryption_key): + stream.write(b_("null")) + + def readFromStream(stream): + nulltxt = stream.read(4) + if nulltxt != b_("null"): + raise utils.PdfReadError("Could not read Null object") + return NullObject() + readFromStream = staticmethod(readFromStream) + + +class BooleanObject(PdfObject): + def __init__(self, value): + self.value = value + + def writeToStream(self, stream, encryption_key): + if self.value: + stream.write(b_("true")) + else: + stream.write(b_("false")) + + def 
readFromStream(stream): + word = stream.read(4) + if word == b_("true"): + return BooleanObject(True) + elif word == b_("fals"): + stream.read(1) + return BooleanObject(False) + else: + raise utils.PdfReadError('Could not read Boolean object') + readFromStream = staticmethod(readFromStream) + + +class ArrayObject(list, PdfObject): + def writeToStream(self, stream, encryption_key): + stream.write(b_("[")) + for data in self: + stream.write(b_(" ")) + data.writeToStream(stream, encryption_key) + stream.write(b_(" ]")) + + def readFromStream(stream, pdf): + arr = ArrayObject() + tmp = stream.read(1) + if tmp != b_("["): + raise utils.PdfReadError("Could not read array") + while True: + # skip leading whitespace + tok = stream.read(1) + while tok.isspace(): + tok = stream.read(1) + stream.seek(-1, 1) + # check for array ending + peekahead = stream.read(1) + if peekahead == b_("]"): + break + stream.seek(-1, 1) + # read and append obj + arr.append(readObject(stream, pdf)) + return arr + readFromStream = staticmethod(readFromStream) + + +class IndirectObject(PdfObject): + def __init__(self, idnum, generation, pdf): + self.idnum = idnum + self.generation = generation + self.pdf = pdf + + def getObject(self): + return self.pdf.getObject(self).getObject() + + def __repr__(self): + return "IndirectObject(%r, %r)" % (self.idnum, self.generation) + + def __eq__(self, other): + return ( + other != None and + isinstance(other, IndirectObject) and + self.idnum == other.idnum and + self.generation == other.generation and + self.pdf is other.pdf + ) + + def __ne__(self, other): + return not self.__eq__(other) + + def writeToStream(self, stream, encryption_key): + stream.write(b_("%s %s R" % (self.idnum, self.generation))) + + def readFromStream(stream, pdf): + idnum = b_("") + while True: + tok = stream.read(1) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok.isspace(): + break + idnum += tok + generation = b_("") + while True: + tok = stream.read(1) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok.isspace(): + if not generation: + continue + break + generation += tok + r = readNonWhitespace(stream) + if r != b_("R"): + raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell())) + return IndirectObject(int(idnum), int(generation), pdf) + readFromStream = staticmethod(readFromStream) + + +class FloatObject(decimal.Decimal, PdfObject): + def __new__(cls, value="0", context=None): + try: + return decimal.Decimal.__new__(cls, utils.str_(value), context) + except: + return decimal.Decimal.__new__(cls, str(value)) + + def __repr__(self): + if self == self.to_integral(): + return str(self.quantize(decimal.Decimal(1))) + else: + # Standard formatting adds useless extraneous zeros. + o = "%.5f" % self + # Remove the zeros. 
+ while o and o[-1] == '0': + o = o[:-1] + return o + + def as_numeric(self): + return float(b_(repr(self))) + + def writeToStream(self, stream, encryption_key): + stream.write(b_(repr(self))) + + +class NumberObject(int, PdfObject): + NumberPattern = re.compile(b_('[^+-.0-9]')) + ByteDot = b_(".") + + def __new__(cls, value): + val = int(value) + try: + return int.__new__(cls, val) + except OverflowError: + return int.__new__(cls, 0) + + def as_numeric(self): + return int(b_(repr(self))) + + def writeToStream(self, stream, encryption_key): + stream.write(b_(repr(self))) + + def readFromStream(stream): + num = utils.readUntilRegex(stream, NumberObject.NumberPattern) + if num.find(NumberObject.ByteDot) != -1: + return FloatObject(num) + else: + return NumberObject(num) + readFromStream = staticmethod(readFromStream) + + +## +# Given a string (either a "str" or "unicode"), create a ByteStringObject or a +# TextStringObject to represent the string. +def createStringObject(string): + if isinstance(string, utils.string_type): + return TextStringObject(string) + elif isinstance(string, utils.bytes_type): + try: + if string.startswith(codecs.BOM_UTF16_BE): + retval = TextStringObject(string.decode("utf-16")) + retval.autodetect_utf16 = True + return retval + else: + # This is probably a big performance hit here, but we need to + # convert string objects into the text/unicode-aware version if + # possible... and the only way to check if that's possible is + # to try. Some strings are strings, some are just byte arrays. + retval = TextStringObject(decode_pdfdocencoding(string)) + retval.autodetect_pdfdocencoding = True + return retval + except UnicodeDecodeError: + return ByteStringObject(string) + else: + raise TypeError("createStringObject should have str or unicode arg") + + +def readHexStringFromStream(stream): + stream.read(1) + txt = "" + x = b_("") + while True: + tok = readNonWhitespace(stream) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok == b_(">"): + break + x += tok + if len(x) == 2: + txt += chr(int(x, base=16)) + x = b_("") + if len(x) == 1: + x += b_("0") + if len(x) == 2: + txt += chr(int(x, base=16)) + return createStringObject(b_(txt)) + + +def readStringFromStream(stream): + tok = stream.read(1) + parens = 1 + txt = b_("") + while True: + tok = stream.read(1) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok == b_("("): + parens += 1 + elif tok == b_(")"): + parens -= 1 + if parens == 0: + break + elif tok == b_("\\"): + tok = stream.read(1) + ESCAPE_DICT = {b_("n") : b_("\n"), + b_("r") : b_("\r"), + b_("t") : b_("\t"), + b_("b") : b_("\b"), + b_("f") : b_("\f"), + b_("c") : b_("\c"), + b_("(") : b_("("), + b_(")") : b_(")"), + b_("/") : b_("/"), + b_("\\") : b_("\\"), + b_(" ") : b_(" "), + b_("/") : b_("/"), + b_("%") : b_("%"), + b_("<") : b_("<"), + b_(">") : b_(">"), + b_("[") : b_("["), + b_("]") : b_("]"), + b_("#") : b_("#"), + b_("_") : b_("_"), + b_("&") : b_("&"), + b_('$') : b_('$'), + } + try: + tok = ESCAPE_DICT[tok] + except KeyError: + if tok.isdigit(): + # "The number ddd may consist of one, two, or three + # octal digits; high-order overflow shall be ignored. + # Three octal digits shall be used, with leading zeros + # as needed, if the next character of the string is also + # a digit." 
(PDF reference 7.3.4.2, p 16) + for i in range(2): + ntok = stream.read(1) + if ntok.isdigit(): + tok += ntok + else: + break + tok = b_(chr(int(tok, base=8))) + elif tok in b_("\n\r"): + # This case is hit when a backslash followed by a line + # break occurs. If it's a multi-char EOL, consume the + # second character: + tok = stream.read(1) + if not tok in b_("\n\r"): + stream.seek(-1, 1) + # Then don't add anything to the actual string, since this + # line break was escaped: + tok = b_('') + else: + raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok) + txt += tok + return createStringObject(txt) + + +## +# Represents a string object where the text encoding could not be determined. +# This occurs quite often, as the PDF spec doesn't provide an alternate way to +# represent strings -- for example, the encryption data stored in files (like +# /O) is clearly not text, but is still stored in a "String" object. +class ByteStringObject(utils.bytes_type, PdfObject): + + ## + # For compatibility with TextStringObject.original_bytes. This method + # returns self. + original_bytes = property(lambda self: self) + + def writeToStream(self, stream, encryption_key): + bytearr = self + if encryption_key: + bytearr = RC4_encrypt(encryption_key, bytearr) + stream.write(b_("<")) + stream.write(utils.hexencode(bytearr)) + stream.write(b_(">")) + + +## +# Represents a string object that has been decoded into a real unicode string. +# If read from a PDF document, this string appeared to match the +# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to +# occur. +class TextStringObject(utils.string_type, PdfObject): + autodetect_pdfdocencoding = False + autodetect_utf16 = False + + ## + # It is occasionally possible that a text string object gets created where + # a byte string object was expected due to the autodetection mechanism -- + # if that occurs, this "original_bytes" property can be used to + # back-calculate what the original encoded bytes were. + original_bytes = property(lambda self: self.get_original_bytes()) + + def get_original_bytes(self): + # We're a text string object, but the library is trying to get our raw + # bytes. This can happen if we auto-detected this string as text, but + # we were wrong. It's pretty common. Return the original bytes that + # would have been used to create this object, based upon the autodetect + # method. + if self.autodetect_utf16: + return codecs.BOM_UTF16_BE + self.encode("utf-16be") + elif self.autodetect_pdfdocencoding: + return encode_pdfdocencoding(self) + else: + raise Exception("no information about original bytes") + + def writeToStream(self, stream, encryption_key): + # Try to write the string out as a PDFDocEncoding encoded string. It's + # nicer to look at in the PDF file. Sadly, we take a performance hit + # here for trying... 
+ try: + bytearr = encode_pdfdocencoding(self) + except UnicodeEncodeError: + bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") + if encryption_key: + bytearr = RC4_encrypt(encryption_key, bytearr) + obj = ByteStringObject(bytearr) + obj.writeToStream(stream, None) + else: + stream.write(b_("(")) + for c in bytearr: + if not chr_(c).isalnum() and c != b_(' '): + stream.write(b_("\\%03o" % ord_(c))) + else: + stream.write(b_(chr_(c))) + stream.write(b_(")")) + + +class NameObject(str, PdfObject): + delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) + surfix = b_("/") + + def writeToStream(self, stream, encryption_key): + stream.write(b_(self)) + + def readFromStream(stream, pdf): + debug = False + if debug: print((stream.tell())) + name = stream.read(1) + if name != NameObject.surfix: + raise utils.PdfReadError("name read error") + name += utils.readUntilRegex(stream, NameObject.delimiterPattern, + ignore_eof=True) + if debug: print(name) + try: + return NameObject(name.decode('utf-8')) + except (UnicodeEncodeError, UnicodeDecodeError) as e: + # Name objects should represent irregular characters + # with a '#' followed by the symbol's hex number + if not pdf.strict: + warnings.warn("Illegal character in Name Object", utils.PdfReadWarning) + return NameObject(name) + else: + raise utils.PdfReadError("Illegal character in Name Object") + + readFromStream = staticmethod(readFromStream) + + +class DictionaryObject(dict, PdfObject): + def raw_get(self, key): + return dict.__getitem__(self, key) + + def __setitem__(self, key, value): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.__setitem__(self, key, value) + + def setdefault(self, key, value=None): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.setdefault(self, key, value) + + def __getitem__(self, key): + return dict.__getitem__(self, key).getObject() + + ## + # Retrieves XMP (Extensible Metadata Platform) data relevant to the + # this object, if available. + # <p> + # Stability: Added in v1.12, will exist for all future v1.x releases. + # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance + # that can be used to access XMP metadata from the document. Can also + # return None if no metadata was found on the document root. + def getXmpMetadata(self): + metadata = self.get("/Metadata", None) + if metadata == None: + return None + metadata = metadata.getObject() + from . import xmp + if not isinstance(metadata, xmp.XmpInformation): + metadata = xmp.XmpInformation(metadata) + self[NameObject("/Metadata")] = metadata + return metadata + + ## + # Read-only property that accesses the {@link + # #DictionaryObject.getXmpData getXmpData} function. + # <p> + # Stability: Added in v1.12, will exist for all future v1.x releases. 
+ xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) + + def writeToStream(self, stream, encryption_key): + stream.write(b_("<<\n")) + for key, value in list(self.items()): + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value.writeToStream(stream, encryption_key) + stream.write(b_("\n")) + stream.write(b_(">>")) + + def readFromStream(stream, pdf): + debug = False + tmp = stream.read(2) + if tmp != b_("<<"): + raise utils.PdfReadError("Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell())) + data = {} + while True: + tok = readNonWhitespace(stream) + if tok == b_('\x00'): + continue + elif tok == b_('%'): + stream.seek(-1, 1) + skipOverComment(stream) + continue + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + + if debug: print(("Tok:", tok)) + if tok == b_(">"): + stream.read(1) + break + stream.seek(-1, 1) + key = readObject(stream, pdf) + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + value = readObject(stream, pdf) + if not data.get(key): + data[key] = value + elif pdf.strict: + # multiple definitions of key not permitted + raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \ + % (utils.hexStr(stream.tell()), key)) + else: + warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \ + % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning) + + pos = stream.tell() + s = readNonWhitespace(stream) + if s == b_('s') and stream.read(5) == b_('tream'): + eol = stream.read(1) + # odd PDF file output has spaces after 'stream' keyword but before EOL. + # patch provided by Danial Sandler + while eol == b_(' '): + eol = stream.read(1) + assert eol in (b_("\n"), b_("\r")) + if eol == b_("\r"): + # read \n after + if stream.read(1) != b_('\n'): + stream.seek(-1, 1) + # this is a stream object, not a dictionary + assert "/Length" in data + length = data["/Length"] + if debug: print(data) + if isinstance(length, IndirectObject): + t = stream.tell() + length = pdf.getObject(length) + stream.seek(t, 0) + data["__streamdata__"] = stream.read(length) + if debug: print("here") + #if debug: print(binascii.hexlify(data["__streamdata__"])) + e = readNonWhitespace(stream) + ndstream = stream.read(8) + if (e + ndstream) != b_("endstream"): + # (sigh) - the odd PDF file has a length that is too long, so + # we need to read backwards to find the "endstream" ending. + # ReportLab (unknown version) generates files with this bug, + # and Python users into PDF files tend to be our audience. + # we need to do this to correct the streamdata and chop off + # an extra character. + pos = stream.tell() + stream.seek(-10, 1) + end = stream.read(9) + if end == b_("endstream"): + # we found it by looking back one character further. + data["__streamdata__"] = data["__streamdata__"][:-1] + else: + if debug: print(("E", e, ndstream, debugging.toHex(end))) + stream.seek(pos, 0) + raise utils.PdfReadError("Unable to find 'endstream' marker after stream at byte %s." 
% utils.hexStr(stream.tell())) + else: + stream.seek(pos, 0) + if "__streamdata__" in data: + return StreamObject.initializeFromDictionary(data) + else: + retval = DictionaryObject() + retval.update(data) + return retval + readFromStream = staticmethod(readFromStream) + + +class TreeObject(DictionaryObject): + def __init__(self): + DictionaryObject.__init__(self) + + def hasChildren(self): + return '/First' in self + + def __iter__(self): + return self.children() + + def children(self): + if not self.hasChildren(): + raise StopIteration + + child = self['/First'] + while True: + yield child + if child == self['/Last']: + raise StopIteration + child = child['/Next'] + + def addChild(self, child, pdf): + childObj = child.getObject() + child = pdf.getReference(childObj) + assert isinstance(child, IndirectObject) + + if '/First' not in self: + self[NameObject('/First')] = child + self[NameObject('/Count')] = NumberObject(0) + prev = None + else: + prev = self['/Last'] + + self[NameObject('/Last')] = child + self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] + 1) + + if prev: + prevRef = pdf.getReference(prev) + assert isinstance(prevRef, IndirectObject) + childObj[NameObject('/Prev')] = prevRef + prev[NameObject('/Next')] = child + + parentRef = pdf.getReference(self) + assert isinstance(parentRef, IndirectObject) + childObj[NameObject('/Parent')] = parentRef + + def removeChild(self, child): + childObj = child.getObject() + + if NameObject('/Parent') not in childObj: + raise ValueError("Removed child does not appear to be a tree item") + elif childObj[NameObject('/Parent')] != self: + raise ValueError("Removed child is not a member of this tree") + + found = False + prevRef = None + prev = None + curRef = self[NameObject('/First')] + cur = curRef.getObject() + lastRef = self[NameObject('/Last')] + last = lastRef.getObject() + while cur != None: + if cur == childObj: + if prev == None: + if NameObject('/Next') in cur: + # Removing first tree node + nextRef = cur[NameObject('/Next')] + next = nextRef.getObject() + del next[NameObject('/Prev')] + self[NameObject('/First')] = nextRef + self[NameObject('/Count')] = self[NameObject('/Count')] - 1 + + else: + # Removing only tree node + assert self[NameObject('/Count')] == 1 + del self[NameObject('/Count')] + del self[NameObject('/First')] + if NameObject('/Last') in self: + del self[NameObject('/Last')] + else: + if NameObject('/Next') in cur: + # Removing middle tree node + nextRef = cur[NameObject('/Next')] + next = nextRef.getObject() + next[NameObject('/Prev')] = prevRef + prev[NameObject('/Next')] = nextRef + self[NameObject('/Count')] = self[NameObject('/Count')] - 1 + else: + # Removing last tree node + assert cur == last + del prev[NameObject('/Next')] + self[NameObject('/Last')] = prevRef + self[NameObject('/Count')] = self[NameObject('/Count')] - 1 + found = True + break + + prevRef = curRef + prev = cur + if NameObject('/Next') in cur: + curRef = cur[NameObject('/Next')] + cur = curRef.getObject() + else: + curRef = None + cur = None + + if not found: + raise ValueError("Removal couldn't find item in tree") + + del childObj[NameObject('/Parent')] + if NameObject('/Next') in childObj: + del childObj[NameObject('/Next')] + if NameObject('/Prev') in childObj: + del childObj[NameObject('/Prev')] + + def emptyTree(self): + for child in self: + childObj = child.getObject() + del childObj[NameObject('/Parent')] + if NameObject('/Next') in childObj: + del childObj[NameObject('/Next')] + if NameObject('/Prev') in childObj: + del 
childObj[NameObject('/Prev')] + + if NameObject('/Count') in self: + del self[NameObject('/Count')] + if NameObject('/First') in self: + del self[NameObject('/First')] + if NameObject('/Last') in self: + del self[NameObject('/Last')] + + +class StreamObject(DictionaryObject): + def __init__(self): + self._data = None + self.decodedSelf = None + + def writeToStream(self, stream, encryption_key): + self[NameObject("/Length")] = NumberObject(len(self._data)) + DictionaryObject.writeToStream(self, stream, encryption_key) + del self["/Length"] + stream.write(b_("\nstream\n")) + data = self._data + if encryption_key: + data = RC4_encrypt(encryption_key, data) + stream.write(data) + stream.write(b_("\nendstream")) + + def initializeFromDictionary(data): + if "/Filter" in data: + retval = EncodedStreamObject() + else: + retval = DecodedStreamObject() + retval._data = data["__streamdata__"] + del data["__streamdata__"] + del data["/Length"] + retval.update(data) + return retval + initializeFromDictionary = staticmethod(initializeFromDictionary) + + def flateEncode(self): + if "/Filter" in self: + f = self["/Filter"] + if isinstance(f, ArrayObject): + f.insert(0, NameObject("/FlateDecode")) + else: + newf = ArrayObject() + newf.append(NameObject("/FlateDecode")) + newf.append(f) + f = newf + else: + f = NameObject("/FlateDecode") + retval = EncodedStreamObject() + retval[NameObject("/Filter")] = f + retval._data = filters.FlateDecode.encode(self._data) + return retval + + +class DecodedStreamObject(StreamObject): + def getData(self): + return self._data + + def setData(self, data): + self._data = data + + +class EncodedStreamObject(StreamObject): + def __init__(self): + self.decodedSelf = None + + def getData(self): + if self.decodedSelf: + # cached version of decoded object + return self.decodedSelf.getData() + else: + # create decoded object + decoded = DecodedStreamObject() + + decoded._data = filters.decodeStreamData(self) + for key, value in list(self.items()): + if not key in ("/Length", "/Filter", "/DecodeParms"): + decoded[key] = value + self.decodedSelf = decoded + return decoded._data + + def setData(self, data): + raise utils.PdfReadError("Creating EncodedStreamObject is not currently supported") + + +class RectangleObject(ArrayObject): + """ + This class is used to represent *page boxes* in PyPDF2. 
These boxes include: + + * :attr:`artBox <PyPDF2.pdf.PageObject.artBox>` + * :attr:`bleedBox <PyPDF2.pdf.PageObject.bleedBox>` + * :attr:`cropBox <PyPDF2.pdf.PageObject.cropBox>` + * :attr:`mediaBox <PyPDF2.pdf.PageObject.mediaBox>` + * :attr:`trimBox <PyPDF2.pdf.PageObject.trimBox>` + """ + def __init__(self, arr): + # must have four points + assert len(arr) == 4 + # automatically convert arr[x] into NumberObject(arr[x]) if necessary + ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr]) + + def ensureIsNumber(self, value): + if not isinstance(value, (NumberObject, FloatObject)): + value = FloatObject(value) + return value + + def __repr__(self): + return "RectangleObject(%s)" % repr(list(self)) + + def getLowerLeft_x(self): + return self[0] + + def getLowerLeft_y(self): + return self[1] + + def getUpperRight_x(self): + return self[2] + + def getUpperRight_y(self): + return self[3] + + def getUpperLeft_x(self): + return self.getLowerLeft_x() + + def getUpperLeft_y(self): + return self.getUpperRight_y() + + def getLowerRight_x(self): + return self.getUpperRight_x() + + def getLowerRight_y(self): + return self.getLowerLeft_y() + + def getLowerLeft(self): + return self.getLowerLeft_x(), self.getLowerLeft_y() + + def getLowerRight(self): + return self.getLowerRight_x(), self.getLowerRight_y() + + def getUpperLeft(self): + return self.getUpperLeft_x(), self.getUpperLeft_y() + + def getUpperRight(self): + return self.getUpperRight_x(), self.getUpperRight_y() + + def setLowerLeft(self, value): + self[0], self[1] = [self.ensureIsNumber(x) for x in value] + + def setLowerRight(self, value): + self[2], self[1] = [self.ensureIsNumber(x) for x in value] + + def setUpperLeft(self, value): + self[0], self[3] = [self.ensureIsNumber(x) for x in value] + + def setUpperRight(self, value): + self[2], self[3] = [self.ensureIsNumber(x) for x in value] + + def getWidth(self): + return self.getUpperRight_x() - self.getLowerLeft_x() + + def getHeight(self): + return self.getUpperRight_y() - self.getLowerLeft_y() + + lowerLeft = property(getLowerLeft, setLowerLeft, None, None) + """ + Property to read and modify the lower left coordinate of this box + in (x,y) form. + """ + lowerRight = property(getLowerRight, setLowerRight, None, None) + """ + Property to read and modify the lower right coordinate of this box + in (x,y) form. + """ + upperLeft = property(getUpperLeft, setUpperLeft, None, None) + """ + Property to read and modify the upper left coordinate of this box + in (x,y) form. + """ + upperRight = property(getUpperRight, setUpperRight, None, None) + """ + Property to read and modify the upper right coordinate of this box + in (x,y) form. + """ + + +class Field(TreeObject): + """ + A class representing a field dictionary. This class is accessed through + :meth:`getFields()<PyPDF2.PdfFileReader.getFields>` + """ + def __init__(self, data): + DictionaryObject.__init__(self) + attributes = ("/FT", "/Parent", "/Kids", "/T", "/TU", "/TM", "/Ff", + "/V", "/DV", "/AA") + for attr in attributes: + try: + self[NameObject(attr)] = data[attr] + except KeyError: + pass + + fieldType = property(lambda self: self.get("/FT")) + """ + Read-only property accessing the type of this field. + """ + + parent = property(lambda self: self.get("/Parent")) + """ + Read-only property accessing the parent of this field. + """ + + kids = property(lambda self: self.get("/Kids")) + """ + Read-only property accessing the kids of this field. 
+ """ + + name = property(lambda self: self.get("/T")) + """ + Read-only property accessing the name of this field. + """ + + altName = property(lambda self: self.get("/TU")) + """ + Read-only property accessing the alternate name of this field. + """ + + mappingName = property(lambda self: self.get("/TM")) + """ + Read-only property accessing the mapping name of this field. This + name is used by PyPDF2 as a key in the dictionary returned by + :meth:`getFields()<PyPDF2.PdfFileReader.getFields>` + """ + + flags = property(lambda self: self.get("/Ff")) + """ + Read-only property accessing the field flags, specifying various + characteristics of the field (see Table 8.70 of the PDF 1.7 reference). + """ + + value = property(lambda self: self.get("/V")) + """ + Read-only property accessing the value of this field. Format + varies based on field type. + """ + + defaultValue = property(lambda self: self.get("/DV")) + """ + Read-only property accessing the default value of this field. + """ + + additionalActions = property(lambda self: self.get("/AA")) + """ + Read-only property accessing the additional actions dictionary. + This dictionary defines the field's behavior in response to trigger events. + See Section 8.5.2 of the PDF 1.7 reference. + """ + + +class Destination(TreeObject): + """ + A class representing a destination within a PDF file. + See section 8.2.1 of the PDF 1.6 reference. + + :param str title: Title of this destination. + :param int page: Page number of this destination. + :param str typ: How the destination is displayed. + :param args: Additional arguments may be necessary depending on the type. + :raises PdfReadError: If destination type is invalid. + + Valid ``typ`` arguments (see PDF spec for details): + /Fit No additional arguments + /XYZ [left] [top] [zoomFactor] + /FitH [top] + /FitV [left] + /FitR [left] [bottom] [right] [top] + /FitB No additional arguments + /FitBH [top] + /FitBV [left] + """ + def __init__(self, title, page, typ, *args): + DictionaryObject.__init__(self) + self[NameObject("/Title")] = title + self[NameObject("/Page")] = page + self[NameObject("/Type")] = typ + + # from table 8.2 of the PDF 1.7 reference. + if typ == "/XYZ": + (self[NameObject("/Left")], self[NameObject("/Top")], + self[NameObject("/Zoom")]) = args + elif typ == "/FitR": + (self[NameObject("/Left")], self[NameObject("/Bottom")], + self[NameObject("/Right")], self[NameObject("/Top")]) = args + elif typ in ["/FitH", "/FitBH"]: + self[NameObject("/Top")], = args + elif typ in ["/FitV", "/FitBV"]: + self[NameObject("/Left")], = args + elif typ in ["/Fit", "/FitB"]: + pass + else: + raise utils.PdfReadError("Unknown Destination Type: %r" % typ) + + def getDestArray(self): + return ArrayObject([self.raw_get('/Page'), self['/Type']] + [self[x] for x in ['/Left', '/Bottom', '/Right', '/Top', '/Zoom'] if x in self]) + + def writeToStream(self, stream, encryption_key): + stream.write(b_("<<\n")) + key = NameObject('/D') + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value = self.getDestArray() + value.writeToStream(stream, encryption_key) + + key = NameObject("/S") + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value = NameObject("/GoTo") + value.writeToStream(stream, encryption_key) + + stream.write(b_("\n")) + stream.write(b_(">>")) + + title = property(lambda self: self.get("/Title")) + """ + Read-only property accessing the destination title. 
+ + :rtype: str + """ + + page = property(lambda self: self.get("/Page")) + """ + Read-only property accessing the destination page number. + + :rtype: int + """ + + typ = property(lambda self: self.get("/Type")) + """ + Read-only property accessing the destination type. + + :rtype: str + """ + + zoom = property(lambda self: self.get("/Zoom", None)) + """ + Read-only property accessing the zoom factor. + + :rtype: int, or ``None`` if not available. + """ + + left = property(lambda self: self.get("/Left", None)) + """ + Read-only property accessing the left horizontal coordinate. + + :rtype: int, or ``None`` if not available. + """ + + right = property(lambda self: self.get("/Right", None)) + """ + Read-only property accessing the right horizontal coordinate. + + :rtype: int, or ``None`` if not available. + """ + + top = property(lambda self: self.get("/Top", None)) + """ + Read-only property accessing the top vertical coordinate. + + :rtype: int, or ``None`` if not available. + """ + + bottom = property(lambda self: self.get("/Bottom", None)) + """ + Read-only property accessing the bottom vertical coordinate. + + :rtype: int, or ``None`` if not available. + """ + + +class Bookmark(Destination): + def writeToStream(self, stream, encryption_key): + stream.write(b_("<<\n")) + for key in [NameObject(x) for x in ['/Title', '/Parent', '/First', '/Last', '/Next', '/Prev'] if x in self]: + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value = self.raw_get(key) + value.writeToStream(stream, encryption_key) + stream.write(b_("\n")) + key = NameObject('/Dest') + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value = self.getDestArray() + value.writeToStream(stream, encryption_key) + stream.write(b_("\n")) + stream.write(b_(">>")) + + +def encode_pdfdocencoding(unicode_string): + retval = b_('') + for c in unicode_string: + try: + retval += b_(chr(_pdfDocEncoding_rev[c])) + except KeyError: + raise UnicodeEncodeError("pdfdocencoding", c, -1, -1, + "does not exist in translation table") + return retval + + +def decode_pdfdocencoding(byte_array): + retval = u_('') + for b in byte_array: + c = _pdfDocEncoding[ord_(b)] + if c == u_('\u0000'): + raise UnicodeDecodeError("pdfdocencoding", utils.barray(b), -1, -1, + "does not exist in translation table") + retval += c + return retval + +_pdfDocEncoding = ( + u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), + u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), + u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), + u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'), + u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'), + u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'), + u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'), + u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'), + u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'), + u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'), + u_('\u0050'), u_('\u0051'), 
u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'), + u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'), + u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'), + u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'), + u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'), + u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'), + u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'), + u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'), + u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'), + u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'), + u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'), + u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'), + u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'), + u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'), + u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'), + u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'), + u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'), + u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'), + u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'), + u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'), + u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'), + u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff') +) + +assert len(_pdfDocEncoding) == 256 + +_pdfDocEncoding_rev = {} +for i in range(256): + char = _pdfDocEncoding[i] + if char == u_("\u0000"): + continue + assert char not in _pdfDocEncoding_rev + _pdfDocEncoding_rev[char] = i diff --git a/PdfFileTransformer/PyPDF2/merger.py b/PdfFileTransformer/PyPDF2/merger.py new file mode 100644 index 0000000..c3373e4 --- /dev/null +++ b/PdfFileTransformer/PyPDF2/merger.py @@ -0,0 +1,553 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from .generic import * +from .utils import isString, str_ +from .pdf import PdfFileReader, PdfFileWriter +from .pagerange import PageRange +from sys import version_info +if version_info < ( 3, 0 ): + from cStringIO import StringIO + StreamIO = StringIO +else: + from io import BytesIO + from io import FileIO as file + StreamIO = BytesIO + + +class _MergedPage(object): + """ + _MergedPage is used internally by PdfFileMerger to collect necessary + information on each page that is being merged. + """ + def __init__(self, pagedata, src, id): + self.src = src + self.pagedata = pagedata + self.out_pagedata = None + self.id = id + + +class PdfFileMerger(object): + """ + Initializes a PdfFileMerger object. PdfFileMerger merges multiple PDFs + into a single PDF. It can concatenate, slice, insert, or any combination + of the above. + + See the functions :meth:`merge()<merge>` (or :meth:`append()<append>`) + and :meth:`write()<write>` for usage information. + + :param bool strict: Determines whether user should be warned of all + problems and also causes some correctable problems to be fatal. + Defaults to ``True``. + """ + + def __init__(self, strict=True): + self.inputs = [] + self.pages = [] + self.output = PdfFileWriter() + self.bookmarks = [] + self.named_dests = [] + self.id_count = 0 + self.strict = strict + + def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True): + """ + Merges the pages from the given file into the output file at the + specified page number. + + :param int position: The *page number* to insert this file. File will + be inserted after the given number. + + :param fileobj: A File Object or an object that supports the standard read + and seek methods similar to a File Object. Could also be a + string representing a path to a PDF file. + + :param str bookmark: Optionally, you may specify a bookmark to be applied at + the beginning of the included file by supplying the text of the bookmark. + + :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple + to merge only the specified range of pages from the source + document into the output document. + + :param bool import_bookmarks: You may prevent the source document's bookmarks + from being imported by specifying this as ``False``. + """ + + # This parameter is passed to self.inputs.append and means + # that the stream used was created in this method. + my_file = False + + # If the fileobj parameter is a string, assume it is a path + # and create a file object at that location. 
If it is a file, + # copy the file's contents into a BytesIO (or StreamIO) stream object; if + # it is a PdfFileReader, copy that reader's stream into a + # BytesIO (or StreamIO) stream. + # If fileobj is none of the above types, it is not modified + decryption_key = None + if isString(fileobj): + fileobj = file(fileobj, 'rb') + my_file = True + elif hasattr(fileobj, "seek") and hasattr(fileobj, "read"): + fileobj.seek(0) + filecontent = fileobj.read() + fileobj = StreamIO(filecontent) + my_file = True + elif isinstance(fileobj, PdfFileReader): + orig_tell = fileobj.stream.tell() + fileobj.stream.seek(0) + filecontent = StreamIO(fileobj.stream.read()) + fileobj.stream.seek(orig_tell) # reset the stream to its original location + fileobj = filecontent + if hasattr(fileobj, '_decryption_key'): + decryption_key = fileobj._decryption_key + my_file = True + + # Create a new PdfFileReader instance using the stream + # (either file or BytesIO or StringIO) created above + pdfr = PdfFileReader(fileobj, strict=self.strict) + if decryption_key is not None: + pdfr._decryption_key = decryption_key + + # Find the range of pages to merge. + if pages == None: + pages = (0, pdfr.getNumPages()) + elif isinstance(pages, PageRange): + pages = pages.indices(pdfr.getNumPages()) + elif not isinstance(pages, tuple): + raise TypeError('"pages" must be a tuple of (start, stop[, step])') + + srcpages = [] + if bookmark: + bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit')) + + outline = [] + if import_bookmarks: + outline = pdfr.getOutlines() + outline = self._trim_outline(pdfr, outline, pages) + + if bookmark: + self.bookmarks += [bookmark, outline] + else: + self.bookmarks += outline + + dests = pdfr.namedDestinations + dests = self._trim_dests(pdfr, dests, pages) + self.named_dests += dests + + # Gather all the pages that are going to be merged + for i in range(*pages): + pg = pdfr.getPage(i) + + id = self.id_count + self.id_count += 1 + + mp = _MergedPage(pg, pdfr, id) + + srcpages.append(mp) + + self._associate_dests_to_pages(srcpages) + self._associate_bookmarks_to_pages(srcpages) + + # Slice to insert the pages at the specified position + self.pages[position:position] = srcpages + + # Keep track of our input files so we can close them later + self.inputs.append((fileobj, pdfr, my_file)) + + def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True): + """ + Identical to the :meth:`merge()<merge>` method, but assumes you want to concatenate + all pages onto the end of the file instead of specifying a position. + + :param fileobj: A File Object or an object that supports the standard read + and seek methods similar to a File Object. Could also be a + string representing a path to a PDF file. + + :param str bookmark: Optionally, you may specify a bookmark to be applied at + the beginning of the included file by supplying the text of the bookmark. + + :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple + to merge only the specified range of pages from the source + document into the output document. + + :param bool import_bookmarks: You may prevent the source document's bookmarks + from being imported by specifying this as ``False``. + """ + + self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks) + + def write(self, fileobj): + """ + Writes all data that has been merged to the given output file. + + :param fileobj: Output file. Can be a filename or any kind of + file-like object. 
+ """ + my_file = False + if isString(fileobj): + fileobj = file(fileobj, 'wb') + my_file = True + + # Add pages to the PdfFileWriter + # The commented out line below was replaced with the two lines below it to allow PdfFileMerger to work with PyPdf 1.13 + for page in self.pages: + self.output.addPage(page.pagedata) + page.out_pagedata = self.output.getReference(self.output._pages.getObject()["/Kids"][-1].getObject()) + #idnum = self.output._objects.index(self.output._pages.getObject()["/Kids"][-1].getObject()) + 1 + #page.out_pagedata = IndirectObject(idnum, 0, self.output) + + # Once all pages are added, create bookmarks to point at those pages + self._write_dests() + self._write_bookmarks() + + # Write the output to the file + self.output.write(fileobj) + + if my_file: + fileobj.close() + + def close(self): + """ + Shuts all file descriptors (input and output) and clears all memory + usage. + """ + self.pages = [] + for fo, pdfr, mine in self.inputs: + if mine: + fo.close() + + self.inputs = [] + self.output = None + + def addMetadata(self, infos): + """ + Add custom metadata to the output. + + :param dict infos: a Python dictionary where each key is a field + and each value is your new metadata. + Example: ``{u'/Title': u'My title'}`` + """ + self.output.addMetadata(infos) + + def setPageLayout(self, layout): + """ + Set the page layout + + :param str layout: The page layout to be used + + Valid layouts are: + /NoLayout Layout explicitly not specified + /SinglePage Show one page at a time + /OneColumn Show one column at a time + /TwoColumnLeft Show pages in two columns, odd-numbered pages on the left + /TwoColumnRight Show pages in two columns, odd-numbered pages on the right + /TwoPageLeft Show two pages at a time, odd-numbered pages on the left + /TwoPageRight Show two pages at a time, odd-numbered pages on the right + """ + self.output.setPageLayout(layout) + + def setPageMode(self, mode): + """ + Set the page mode. + + :param str mode: The page mode to use. + + Valid modes are: + /UseNone Do not show outlines or thumbnails panels + /UseOutlines Show outlines (aka bookmarks) panel + /UseThumbs Show page thumbnails panel + /FullScreen Fullscreen view + /UseOC Show Optional Content Group (OCG) panel + /UseAttachments Show attachments panel + """ + self.output.setPageMode(mode) + + def _trim_dests(self, pdf, dests, pages): + """ + Removes any named destinations that are not a part of the specified + page set. + """ + new_dests = [] + prev_header_added = True + for k, o in list(dests.items()): + for j in range(*pages): + if pdf.getPage(j).getObject() == o['/Page'].getObject(): + o[NameObject('/Page')] = o['/Page'].getObject() + assert str_(k) == str_(o['/Title']) + new_dests.append(o) + break + return new_dests + + def _trim_outline(self, pdf, outline, pages): + """ + Removes any outline/bookmark entries that are not a part of the + specified page set. 
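+
+        ``pages`` is a ``(start, stop[, step])`` tuple as produced by
+        :meth:`merge()<merge>`; an entry survives only if its target page
+        lies in ``range(*pages)``, and a parent entry is kept only when at
+        least one of its children survives the trim.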
+ """ + new_outline = [] + prev_header_added = True + for i, o in enumerate(outline): + if isinstance(o, list): + sub = self._trim_outline(pdf, o, pages) + if sub: + if not prev_header_added: + new_outline.append(outline[i-1]) + new_outline.append(sub) + else: + prev_header_added = False + for j in range(*pages): + if pdf.getPage(j).getObject() == o['/Page'].getObject(): + o[NameObject('/Page')] = o['/Page'].getObject() + new_outline.append(o) + prev_header_added = True + break + return new_outline + + def _write_dests(self): + dests = self.named_dests + + for v in dests: + pageno = None + pdf = None + if '/Page' in v: + for i, p in enumerate(self.pages): + if p.id == v['/Page']: + v[NameObject('/Page')] = p.out_pagedata + pageno = i + pdf = p.src + break + if pageno != None: + self.output.addNamedDestinationObject(v) + + def _write_bookmarks(self, bookmarks=None, parent=None): + + if bookmarks == None: + bookmarks = self.bookmarks + + last_added = None + for b in bookmarks: + if isinstance(b, list): + self._write_bookmarks(b, last_added) + continue + + pageno = None + pdf = None + if '/Page' in b: + for i, p in enumerate(self.pages): + if p.id == b['/Page']: + #b[NameObject('/Page')] = p.out_pagedata + args = [NumberObject(p.id), NameObject(b['/Type'])] + #nothing more to add + #if b['/Type'] == '/Fit' or b['/Type'] == '/FitB' + if b['/Type'] == '/FitH' or b['/Type'] == '/FitBH': + if '/Top' in b and not isinstance(b['/Top'], NullObject): + args.append(FloatObject(b['/Top'])) + else: + args.append(FloatObject(0)) + del b['/Top'] + elif b['/Type'] == '/FitV' or b['/Type'] == '/FitBV': + if '/Left' in b and not isinstance(b['/Left'], NullObject): + args.append(FloatObject(b['/Left'])) + else: + args.append(FloatObject(0)) + del b['/Left'] + elif b['/Type'] == '/XYZ': + if '/Left' in b and not isinstance(b['/Left'], NullObject): + args.append(FloatObject(b['/Left'])) + else: + args.append(FloatObject(0)) + if '/Top' in b and not isinstance(b['/Top'], NullObject): + args.append(FloatObject(b['/Top'])) + else: + args.append(FloatObject(0)) + if '/Zoom' in b and not isinstance(b['/Zoom'], NullObject): + args.append(FloatObject(b['/Zoom'])) + else: + args.append(FloatObject(0)) + del b['/Top'], b['/Zoom'], b['/Left'] + elif b['/Type'] == '/FitR': + if '/Left' in b and not isinstance(b['/Left'], NullObject): + args.append(FloatObject(b['/Left'])) + else: + args.append(FloatObject(0)) + if '/Bottom' in b and not isinstance(b['/Bottom'], NullObject): + args.append(FloatObject(b['/Bottom'])) + else: + args.append(FloatObject(0)) + if '/Right' in b and not isinstance(b['/Right'], NullObject): + args.append(FloatObject(b['/Right'])) + else: + args.append(FloatObject(0)) + if '/Top' in b and not isinstance(b['/Top'], NullObject): + args.append(FloatObject(b['/Top'])) + else: + args.append(FloatObject(0)) + del b['/Left'], b['/Right'], b['/Bottom'], b['/Top'] + + b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)}) + + pageno = i + pdf = p.src + break + if pageno != None: + del b['/Page'], b['/Type'] + last_added = self.output.addBookmarkDict(b, parent) + + def _associate_dests_to_pages(self, pages): + for nd in self.named_dests: + pageno = None + np = nd['/Page'] + + if isinstance(np, NumberObject): + continue + + for p in pages: + if np.getObject() == p.pagedata.getObject(): + pageno = p.id + + if pageno != None: + nd[NameObject('/Page')] = NumberObject(pageno) + else: + raise ValueError("Unresolved named destination '%s'" % 
(nd['/Title'],)) + + def _associate_bookmarks_to_pages(self, pages, bookmarks=None): + if bookmarks == None: + bookmarks = self.bookmarks + + for b in bookmarks: + if isinstance(b, list): + self._associate_bookmarks_to_pages(pages, b) + continue + + pageno = None + bp = b['/Page'] + + if isinstance(bp, NumberObject): + continue + + for p in pages: + if bp.getObject() == p.pagedata.getObject(): + pageno = p.id + + if pageno != None: + b[NameObject('/Page')] = NumberObject(pageno) + else: + raise ValueError("Unresolved bookmark '%s'" % (b['/Title'],)) + + def findBookmark(self, bookmark, root=None): + if root == None: + root = self.bookmarks + + for i, b in enumerate(root): + if isinstance(b, list): + res = self.findBookmark(bookmark, b) + if res: + return [i] + res + elif b == bookmark or b['/Title'] == bookmark: + return [i] + + return None + + def addBookmark(self, title, pagenum, parent=None): + """ + Add a bookmark to this PDF file. + + :param str title: Title to use for this bookmark. + :param int pagenum: Page number this bookmark will point to. + :param parent: A reference to a parent bookmark to create nested + bookmarks. + """ + if parent == None: + iloc = [len(self.bookmarks)-1] + elif isinstance(parent, list): + iloc = parent + else: + iloc = self.findBookmark(parent) + + dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826)) + + if parent == None: + self.bookmarks.append(dest) + else: + bmparent = self.bookmarks + for i in iloc[:-1]: + bmparent = bmparent[i] + npos = iloc[-1]+1 + if npos < len(bmparent) and isinstance(bmparent[npos], list): + bmparent[npos].append(dest) + else: + bmparent.insert(npos, [dest]) + return dest + + def addNamedDestination(self, title, pagenum): + """ + Add a destination to the output. + + :param str title: Title to use + :param int pagenum: Page number this destination points at. + """ + + dest = Destination(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826)) + self.named_dests.append(dest) + + +class OutlinesObject(list): + def __init__(self, pdf, tree, parent=None): + list.__init__(self) + self.tree = tree + self.pdf = pdf + self.parent = parent + + def remove(self, index): + obj = self[index] + del self[index] + self.tree.removeChild(obj) + + def add(self, title, pagenum): + pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum] + action = DictionaryObject() + action.update({ + NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]), + NameObject('/S') : NameObject('/GoTo') + }) + actionRef = self.pdf._addObject(action) + bookmark = TreeObject() + + bookmark.update({ + NameObject('/A'): actionRef, + NameObject('/Title'): createStringObject(title), + }) + + self.pdf._addObject(bookmark) + + self.tree.addChild(bookmark) + + def removeAll(self): + for child in [x for x in self.tree.children()]: + self.tree.removeChild(child) + self.pop() diff --git a/PdfFileTransformer/PyPDF2/pagerange.py b/PdfFileTransformer/PyPDF2/pagerange.py new file mode 100644 index 0000000..ce96ec5 --- /dev/null +++ b/PdfFileTransformer/PyPDF2/pagerange.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +""" +Representation and utils for ranges of PDF file pages. + +Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>. +All rights reserved. 
This software is available under a BSD license; +see https://github.com/mstamy2/PyPDF2/blob/master/LICENSE +""" + +import re +from .utils import isString + +_INT_RE = r"(0|-?[1-9]\d*)" # A decimal int, don't allow "-0". +PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE) +# groups: 12 34 5 6 7 8 + + +class ParseError(Exception): + pass + + +PAGE_RANGE_HELP = """Remember, page indices start with zero. + Page range expression examples: + : all pages. -1 last page. + 22 just the 23rd page. :-1 all but the last page. + 0:3 the first three pages. -2 second-to-last page. + :3 the first three pages. -2: last two pages. + 5: from the sixth page onward. -3:-1 third & second to last. + The third, "stride" or "step" number is also recognized. + ::2 0 2 4 ... to the end. 3:0:-1 3 2 1 but not 0. + 1:10:2 1 3 5 7 9 2::-1 2 1 0. + ::-1 all pages in reverse order. +""" + + +class PageRange(object): + """ + A slice-like representation of a range of page indices, + i.e. page numbers, only starting at zero. + The syntax is like what you would put between brackets [ ]. + The slice is one of the few Python types that can't be subclassed, + but this class converts to and from slices, and allows similar use. + o PageRange(str) parses a string representing a page range. + o PageRange(slice) directly "imports" a slice. + o to_slice() gives the equivalent slice. + o str() and repr() allow printing. + o indices(n) is like slice.indices(n). + """ + + def __init__(self, arg): + """ + Initialize with either a slice -- giving the equivalent page range, + or a PageRange object -- making a copy, + or a string like + "int", "[int]:[int]" or "[int]:[int]:[int]", + where the brackets indicate optional ints. + {page_range_help} + Note the difference between this notation and arguments to slice(): + slice(3) means the first three pages; + PageRange("3") means the range of only the fourth page. + However PageRange(slice(3)) means the first three pages. + """ + if isinstance(arg, slice): + self._slice = arg + return + + if isinstance(arg, PageRange): + self._slice = arg.to_slice() + return + + m = isString(arg) and re.match(PAGE_RANGE_RE, arg) + if not m: + raise ParseError(arg) + elif m.group(2): + # Special case: just an int means a range of one page. + start = int(m.group(2)) + stop = start + 1 if start != -1 else None + self._slice = slice(start, stop) + else: + self._slice = slice(*[int(g) if g else None + for g in m.group(4, 6, 8)]) + + # Just formatting this when there is __doc__ for __init__ + if __init__.__doc__: + __init__.__doc__ = __init__.__doc__.format(page_range_help=PAGE_RANGE_HELP) + + @staticmethod + def valid(input): + """ True if input is a valid initializer for a PageRange. """ + return isinstance(input, slice) or \ + isinstance(input, PageRange) or \ + (isString(input) + and bool(re.match(PAGE_RANGE_RE, input))) + + def to_slice(self): + """ Return the slice equivalent of this page range. """ + return self._slice + + def __str__(self): + """ A string like "1:2:3". """ + s = self._slice + if s.step == None: + if s.start != None and s.stop == s.start + 1: + return str(s.start) + + indices = s.start, s.stop + else: + indices = s.start, s.stop, s.step + return ':'.join("" if i == None else str(i) for i in indices) + + def __repr__(self): + """ A string like "PageRange('1:2:3')". """ + return "PageRange(" + repr(str(self)) + ")" + + def indices(self, n): + """ + n is the length of the list of pages to choose from. + Returns arguments for range(). See help(slice.indices). 
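+
+        A doctest-style sketch:
+
+        >>> PageRange("2:-1").indices(10)
+        (2, 9, 1)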
+ """ + return self._slice.indices(n) + + +PAGE_RANGE_ALL = PageRange(":") # The range of all pages. + + +def parse_filename_page_ranges(args): + """ + Given a list of filenames and page ranges, return a list of + (filename, page_range) pairs. + First arg must be a filename; other ags are filenames, page-range + expressions, slice objects, or PageRange objects. + A filename not followed by a page range indicates all pages of the file. + """ + pairs = [] + pdf_filename = None + did_page_range = False + for arg in args + [None]: + if PageRange.valid(arg): + if not pdf_filename: + raise ValueError("The first argument must be a filename, " \ + "not a page range.") + + pairs.append( (pdf_filename, PageRange(arg)) ) + did_page_range = True + else: + # New filename or end of list--do all of the previous file? + if pdf_filename and not did_page_range: + pairs.append( (pdf_filename, PAGE_RANGE_ALL) ) + + pdf_filename = arg + did_page_range = False + return pairs diff --git a/PdfFileTransformer/PyPDF2/pdf.py b/PdfFileTransformer/PyPDF2/pdf.py new file mode 100644 index 0000000..3bd0066 --- /dev/null +++ b/PdfFileTransformer/PyPDF2/pdf.py @@ -0,0 +1,3074 @@ +# -*- coding: utf-8 -*- +# +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +""" +A pure-Python PDF library with an increasing number of capabilities. +See README for links to FAQ, documentation, homepage, etc. +""" + +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +__maintainer__ = "Phaseit, Inc." +__maintainer_email = "PyPDF2@phaseit.net" + +import string +import math +import struct +import sys +import uuid +from sys import version_info +if version_info < ( 3, 0 ): + from cStringIO import StringIO +else: + from io import StringIO + +if version_info < ( 3, 0 ): + BytesIO = StringIO +else: + from io import BytesIO + +from . import filters +from . 
import utils +import warnings +import codecs +from .generic import * +from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList +from .utils import isString, b_, u_, ord_, chr_, str_, formatWarning + +if version_info < ( 2, 4 ): + from sets import ImmutableSet as frozenset + +if version_info < ( 2, 5 ): + from md5 import md5 +else: + from hashlib import md5 +import uuid + + +class PdfFileWriter(object): + """ + This class supports writing PDF files out, given pages produced by another + class (typically :class:`PdfFileReader<PdfFileReader>`). + """ + def __init__(self): + self._header = b_("%PDF-1.3") + self._objects = [] # array of indirect objects + + # The root of our page tree node. + pages = DictionaryObject() + pages.update({ + NameObject("/Type"): NameObject("/Pages"), + NameObject("/Count"): NumberObject(0), + NameObject("/Kids"): ArrayObject(), + }) + self._pages = self._addObject(pages) + + # info object + info = DictionaryObject() + info.update({ + NameObject("/Producer"): createStringObject(codecs.BOM_UTF16_BE + u_("PyPDF2").encode('utf-16be')) + }) + self._info = self._addObject(info) + + # root object + root = DictionaryObject() + root.update({ + NameObject("/Type"): NameObject("/Catalog"), + NameObject("/Pages"): self._pages, + }) + self._root = None + self._root_object = root + + def setHeader(self, header): + self._header = header + + def _addObject(self, obj): + self._objects.append(obj) + return IndirectObject(len(self._objects), 0, self) + + def getObject(self, ido): + if ido.pdf != self: + raise ValueError("pdf must be self") + return self._objects[ido.idnum - 1] + + def _addPage(self, page, action): + assert page["/Type"] == "/Page" + page[NameObject("/Parent")] = self._pages + page = self._addObject(page) + pages = self.getObject(self._pages) + action(pages["/Kids"], page) + pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) + + def addPage(self, page): + """ + Adds a page to this PDF file. The page is usually acquired from a + :class:`PdfFileReader<PdfFileReader>` instance. + + :param PageObject page: The page to add to the document. Should be + an instance of :class:`PageObject<PyPDF2.pdf.PageObject>` + """ + self._addPage(page, list.append) + + def insertPage(self, page, index=0): + """ + Insert a page in this PDF file. The page is usually acquired from a + :class:`PdfFileReader<PdfFileReader>` instance. + + :param PageObject page: The page to add to the document. This + argument should be an instance of :class:`PageObject<pdf.PageObject>`. + :param int index: Position at which the page will be inserted. + """ + self._addPage(page, lambda l, p: l.insert(index, p)) + + def getPage(self, pageNumber): + """ + Retrieves a page by number from this PDF file. + + :param int pageNumber: The page number to retrieve + (pages begin at zero) + :return: the page at the index given by *pageNumber* + :rtype: :class:`PageObject<pdf.PageObject>` + """ + pages = self.getObject(self._pages) + # XXX: crude hack + return pages["/Kids"][pageNumber].getObject() + + def getNumPages(self): + """ + :return: the number of pages. + :rtype: int + """ + pages = self.getObject(self._pages) + return int(pages[NameObject("/Count")]) + + def addBlankPage(self, width=None, height=None): + """ + Appends a blank page to this PDF file and returns it. If no page size + is specified, use the size of the last page. + + :param float width: The width of the new page expressed in default user + space units. 
+ :param float height: The height of the new page expressed in default + user space units. + :return: the newly appended page + :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>` + :raises PageSizeNotDefinedError: if width and height are not defined + and previous page does not exist. + """ + page = PageObject.createBlankPage(self, width, height) + self.addPage(page) + return page + + def insertBlankPage(self, width=None, height=None, index=0): + """ + Inserts a blank page to this PDF file and returns it. If no page size + is specified, use the size of the last page. + + :param float width: The width of the new page expressed in default user + space units. + :param float height: The height of the new page expressed in default + user space units. + :param int index: Position to add the page. + :return: the newly appended page + :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>` + :raises PageSizeNotDefinedError: if width and height are not defined + and previous page does not exist. + """ + if width is None or height is None and \ + (self.getNumPages() - 1) >= index: + oldpage = self.getPage(index) + width = oldpage.mediaBox.getWidth() + height = oldpage.mediaBox.getHeight() + page = PageObject.createBlankPage(self, width, height) + self.insertPage(page, index) + return page + + def addJS(self, javascript): + """ + Add Javascript which will launch upon opening this PDF. + + :param str javascript: Your Javascript. + + >>> output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") + # Example: This will launch the print window when the PDF is opened. + """ + js = DictionaryObject() + js.update({ + NameObject("/Type"): NameObject("/Action"), + NameObject("/S"): NameObject("/JavaScript"), + NameObject("/JS"): NameObject("(%s)" % javascript) + }) + js_indirect_object = self._addObject(js) + + # We need a name for parameterized javascript in the pdf file, but it can be anything. + js_string_name = str(uuid.uuid4()) + + js_name_tree = DictionaryObject() + js_name_tree.update({ + NameObject("/JavaScript"): DictionaryObject({ + NameObject("/Names"): ArrayObject([createStringObject(js_string_name), js_indirect_object]) + }) + }) + self._addObject(js_name_tree) + + self._root_object.update({ + NameObject("/OpenAction"): js_indirect_object, + NameObject("/Names"): js_name_tree + }) + + def addAttachment(self, fname, fdata): + """ + Embed a file inside the PDF. + + :param str fname: The filename to display. + :param str fdata: The data in the file. + + Reference: + https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf + Section 7.11.3 + """ + + # We need 3 entries: + # * The file's data + # * The /Filespec entry + # * The file's name, which goes in the Catalog + + + # The entry for the file + """ Sample: + 8 0 obj + << + /Length 12 + /Type /EmbeddedFile + >> + stream + Hello world! 
+ endstream + endobj + """ + file_entry = DecodedStreamObject() + file_entry.setData(fdata) + file_entry.update({ + NameObject("/Type"): NameObject("/EmbeddedFile") + }) + + # The Filespec entry + """ Sample: + 7 0 obj + << + /Type /Filespec + /F (hello.txt) + /EF << /F 8 0 R >> + >> + """ + efEntry = DictionaryObject() + efEntry.update({ NameObject("/F"):file_entry }) + + filespec = DictionaryObject() + filespec.update({ + NameObject("/Type"): NameObject("/Filespec"), + NameObject("/F"): createStringObject(fname), # Perhaps also try TextStringObject + NameObject("/EF"): efEntry + }) + + # Then create the entry for the root, as it needs a reference to the Filespec + """ Sample: + 1 0 obj + << + /Type /Catalog + /Outlines 2 0 R + /Pages 3 0 R + /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >> + >> + endobj + + """ + embeddedFilesNamesDictionary = DictionaryObject() + embeddedFilesNamesDictionary.update({ + NameObject("/Names"): ArrayObject([createStringObject(fname), filespec]) + }) + + embeddedFilesDictionary = DictionaryObject() + embeddedFilesDictionary.update({ + NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary + }) + # Update the root + self._root_object.update({ + NameObject("/Names"): embeddedFilesDictionary + }) + + def appendPagesFromReader(self, reader, after_page_append=None): + """ + Copy pages from reader to writer. Includes an optional callback parameter + which is invoked after pages are appended to the writer. + + :param reader: a PdfFileReader object from which to copy page + annotations to this writer object. The writer's annots + will then be updated + :callback after_page_append (function): Callback function that is invoked after + each page is appended to the writer. Callback signature: + + :param writer_pageref (PDF page reference): Reference to the page + appended to the writer. + """ + # Get page count from writer and reader + reader_num_pages = reader.getNumPages() + writer_num_pages = self.getNumPages() + + # Copy pages from reader to writer + for rpagenum in range(0, reader_num_pages): + reader_page = reader.getPage(rpagenum) + self.addPage(reader_page) + writer_page = self.getPage(writer_num_pages+rpagenum) + # Trigger callback, pass writer page as parameter + if callable(after_page_append): after_page_append(writer_page) + + def updatePageFormFieldValues(self, page, fields): + ''' + Update the form field values for a given page from a fields dictionary. + Copy field texts and values from fields to page. + + :param page: Page reference from PDF writer where the annotations + and field data will be updated. + :param fields: a Python dictionary of field names (/T) and text + values (/V) + ''' + # Iterate through pages, update field values + for j in range(0, len(page['/Annots'])): + writer_annot = page['/Annots'][j].getObject() + for field in fields: + if writer_annot.get('/T') == field: + writer_annot.update({ + NameObject("/V"): TextStringObject(fields[field]) + }) + + def cloneReaderDocumentRoot(self, reader): + ''' + Copy the reader document root to the writer. + + :param reader: PdfFileReader from the document root should be copied. + :callback after_page_append + ''' + self._root_object = reader.trailer['/Root'] + + def cloneDocumentFromReader(self, reader, after_page_append=None): + ''' + Create a copy (clone) of a document from a PDF file reader + + :param reader: PDF file reader instance from which the clone + should be created. 
+ :callback after_page_append (function): Callback function that is invoked after + each page is appended to the writer. Signature includes a reference to the + appended page (delegates to appendPagesFromReader). Callback signature: + + :param writer_pageref (PDF page reference): Reference to the page just + appended to the document. + ''' + self.cloneReaderDocumentRoot(reader) + self.appendPagesFromReader(reader, after_page_append) + + def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): + """ + Encrypt this PDF file with the PDF Standard encryption handler. + + :param str user_pwd: The "user password", which allows for opening + and reading the PDF file with the restrictions provided. + :param str owner_pwd: The "owner password", which allows for + opening the PDF files without any restrictions. By default, + the owner password is the same as the user password. + :param bool use_128bit: flag as to whether to use 128bit + encryption. When false, 40bit encryption will be used. By default, + this flag is on. + """ + import time, random + if owner_pwd == None: + owner_pwd = user_pwd + if use_128bit: + V = 2 + rev = 3 + keylen = int(128 / 8) + else: + V = 1 + rev = 2 + keylen = int(40 / 8) + # permit everything: + P = -1 + O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) + ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest()) + ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest()) + self._ID = ArrayObject((ID_1, ID_2)) + if rev == 2: + U, key = _alg34(user_pwd, O, P, ID_1) + else: + assert rev == 3 + U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False) + encrypt = DictionaryObject() + encrypt[NameObject("/Filter")] = NameObject("/Standard") + encrypt[NameObject("/V")] = NumberObject(V) + if V == 2: + encrypt[NameObject("/Length")] = NumberObject(keylen * 8) + encrypt[NameObject("/R")] = NumberObject(rev) + encrypt[NameObject("/O")] = ByteStringObject(O) + encrypt[NameObject("/U")] = ByteStringObject(U) + encrypt[NameObject("/P")] = NumberObject(P) + self._encrypt = self._addObject(encrypt) + self._encrypt_key = key + + def write(self, stream): + """ + Writes the collection of pages added to this object out as a PDF file. + + :param stream: An object to write the file to. The object must support + the write method and the tell method, similar to a file object. + """ + if hasattr(stream, 'mode') and 'b' not in stream.mode: + warnings.warn("File <%s> to write to is not in binary mode. It may not be written to correctly." % stream.name) + debug = False + import struct + + if not self._root: + self._root = self._addObject(self._root_object) + + externalReferenceMap = {} + + # PDF objects sometimes have circular references to their /Page objects + # inside their object tree (for example, annotations). Those will be + # indirect references to objects that we've recreated in this PDF. To + # address this problem, PageObject's store their original object + # reference number, and we add it to the external reference map before + # we sweep for indirect references. This forces self-page-referencing + # trees to reference the correct new object location, rather than + # copying in a new copy of the page object. 
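+        # The finished map is keyed as
+        # externalReferenceMap[source_pdf][generation][idnum] and yields the
+        # IndirectObject that the same object now has in this writer.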
+ for objIndex in range(len(self._objects)): + obj = self._objects[objIndex] + if isinstance(obj, PageObject) and obj.indirectRef != None: + data = obj.indirectRef + if data.pdf not in externalReferenceMap: + externalReferenceMap[data.pdf] = {} + if data.generation not in externalReferenceMap[data.pdf]: + externalReferenceMap[data.pdf][data.generation] = {} + externalReferenceMap[data.pdf][data.generation][data.idnum] = IndirectObject(objIndex + 1, 0, self) + + self.stack = [] + if debug: print(("ERM:", externalReferenceMap, "root:", self._root)) + self._sweepIndirectReferences(externalReferenceMap, self._root) + del self.stack + + # Begin writing: + object_positions = [] + stream.write(self._header + b_("\n")) + stream.write(b_("%\xE2\xE3\xCF\xD3\n")) + for i in range(len(self._objects)): + idnum = (i + 1) + obj = self._objects[i] + object_positions.append(stream.tell()) + stream.write(b_(str(idnum) + " 0 obj\n")) + key = None + if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum: + pack1 = struct.pack("<i", i + 1)[:3] + pack2 = struct.pack("<i", 0)[:2] + key = self._encrypt_key + pack1 + pack2 + assert len(key) == (len(self._encrypt_key) + 5) + md5_hash = md5(key).digest() + key = md5_hash[:min(16, len(self._encrypt_key) + 5)] + obj.writeToStream(stream, key) + stream.write(b_("\nendobj\n")) + + # xref table + xref_location = stream.tell() + stream.write(b_("xref\n")) + stream.write(b_("0 %s\n" % (len(self._objects) + 1))) + stream.write(b_("%010d %05d f \n" % (0, 65535))) + for offset in object_positions: + stream.write(b_("%010d %05d n \n" % (offset, 0))) + + # trailer + stream.write(b_("trailer\n")) + trailer = DictionaryObject() + trailer.update({ + NameObject("/Size"): NumberObject(len(self._objects) + 1), + NameObject("/Root"): self._root, + NameObject("/Info"): self._info, + }) + if hasattr(self, "_ID"): + trailer[NameObject("/ID")] = self._ID + if hasattr(self, "_encrypt"): + trailer[NameObject("/Encrypt")] = self._encrypt + trailer.writeToStream(stream, None) + + # eof + stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))) + + def addMetadata(self, infos): + """ + Add custom metadata to the output. + + :param dict infos: a Python dictionary where each key is a field + and each value is your new metadata. + """ + args = {} + for key, value in list(infos.items()): + args[NameObject(key)] = createStringObject(value) + self.getObject(self._info).update(args) + + def _sweepIndirectReferences(self, externMap, data): + debug = False + if debug: print((data, "TYPE", data.__class__.__name__)) + if isinstance(data, DictionaryObject): + for key, value in list(data.items()): + origvalue = value + value = self._sweepIndirectReferences(externMap, value) + if isinstance(value, StreamObject): + # a dictionary value is a stream. streams must be indirect + # objects, so we need to change this value. + value = self._addObject(value) + data[key] = value + return data + elif isinstance(data, ArrayObject): + for i in range(len(data)): + value = self._sweepIndirectReferences(externMap, data[i]) + if isinstance(value, StreamObject): + # an array value is a stream. 
streams must be indirect + # objects, so we need to change this value + value = self._addObject(value) + data[i] = value + return data + elif isinstance(data, IndirectObject): + # internal indirect references are fine + if data.pdf == self: + if data.idnum in self.stack: + return data + else: + self.stack.append(data.idnum) + realdata = self.getObject(data) + self._sweepIndirectReferences(externMap, realdata) + return data + else: + if data.pdf.stream.closed: + raise ValueError("I/O operation on closed file: {}".format(data.pdf.stream.name)) + newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None) + if newobj == None: + try: + newobj = data.pdf.getObject(data) + self._objects.append(None) # placeholder + idnum = len(self._objects) + newobj_ido = IndirectObject(idnum, 0, self) + if data.pdf not in externMap: + externMap[data.pdf] = {} + if data.generation not in externMap[data.pdf]: + externMap[data.pdf][data.generation] = {} + externMap[data.pdf][data.generation][data.idnum] = newobj_ido + newobj = self._sweepIndirectReferences(externMap, newobj) + self._objects[idnum-1] = newobj + return newobj_ido + except ValueError: + # Unable to resolve the Object, returning NullObject instead. + warnings.warn("Unable to resolve [{}: {}], returning NullObject instead".format( + data.__class__.__name__, data + )) + return NullObject() + return newobj + else: + return data + + def getReference(self, obj): + idnum = self._objects.index(obj) + 1 + ref = IndirectObject(idnum, 0, self) + assert ref.getObject() == obj + return ref + + def getOutlineRoot(self): + if '/Outlines' in self._root_object: + outline = self._root_object['/Outlines'] + idnum = self._objects.index(outline) + 1 + outlineRef = IndirectObject(idnum, 0, self) + assert outlineRef.getObject() == outline + else: + outline = TreeObject() + outline.update({ }) + outlineRef = self._addObject(outline) + self._root_object[NameObject('/Outlines')] = outlineRef + + return outline + + def getNamedDestRoot(self): + if '/Names' in self._root_object and isinstance(self._root_object['/Names'], DictionaryObject): + names = self._root_object['/Names'] + idnum = self._objects.index(names) + 1 + namesRef = IndirectObject(idnum, 0, self) + assert namesRef.getObject() == names + if '/Dests' in names and isinstance(names['/Dests'], DictionaryObject): + dests = names['/Dests'] + idnum = self._objects.index(dests) + 1 + destsRef = IndirectObject(idnum, 0, self) + assert destsRef.getObject() == dests + if '/Names' in dests: + nd = dests['/Names'] + else: + nd = ArrayObject() + dests[NameObject('/Names')] = nd + else: + dests = DictionaryObject() + destsRef = self._addObject(dests) + names[NameObject('/Dests')] = destsRef + nd = ArrayObject() + dests[NameObject('/Names')] = nd + + else: + names = DictionaryObject() + namesRef = self._addObject(names) + self._root_object[NameObject('/Names')] = namesRef + dests = DictionaryObject() + destsRef = self._addObject(dests) + names[NameObject('/Dests')] = destsRef + nd = ArrayObject() + dests[NameObject('/Names')] = nd + + return nd + + def addBookmarkDestination(self, dest, parent=None): + destRef = self._addObject(dest) + + outlineRef = self.getOutlineRoot() + + if parent == None: + parent = outlineRef + + parent = parent.getObject() + #print parent.__class__.__name__ + parent.addChild(destRef, self) + + return destRef + + def addBookmarkDict(self, bookmark, parent=None): + bookmarkObj = TreeObject() + for k, v in list(bookmark.items()): + bookmarkObj[NameObject(str(k))] = v + 
bookmarkObj.update(bookmark) + + if '/A' in bookmark: + action = DictionaryObject() + for k, v in list(bookmark['/A'].items()): + action[NameObject(str(k))] = v + actionRef = self._addObject(action) + bookmarkObj[NameObject('/A')] = actionRef + + bookmarkRef = self._addObject(bookmarkObj) + + outlineRef = self.getOutlineRoot() + + if parent == None: + parent = outlineRef + + parent = parent.getObject() + parent.addChild(bookmarkRef, self) + + return bookmarkRef + + def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args): + """ + Add a bookmark to this PDF file. + + :param str title: Title to use for this bookmark. + :param int pagenum: Page number this bookmark will point to. + :param parent: A reference to a parent bookmark to create nested + bookmarks. + :param tuple color: Color of the bookmark as a red, green, blue tuple + from 0.0 to 1.0 + :param bool bold: Bookmark is bold + :param bool italic: Bookmark is italic + :param str fit: The fit of the destination page. See + :meth:`addLink()<addLink>` for details. + """ + pageRef = self.getObject(self._pages)['/Kids'][pagenum] + action = DictionaryObject() + zoomArgs = [] + for a in args: + if a is not None: + zoomArgs.append(NumberObject(a)) + else: + zoomArgs.append(NullObject()) + dest = Destination(NameObject("/"+title + " bookmark"), pageRef, NameObject(fit), *zoomArgs) + destArray = dest.getDestArray() + action.update({ + NameObject('/D') : destArray, + NameObject('/S') : NameObject('/GoTo') + }) + actionRef = self._addObject(action) + + outlineRef = self.getOutlineRoot() + + if parent == None: + parent = outlineRef + + bookmark = TreeObject() + + bookmark.update({ + NameObject('/A'): actionRef, + NameObject('/Title'): createStringObject(title), + }) + + if color is not None: + bookmark.update({NameObject('/C'): ArrayObject([FloatObject(c) for c in color])}) + + format = 0 + if italic: + format += 1 + if bold: + format += 2 + if format: + bookmark.update({NameObject('/F'): NumberObject(format)}) + + bookmarkRef = self._addObject(bookmark) + + parent = parent.getObject() + parent.addChild(bookmarkRef, self) + + return bookmarkRef + + def addNamedDestinationObject(self, dest): + destRef = self._addObject(dest) + + nd = self.getNamedDestRoot() + nd.extend([dest['/Title'], destRef]) + + return destRef + + def addNamedDestination(self, title, pagenum): + pageRef = self.getObject(self._pages)['/Kids'][pagenum] + dest = DictionaryObject() + dest.update({ + NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]), + NameObject('/S') : NameObject('/GoTo') + }) + + destRef = self._addObject(dest) + nd = self.getNamedDestRoot() + + nd.extend([title, destRef]) + + return destRef + + def removeLinks(self): + """ + Removes links and annotations from this output. + """ + pages = self.getObject(self._pages)['/Kids'] + for page in pages: + pageRef = self.getObject(page) + if "/Annots" in pageRef: + del pageRef['/Annots'] + + def removeImages(self, ignoreByteStringObject=False): + """ + Removes images from this output. + + :param bool ignoreByteStringObject: optional parameter + to ignore ByteString Objects. 
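+
+        A minimal sketch, assuming ``reader`` is an open
+        :class:`PdfFileReader<PdfFileReader>` instance:
+
+        >>> writer = PdfFileWriter()
+        >>> writer.appendPagesFromReader(reader)
+        >>> writer.removeImages()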
+ """ + pages = self.getObject(self._pages)['/Kids'] + for j in range(len(pages)): + page = pages[j] + pageRef = self.getObject(page) + content = pageRef['/Contents'].getObject() + if not isinstance(content, ContentStream): + content = ContentStream(content, pageRef) + + _operations = [] + seq_graphics = False + for operands, operator in content.operations: + if operator == b_('Tj'): + text = operands[0] + if ignoreByteStringObject: + if not isinstance(text, TextStringObject): + operands[0] = TextStringObject() + elif operator == b_("'"): + text = operands[0] + if ignoreByteStringObject: + if not isinstance(text, TextStringObject): + operands[0] = TextStringObject() + elif operator == b_('"'): + text = operands[2] + if ignoreByteStringObject: + if not isinstance(text, TextStringObject): + operands[2] = TextStringObject() + elif operator == b_("TJ"): + for i in range(len(operands[0])): + if ignoreByteStringObject: + if not isinstance(operands[0][i], TextStringObject): + operands[0][i] = TextStringObject() + + if operator == b_('q'): + seq_graphics = True + if operator == b_('Q'): + seq_graphics = False + if seq_graphics: + if operator in [b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'), + b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'), + b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh')]: + continue + if operator == b_('re'): + continue + _operations.append((operands, operator)) + + content.operations = _operations + pageRef.__setitem__(NameObject('/Contents'), content) + + def removeText(self, ignoreByteStringObject=False): + """ + Removes images from this output. + + :param bool ignoreByteStringObject: optional parameter + to ignore ByteString Objects. + """ + pages = self.getObject(self._pages)['/Kids'] + for j in range(len(pages)): + page = pages[j] + pageRef = self.getObject(page) + content = pageRef['/Contents'].getObject() + if not isinstance(content, ContentStream): + content = ContentStream(content, pageRef) + for operands,operator in content.operations: + if operator == b_('Tj'): + text = operands[0] + if not ignoreByteStringObject: + if isinstance(text, TextStringObject): + operands[0] = TextStringObject() + else: + if isinstance(text, TextStringObject) or \ + isinstance(text, ByteStringObject): + operands[0] = TextStringObject() + elif operator == b_("'"): + text = operands[0] + if not ignoreByteStringObject: + if isinstance(text, TextStringObject): + operands[0] = TextStringObject() + else: + if isinstance(text, TextStringObject) or \ + isinstance(text, ByteStringObject): + operands[0] = TextStringObject() + elif operator == b_('"'): + text = operands[2] + if not ignoreByteStringObject: + if isinstance(text, TextStringObject): + operands[2] = TextStringObject() + else: + if isinstance(text, TextStringObject) or \ + isinstance(text, ByteStringObject): + operands[2] = TextStringObject() + elif operator == b_("TJ"): + for i in range(len(operands[0])): + if not ignoreByteStringObject: + if isinstance(operands[0][i], TextStringObject): + operands[0][i] = TextStringObject() + else: + if isinstance(operands[0][i], TextStringObject) or \ + isinstance(operands[0][i], ByteStringObject): + operands[0][i] = TextStringObject() + + pageRef.__setitem__(NameObject('/Contents'), content) + + def addURI(self, pagenum, uri, rect, border=None): + """ + Add an URI from a rectangular area to the specified page. 
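+
+        A minimal sketch; the URI and rectangle are illustrative values and
+        ``writer`` is assumed to hold at least one page:
+
+        >>> writer.addURI(0, "https://example.org", [100, 100, 300, 120])
+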
+ This uses the basic structure of AddLink + + :param int pagenum: index of the page on which to place the URI action. + :param int uri: string -- uri of resource to link to. + :param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four + integers specifying the clickable rectangular area + ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``. + :param border: if provided, an array describing border-drawing + properties. See the PDF spec for details. No border will be + drawn if this argument is omitted. + + REMOVED FIT/ZOOM ARG + -John Mulligan + """ + + pageLink = self.getObject(self._pages)['/Kids'][pagenum] + pageRef = self.getObject(pageLink) + + if border is not None: + borderArr = [NameObject(n) for n in border[:3]] + if len(border) == 4: + dashPattern = ArrayObject([NameObject(n) for n in border[3]]) + borderArr.append(dashPattern) + else: + borderArr = [NumberObject(2)] * 3 + + if isString(rect): + rect = NameObject(rect) + elif isinstance(rect, RectangleObject): + pass + else: + rect = RectangleObject(rect) + + lnk2 = DictionaryObject() + lnk2.update({ + NameObject('/S'): NameObject('/URI'), + NameObject('/URI'): TextStringObject(uri) + }); + lnk = DictionaryObject() + lnk.update({ + NameObject('/Type'): NameObject('/Annot'), + NameObject('/Subtype'): NameObject('/Link'), + NameObject('/P'): pageLink, + NameObject('/Rect'): rect, + NameObject('/H'): NameObject('/I'), + NameObject('/Border'): ArrayObject(borderArr), + NameObject('/A'): lnk2 + }) + lnkRef = self._addObject(lnk) + + if "/Annots" in pageRef: + pageRef['/Annots'].append(lnkRef) + else: + pageRef[NameObject('/Annots')] = ArrayObject([lnkRef]) + + def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args): + """ + Add an internal link from a rectangular area to the specified page. + + :param int pagenum: index of the page on which to place the link. + :param int pagedest: index of the page to which the link should go. + :param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four + integers specifying the clickable rectangular area + ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``. + :param border: if provided, an array describing border-drawing + properties. See the PDF spec for details. No border will be + drawn if this argument is omitted. + :param str fit: Page fit or 'zoom' option (see below). Additional arguments may need + to be supplied. Passing ``None`` will be read as a null value for that coordinate. 
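+            For example, ``addLink(0, 3, [0, 0, 612, 792], None, '/FitH', 100)``
+            makes the whole first page link to the page at index 3, with the
+            destination window's top edge placed at ``y = 100``.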
+ + Valid zoom arguments (see Table 8.2 of the PDF 1.7 reference for details): + /Fit No additional arguments + /XYZ [left] [top] [zoomFactor] + /FitH [top] + /FitV [left] + /FitR [left] [bottom] [right] [top] + /FitB No additional arguments + /FitBH [top] + /FitBV [left] + """ + + pageLink = self.getObject(self._pages)['/Kids'][pagenum] + pageDest = self.getObject(self._pages)['/Kids'][pagedest] #TODO: switch for external link + pageRef = self.getObject(pageLink) + + if border is not None: + borderArr = [NameObject(n) for n in border[:3]] + if len(border) == 4: + dashPattern = ArrayObject([NameObject(n) for n in border[3]]) + borderArr.append(dashPattern) + else: + borderArr = [NumberObject(0)] * 3 + + if isString(rect): + rect = NameObject(rect) + elif isinstance(rect, RectangleObject): + pass + else: + rect = RectangleObject(rect) + + zoomArgs = [] + for a in args: + if a is not None: + zoomArgs.append(NumberObject(a)) + else: + zoomArgs.append(NullObject()) + dest = Destination(NameObject("/LinkName"), pageDest, NameObject(fit), *zoomArgs) #TODO: create a better name for the link + destArray = dest.getDestArray() + + lnk = DictionaryObject() + lnk.update({ + NameObject('/Type'): NameObject('/Annot'), + NameObject('/Subtype'): NameObject('/Link'), + NameObject('/P'): pageLink, + NameObject('/Rect'): rect, + NameObject('/Border'): ArrayObject(borderArr), + NameObject('/Dest'): destArray + }) + lnkRef = self._addObject(lnk) + + if "/Annots" in pageRef: + pageRef['/Annots'].append(lnkRef) + else: + pageRef[NameObject('/Annots')] = ArrayObject([lnkRef]) + + _valid_layouts = ['/NoLayout', '/SinglePage', '/OneColumn', '/TwoColumnLeft', '/TwoColumnRight', '/TwoPageLeft', '/TwoPageRight'] + + def getPageLayout(self): + """ + Get the page layout. + See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` for a description of valid layouts. + + :return: Page layout currently being used. + :rtype: str, None if not specified + """ + try: + return self._root_object['/PageLayout'] + except KeyError: + return None + + def setPageLayout(self, layout): + """ + Set the page layout + + :param str layout: The page layout to be used + + Valid layouts are: + /NoLayout Layout explicitly not specified + /SinglePage Show one page at a time + /OneColumn Show one column at a time + /TwoColumnLeft Show pages in two columns, odd-numbered pages on the left + /TwoColumnRight Show pages in two columns, odd-numbered pages on the right + /TwoPageLeft Show two pages at a time, odd-numbered pages on the left + /TwoPageRight Show two pages at a time, odd-numbered pages on the right + """ + if not isinstance(layout, NameObject): + if layout not in self._valid_layouts: + warnings.warn("Layout should be one of: {}".format(', '.join(self._valid_layouts))) + layout = NameObject(layout) + self._root_object.update({NameObject('/PageLayout'): layout}) + + pageLayout = property(getPageLayout, setPageLayout) + """Read and write property accessing the :meth:`getPageLayout()<PdfFileWriter.getPageLayout>` + and :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` methods.""" + + _valid_modes = ['/UseNone', '/UseOutlines', '/UseThumbs', '/FullScreen', '/UseOC', '/UseAttachments'] + + def getPageMode(self): + """ + Get the page mode. + See :meth:`setPageMode()<PdfFileWriter.setPageMode>` for a description + of valid modes. + + :return: Page mode currently being used. 
+ :rtype: str, None if not specified + """ + try: + return self._root_object['/PageMode'] + except KeyError: + return None + + def setPageMode(self, mode): + """ + Set the page mode. + + :param str mode: The page mode to use. + + Valid modes are: + /UseNone Do not show outlines or thumbnails panels + /UseOutlines Show outlines (aka bookmarks) panel + /UseThumbs Show page thumbnails panel + /FullScreen Fullscreen view + /UseOC Show Optional Content Group (OCG) panel + /UseAttachments Show attachments panel + """ + if not isinstance(mode, NameObject): + if mode not in self._valid_modes: + warnings.warn("Mode should be one of: {}".format(', '.join(self._valid_modes))) + mode = NameObject(mode) + self._root_object.update({NameObject('/PageMode'): mode}) + + pageMode = property(getPageMode, setPageMode) + """Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>` + and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods.""" + + +class PdfFileReader(object): + """ + Initializes a PdfFileReader object. This operation can take some time, as + the PDF stream's cross-reference tables are read into memory. + + :param stream: A File object or an object that supports the standard read + and seek methods similar to a File object. Could also be a + string representing a path to a PDF file. + :param bool strict: Determines whether user should be warned of all + problems and also causes some correctable problems to be fatal. + Defaults to ``True``. + :param warndest: Destination for logging warnings (defaults to + ``sys.stderr``). + :param bool overwriteWarnings: Determines whether to override Python's + ``warnings.py`` module with a custom implementation (defaults to + ``True``). + """ + def __init__(self, stream, strict=True, warndest = None, overwriteWarnings = True): + if overwriteWarnings: + # have to dynamically override the default showwarning since there are no + # public methods that specify the 'file' parameter + def _showwarning(message, category, filename, lineno, file=warndest, line=None): + if file is None: + file = sys.stderr + try: + file.write(formatWarning(message, category, filename, lineno, line)) + except IOError: + pass + warnings.showwarning = _showwarning + self.strict = strict + self.flattenedPages = None + self.resolvedObjects = {} + self.xrefIndex = 0 + self._pageId2Num = None # map page IndirectRef number to Page Number + if hasattr(stream, 'mode') and 'b' not in stream.mode: + warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning) + if isString(stream): + fileobj = open(stream, 'rb') + stream = BytesIO(b_(fileobj.read())) + fileobj.close() + self.read(stream) + self.stream = stream + + self._override_encryption = False + + def getDocumentInfo(self): + """ + Retrieves the PDF file's document information dictionary, if it exists. + Note that some PDF files use metadata streams instead of docinfo + dictionaries, and these metadata streams will not be accessed by this + function. + + :return: the document information of this PDF file + :rtype: :class:`DocumentInformation<pdf.DocumentInformation>` or ``None`` if none exists. 
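+
+        A short sketch, assuming ``reader`` is a
+        :class:`PdfFileReader<PdfFileReader>` instance:
+
+        >>> info = reader.getDocumentInfo()
+        >>> title = info.get('/Title') if info is not None else None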
+ """ + if "/Info" not in self.trailer: + return None + obj = self.trailer['/Info'] + retval = DocumentInformation() + retval.update(obj) + return retval + + documentInfo = property(lambda self: self.getDocumentInfo(), None, None) + """Read-only property that accesses the :meth:`getDocumentInfo()<PdfFileReader.getDocumentInfo>` function.""" + + def getXmpMetadata(self): + """ + Retrieves XMP (Extensible Metadata Platform) data from the PDF document + root. + + :return: a :class:`XmpInformation<xmp.XmpInformation>` + instance that can be used to access XMP metadata from the document. + :rtype: :class:`XmpInformation<xmp.XmpInformation>` or + ``None`` if no metadata was found on the document root. + """ + try: + self._override_encryption = True + return self.trailer["/Root"].getXmpMetadata() + finally: + self._override_encryption = False + + xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) + """ + Read-only property that accesses the + :meth:`getXmpMetadata()<PdfFileReader.getXmpMetadata>` function. + """ + + def getNumPages(self): + """ + Calculates the number of pages in this PDF file. + + :return: number of pages + :rtype: int + :raises PdfReadError: if file is encrypted and restrictions prevent + this action. + """ + + # Flattened pages will not work on an Encrypted PDF; + # the PDF file's page count is used in this case. Otherwise, + # the original method (flattened page count) is used. + if self.isEncrypted: + try: + self._override_encryption = True + self.decrypt('') + return self.trailer["/Root"]["/Pages"]["/Count"] + except: + raise utils.PdfReadError("File has not been decrypted") + finally: + self._override_encryption = False + else: + if self.flattenedPages == None: + self._flatten() + return len(self.flattenedPages) + + numPages = property(lambda self: self.getNumPages(), None, None) + """ + Read-only property that accesses the + :meth:`getNumPages()<PdfFileReader.getNumPages>` function. + """ + + def getPage(self, pageNumber): + """ + Retrieves a page by number from this PDF file. + + :param int pageNumber: The page number to retrieve + (pages begin at zero) + :return: a :class:`PageObject<pdf.PageObject>` instance. + :rtype: :class:`PageObject<pdf.PageObject>` + """ + ## ensure that we're not trying to access an encrypted PDF + #assert not self.trailer.has_key("/Encrypt") + if self.flattenedPages == None: + self._flatten() + return self.flattenedPages[pageNumber] + + namedDestinations = property(lambda self: + self.getNamedDestinations(), None, None) + """ + Read-only property that accesses the + :meth:`getNamedDestinations()<PdfFileReader.getNamedDestinations>` function. + """ + + # A select group of relevant field attributes. For the complete list, + # see section 8.6.2 of the PDF 1.7 reference. + + def getFields(self, tree = None, retval = None, fileobj = None): + """ + Extracts field data if this PDF contains interactive form fields. + The *tree* and *retval* parameters are for recursive use. + + :param fileobj: A file object (usually a text file) to write + a report to on all interactive form fields found. + :return: A dictionary where each key is a field name, and each + value is a :class:`Field<PyPDF2.generic.Field>` object. By + default, the mapping name is used for keys. + :rtype: dict, or ``None`` if form data could not be located. 
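+
+        A minimal sketch, assuming the open ``reader`` holds an interactive
+        form:
+
+        >>> fields = reader.getFields()
+        >>> if fields is not None:
+        ...     for name, field in fields.items():
+        ...         print(name, field.get('/V'))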
+ """ + fieldAttributes = {"/FT" : "Field Type", "/Parent" : "Parent", + "/T" : "Field Name", "/TU" : "Alternate Field Name", + "/TM" : "Mapping Name", "/Ff" : "Field Flags", + "/V" : "Value", "/DV" : "Default Value"} + if retval == None: + retval = {} + catalog = self.trailer["/Root"] + # get the AcroForm tree + if "/AcroForm" in catalog: + tree = catalog["/AcroForm"] + else: + return None + if tree == None: + return retval + + self._checkKids(tree, retval, fileobj) + for attr in fieldAttributes: + if attr in tree: + # Tree is a field + self._buildField(tree, retval, fileobj, fieldAttributes) + break + + if "/Fields" in tree: + fields = tree["/Fields"] + for f in fields: + field = f.getObject() + self._buildField(field, retval, fileobj, fieldAttributes) + + return retval + + def _buildField(self, field, retval, fileobj, fieldAttributes): + self._checkKids(field, retval, fileobj) + try: + key = field["/TM"] + except KeyError: + try: + key = field["/T"] + except KeyError: + # Ignore no-name field for now + return + if fileobj: + self._writeField(fileobj, field, fieldAttributes) + fileobj.write("\n") + retval[key] = Field(field) + + def _checkKids(self, tree, retval, fileobj): + if "/Kids" in tree: + # recurse down the tree + for kid in tree["/Kids"]: + self.getFields(kid.getObject(), retval, fileobj) + + def _writeField(self, fileobj, field, fieldAttributes): + order = ["/TM", "/T", "/FT", "/Parent", "/TU", "/Ff", "/V", "/DV"] + for attr in order: + attrName = fieldAttributes[attr] + try: + if attr == "/FT": + # Make the field type value more clear + types = {"/Btn":"Button", "/Tx":"Text", "/Ch": "Choice", + "/Sig":"Signature"} + if field[attr] in types: + fileobj.write(attrName + ": " + types[field[attr]] + "\n") + elif attr == "/Parent": + # Let's just write the name of the parent + try: + name = field["/Parent"]["/TM"] + except KeyError: + name = field["/Parent"]["/T"] + fileobj.write(attrName + ": " + name + "\n") + else: + fileobj.write(attrName + ": " + str(field[attr]) + "\n") + except KeyError: + # Field attribute is N/A or unknown, so don't write anything + pass + + def getFormTextFields(self): + ''' Retrieves form fields from the document with textual data (inputs, dropdowns) + ''' + # Retrieve document form fields + formfields = self.getFields() + return dict( + (formfields[field]['/T'], formfields[field].get('/V')) for field in formfields \ + if formfields[field].get('/FT') == '/Tx' + ) + + def getNamedDestinations(self, tree=None, retval=None): + """ + Retrieves the named destinations present in the document. + + :return: a dictionary which maps names to + :class:`Destinations<PyPDF2.generic.Destination>`. 
+ :rtype: dict + """ + if retval == None: + retval = {} + catalog = self.trailer["/Root"] + + # get the name tree + if "/Dests" in catalog: + tree = catalog["/Dests"] + elif "/Names" in catalog: + names = catalog['/Names'] + if "/Dests" in names: + tree = names['/Dests'] + + if tree == None: + return retval + + if "/Kids" in tree: + # recurse down the tree + for kid in tree["/Kids"]: + self.getNamedDestinations(kid.getObject(), retval) + + if "/Names" in tree: + names = tree["/Names"] + for i in range(0, len(names), 2): + key = names[i].getObject() + val = names[i+1].getObject() + if isinstance(val, DictionaryObject) and '/D' in val: + val = val['/D'] + dest = self._buildDestination(key, val) + if dest != None: + retval[key] = dest + + return retval + + outlines = property(lambda self: self.getOutlines(), None, None) + """ + Read-only property that accesses the + :meth:`getOutlines()<PdfFileReader.getOutlines>` function. + """ + + def getOutlines(self, node=None, outlines=None): + """ + Retrieves the document outline present in the document. + + :return: a nested list of :class:`Destinations<PyPDF2.generic.Destination>`. + """ + if outlines == None: + outlines = [] + catalog = self.trailer["/Root"] + + # get the outline dictionary and named destinations + if "/Outlines" in catalog: + try: + lines = catalog["/Outlines"] + except utils.PdfReadError: + # this occurs if the /Outlines object reference is incorrect + # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf + # so continue to load the file without the Bookmarks + return outlines + + if "/First" in lines: + node = lines["/First"] + self._namedDests = self.getNamedDestinations() + + if node == None: + return outlines + + # see if there are any more outlines + while True: + outline = self._buildOutline(node) + if outline: + outlines.append(outline) + + # check for sub-outlines + if "/First" in node: + subOutlines = [] + self.getOutlines(node["/First"], subOutlines) + if subOutlines: + outlines.append(subOutlines) + + if "/Next" not in node: + break + node = node["/Next"] + + return outlines + + def _getPageNumberByIndirect(self, indirectRef): + """Generate _pageId2Num""" + if self._pageId2Num is None: + id2num = {} + for i, x in enumerate(self.pages): + id2num[x.indirectRef.idnum] = i + self._pageId2Num = id2num + + if isinstance(indirectRef, int): + idnum = indirectRef + else: + idnum = indirectRef.idnum + + ret = self._pageId2Num.get(idnum, -1) + return ret + + def getPageNumber(self, page): + """ + Retrieve page number of a given PageObject + + :param PageObject page: The page to get page number. Should be + an instance of :class:`PageObject<PyPDF2.pdf.PageObject>` + :return: the page number or -1 if page not found + :rtype: int + """ + indirectRef = page.indirectRef + ret = self._getPageNumberByIndirect(indirectRef) + return ret + + def getDestinationPageNumber(self, destination): + """ + Retrieve page number of a given Destination object + + :param Destination destination: The destination to get page number. 
+ Should be an instance of + :class:`Destination<PyPDF2.pdf.Destination>` + :return: the page number or -1 if page not found + :rtype: int + """ + indirectRef = destination.page + ret = self._getPageNumberByIndirect(indirectRef) + return ret + + def _buildDestination(self, title, array): + page, typ = array[0:2] + array = array[2:] + return Destination(title, page, typ, *array) + + def _buildOutline(self, node): + dest, title, outline = None, None, None + + if "/A" in node and "/Title" in node: + # Action, section 8.5 (only type GoTo supported) + title = node["/Title"] + action = node["/A"] + if action["/S"] == "/GoTo": + dest = action["/D"] + elif "/Dest" in node and "/Title" in node: + # Destination, section 8.2.1 + title = node["/Title"] + dest = node["/Dest"] + + # if destination found, then create outline + if dest: + if isinstance(dest, ArrayObject): + outline = self._buildDestination(title, dest) + elif isString(dest) and dest in self._namedDests: + outline = self._namedDests[dest] + outline[NameObject("/Title")] = title + else: + raise utils.PdfReadError("Unexpected destination %r" % dest) + return outline + + pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage), + None, None) + """ + Read-only property that emulates a list based upon the + :meth:`getNumPages()<PdfFileReader.getNumPages>` and + :meth:`getPage()<PdfFileReader.getPage>` methods. + """ + + def getPageLayout(self): + """ + Get the page layout. + See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` + for a description of valid layouts. + + :return: Page layout currently being used. + :rtype: ``str``, ``None`` if not specified + """ + try: + return self.trailer['/Root']['/PageLayout'] + except KeyError: + return None + + pageLayout = property(getPageLayout) + """Read-only property accessing the + :meth:`getPageLayout()<PdfFileReader.getPageLayout>` method.""" + + def getPageMode(self): + """ + Get the page mode. + See :meth:`setPageMode()<PdfFileWriter.setPageMode>` + for a description of valid modes. + + :return: Page mode currently being used. 
+ :rtype: ``str``, ``None`` if not specified + """ + try: + return self.trailer['/Root']['/PageMode'] + except KeyError: + return None + + pageMode = property(getPageMode) + """Read-only property accessing the + :meth:`getPageMode()<PdfFileReader.getPageMode>` method.""" + + def _flatten(self, pages=None, inherit=None, indirectRef=None): + inheritablePageAttributes = ( + NameObject("/Resources"), NameObject("/MediaBox"), + NameObject("/CropBox"), NameObject("/Rotate") + ) + if inherit == None: + inherit = dict() + if pages == None: + self.flattenedPages = [] + catalog = self.trailer["/Root"].getObject() + pages = catalog["/Pages"].getObject() + + t = "/Pages" + if "/Type" in pages: + t = pages["/Type"] + + if t == "/Pages": + for attr in inheritablePageAttributes: + if attr in pages: + inherit[attr] = pages[attr] + for page in pages["/Kids"]: + addt = {} + if isinstance(page, IndirectObject): + addt["indirectRef"] = page + self._flatten(page.getObject(), inherit, **addt) + elif t == "/Page": + for attr, value in list(inherit.items()): + # if the page has it's own value, it does not inherit the + # parent's value: + if attr not in pages: + pages[attr] = value + pageObj = PageObject(self, indirectRef) + pageObj.update(pages) + self.flattenedPages.append(pageObj) + + def _getObjectFromStream(self, indirectReference): + # indirect reference to object in object stream + # read the entire object stream into memory + debug = False + stmnum, idx = self.xref_objStm[indirectReference.idnum] + if debug: print(("Here1: %s %s"%(stmnum, idx))) + objStm = IndirectObject(stmnum, 0, self).getObject() + if debug: print(("Here2: objStm=%s.. stmnum=%s data=%s"%(objStm, stmnum, objStm.getData()))) + # This is an xref to a stream, so its type better be a stream + assert objStm['/Type'] == '/ObjStm' + # /N is the number of indirect objects in the stream + assert idx < objStm['/N'] + streamData = BytesIO(b_(objStm.getData())) + for i in range(objStm['/N']): + readNonWhitespace(streamData) + streamData.seek(-1, 1) + objnum = NumberObject.readFromStream(streamData) + readNonWhitespace(streamData) + streamData.seek(-1, 1) + offset = NumberObject.readFromStream(streamData) + readNonWhitespace(streamData) + streamData.seek(-1, 1) + if objnum != indirectReference.idnum: + # We're only interested in one object + continue + if self.strict and idx != i: + raise utils.PdfReadError("Object is in wrong index.") + streamData.seek(objStm['/First']+offset, 0) + if debug: + pos = streamData.tell() + streamData.seek(0, 0) + lines = streamData.readlines() + for i in range(0, len(lines)): + print((lines[i])) + streamData.seek(pos, 0) + try: + obj = readObject(streamData, self) + except utils.PdfStreamError as e: + # Stream object cannot be read. Normally, a critical error, but + # Adobe Reader doesn't complain, so continue (in strict mode?) + e = sys.exc_info()[1] + warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \ + (i, indirectReference.idnum, indirectReference.generation, e), utils.PdfReadWarning) + + if self.strict: + raise utils.PdfReadError("Can't read object stream: %s"%e) + # Replace with null. Hopefully it's nothing important. 
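+                # (This mirrors Adobe Reader's behaviour noted above: skip the
+                # broken object and keep the rest of the document readable.)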
+ obj = NullObject() + return obj + + if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.") + return NullObject() + + def getObject(self, indirectReference): + debug = False + if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation)) + retval = self.cacheGetIndirectObject(indirectReference.generation, + indirectReference.idnum) + if retval != None: + return retval + if indirectReference.generation == 0 and \ + indirectReference.idnum in self.xref_objStm: + retval = self._getObjectFromStream(indirectReference) + elif indirectReference.generation in self.xref and \ + indirectReference.idnum in self.xref[indirectReference.generation]: + start = self.xref[indirectReference.generation][indirectReference.idnum] + if debug: print((" Uncompressed Object", indirectReference.idnum, indirectReference.generation, ":", start)) + self.stream.seek(start, 0) + idnum, generation = self.readObjectHeader(self.stream) + if idnum != indirectReference.idnum and self.xrefIndex: + # Xref table probably had bad indexes due to not being zero-indexed + if self.strict: + raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \ + % (indirectReference.idnum, indirectReference.generation, idnum, generation)) + else: pass # xref table is corrected in non-strict mode + elif idnum != indirectReference.idnum and self.strict: + # some other problem + raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." \ + % (indirectReference.idnum, indirectReference.generation, idnum, generation)) + if self.strict: + assert generation == indirectReference.generation + retval = readObject(self.stream, self) + + # override encryption is used for the /Encrypt dictionary + if not self._override_encryption and self.isEncrypted: + # if we don't have the encryption key: + if not hasattr(self, '_decryption_key'): + raise utils.PdfReadError("file has not been decrypted") + # otherwise, decrypt here... + import struct + pack1 = struct.pack("<i", indirectReference.idnum)[:3] + pack2 = struct.pack("<i", indirectReference.generation)[:2] + key = self._decryption_key + pack1 + pack2 + assert len(key) == (len(self._decryption_key) + 5) + md5_hash = md5(key).digest() + key = md5_hash[:min(16, len(self._decryption_key) + 5)] + retval = self._decryptObject(retval, key) + else: + warnings.warn("Object %d %d not defined."%(indirectReference.idnum, + indirectReference.generation), utils.PdfReadWarning) + #if self.strict: + raise utils.PdfReadError("Could not find object.") + self.cacheIndirectObject(indirectReference.generation, + indirectReference.idnum, retval) + return retval + + def _decryptObject(self, obj, key): + if isinstance(obj, ByteStringObject) or isinstance(obj, TextStringObject): + obj = createStringObject(utils.RC4_encrypt(key, obj.original_bytes)) + elif isinstance(obj, StreamObject): + obj._data = utils.RC4_encrypt(key, obj._data) + elif isinstance(obj, DictionaryObject): + for dictkey, value in list(obj.items()): + obj[dictkey] = self._decryptObject(value, key) + elif isinstance(obj, ArrayObject): + for i in range(len(obj)): + obj[i] = self._decryptObject(obj[i], key) + return obj + + def readObjectHeader(self, stream): + # Should never be necessary to read out whitespace, since the + # cross-reference table should put us in the right spot to read the + # object header. In reality... some files have stupid cross reference + # tables that are off by whitespace bytes. 
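+        # An object header has the form "<id> <generation> obj"; the two
+        # integers are parsed below and the "obj" keyword is skipped over.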
+ extra = False + utils.skipOverComment(stream) + extra |= utils.skipOverWhitespace(stream); stream.seek(-1, 1) + idnum = readUntilWhitespace(stream) + extra |= utils.skipOverWhitespace(stream); stream.seek(-1, 1) + generation = readUntilWhitespace(stream) + obj = stream.read(3) + readNonWhitespace(stream) + stream.seek(-1, 1) + if (extra and self.strict): + #not a fatal error + warnings.warn("Superfluous whitespace found in object header %s %s" % \ + (idnum, generation), utils.PdfReadWarning) + return int(idnum), int(generation) + + def cacheGetIndirectObject(self, generation, idnum): + debug = False + out = self.resolvedObjects.get((generation, idnum)) + if debug and out: print(("cache hit: %d %d"%(idnum, generation))) + elif debug: print(("cache miss: %d %d"%(idnum, generation))) + return out + + def cacheIndirectObject(self, generation, idnum, obj): + # return None # Sometimes we want to turn off cache for debugging. + if (generation, idnum) in self.resolvedObjects: + msg = "Overwriting cache for %s %s"%(generation, idnum) + if self.strict: raise utils.PdfReadError(msg) + else: warnings.warn(msg) + self.resolvedObjects[(generation, idnum)] = obj + return obj + + def read(self, stream): + debug = False + if debug: print(">>read", stream) + # start at the end: + stream.seek(-1, 2) + if not stream.tell(): + raise utils.PdfReadError('Cannot read an empty file') + last1K = stream.tell() - 1024 + 1 # offset of last 1024 bytes of stream + line = b_('') + while line[:5] != b_("%%EOF"): + if stream.tell() < last1K: + raise utils.PdfReadError("EOF marker not found") + line = self.readNextEndLine(stream) + if debug: print(" line:",line) + + # find startxref entry - the location of the xref table + line = self.readNextEndLine(stream) + try: + startxref = int(line) + except ValueError: + # 'startxref' may be on the same line as the location + if not line.startswith(b_("startxref")): + raise utils.PdfReadError("startxref not found") + startxref = int(line[9:].strip()) + warnings.warn("startxref on same line as offset") + else: + line = self.readNextEndLine(stream) + if line[:9] != b_("startxref"): + raise utils.PdfReadError("startxref not found") + + # read all cross reference tables and their trailers + self.xref = {} + self.xref_objStm = {} + self.trailer = DictionaryObject() + while True: + # load the xref table + stream.seek(startxref, 0) + x = stream.read(1) + if x == b_("x"): + # standard cross-reference table + ref = stream.read(4) + if ref[:3] != b_("ref"): + raise utils.PdfReadError("xref table read error") + readNonWhitespace(stream) + stream.seek(-1, 1) + firsttime = True; # check if the first time looking at the xref table + while True: + num = readObject(stream, self) + if firsttime and num != 0: + self.xrefIndex = num + if self.strict: + warnings.warn("Xref table not zero-indexed. ID numbers for objects will be corrected.", utils.PdfReadWarning) + #if table not zero indexed, could be due to error from when PDF was created + #which will lead to mismatched indices later on, only warned and corrected if self.strict=True + firsttime = False + readNonWhitespace(stream) + stream.seek(-1, 1) + size = readObject(stream, self) + readNonWhitespace(stream) + stream.seek(-1, 1) + cnt = 0 + while cnt < size: + line = stream.read(20) + + # It's very clear in section 3.4.3 of the PDF spec + # that all cross-reference table lines are a fixed + # 20 bytes (as of PDF 1.7). However, some files have + # 21-byte entries (or more) due to the use of \r\n + # (CRLF) EOL's. 
Detect that case, and adjust the line + # until it does not begin with a \r (CR) or \n (LF). + while line[0] in b_("\x0D\x0A"): + stream.seek(-20 + 1, 1) + line = stream.read(20) + + # On the other hand, some malformed PDF files + # use a single character EOL without a preceeding + # space. Detect that case, and seek the stream + # back one character. (0-9 means we've bled into + # the next xref entry, t means we've bled into the + # text "trailer"): + if line[-1] in b_("0123456789t"): + stream.seek(-1, 1) + + offset, generation = line[:16].split(b_(" ")) + offset, generation = int(offset), int(generation) + if generation not in self.xref: + self.xref[generation] = {} + if num in self.xref[generation]: + # It really seems like we should allow the last + # xref table in the file to override previous + # ones. Since we read the file backwards, assume + # any existing key is already set correctly. + pass + else: + self.xref[generation][num] = offset + cnt += 1 + num += 1 + readNonWhitespace(stream) + stream.seek(-1, 1) + trailertag = stream.read(7) + if trailertag != b_("trailer"): + # more xrefs! + stream.seek(-7, 1) + else: + break + readNonWhitespace(stream) + stream.seek(-1, 1) + newTrailer = readObject(stream, self) + for key, value in list(newTrailer.items()): + if key not in self.trailer: + self.trailer[key] = value + if "/Prev" in newTrailer: + startxref = newTrailer["/Prev"] + else: + break + elif x.isdigit(): + # PDF 1.5+ Cross-Reference Stream + stream.seek(-1, 1) + idnum, generation = self.readObjectHeader(stream) + xrefstream = readObject(stream, self) + assert xrefstream["/Type"] == "/XRef" + self.cacheIndirectObject(generation, idnum, xrefstream) + streamData = BytesIO(b_(xrefstream.getData())) + # Index pairs specify the subsections in the dictionary. If + # none create one subsection that spans everything. + idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) + if debug: print(("read idx_pairs=%s"%list(self._pairs(idx_pairs)))) + entrySizes = xrefstream.get("/W") + assert len(entrySizes) >= 3 + if self.strict and len(entrySizes) > 3: + raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes) + + def getEntry(i): + # Reads the correct number of bytes for each entry. See the + # discussion of the W parameter in PDF spec table 17. + if entrySizes[i] > 0: + d = streamData.read(entrySizes[i]) + return convertToInt(d, entrySizes[i]) + + # PDF Spec Table 17: A value of zero for an element in the + # W array indicates...the default value shall be used + if i == 0: return 1 # First value defaults to 1 + else: return 0 + + def used_before(num, generation): + # We move backwards through the xrefs, don't replace any. 
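+            # (Entries from newer xref sections were recorded first and win.)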
+ return num in self.xref.get(generation, []) or \ + num in self.xref_objStm + + # Iterate through each subsection + last_end = 0 + for start, size in self._pairs(idx_pairs): + # The subsections must increase + assert start >= last_end + last_end = start + size + for num in range(start, start+size): + # The first entry is the type + xref_type = getEntry(0) + # The rest of the elements depend on the xref_type + if xref_type == 0: + # linked list of free objects + next_free_object = getEntry(1) + next_generation = getEntry(2) + elif xref_type == 1: + # objects that are in use but are not compressed + byte_offset = getEntry(1) + generation = getEntry(2) + if generation not in self.xref: + self.xref[generation] = {} + if not used_before(num, generation): + self.xref[generation][num] = byte_offset + if debug: print(("XREF Uncompressed: %s %s"%( + num, generation))) + elif xref_type == 2: + # compressed objects + objstr_num = getEntry(1) + obstr_idx = getEntry(2) + generation = 0 # PDF spec table 18, generation is 0 + if not used_before(num, generation): + if debug: print(("XREF Compressed: %s %s %s"%( + num, objstr_num, obstr_idx))) + self.xref_objStm[num] = (objstr_num, obstr_idx) + elif self.strict: + raise utils.PdfReadError("Unknown xref type: %s"% + xref_type) + + trailerKeys = "/Root", "/Encrypt", "/Info", "/ID" + for key in trailerKeys: + if key in xrefstream and key not in self.trailer: + self.trailer[NameObject(key)] = xrefstream.raw_get(key) + if "/Prev" in xrefstream: + startxref = xrefstream["/Prev"] + else: + break + else: + # bad xref character at startxref. Let's see if we can find + # the xref table nearby, as we've observed this error with an + # off-by-one before. + stream.seek(-11, 1) + tmp = stream.read(20) + xref_loc = tmp.find(b_("xref")) + if xref_loc != -1: + startxref -= (10 - xref_loc) + continue + # No explicit xref table, try finding a cross-reference stream. 
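+                # A cross-reference stream begins with an object header like
+                # "N G obj" instead of the "xref" keyword, so the probe below
+                # looks for a leading digit near startxref.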
+ stream.seek(startxref, 0) + found = False + for look in range(5): + if stream.read(1).isdigit(): + # This is not a standard PDF, consider adding a warning + startxref += look + found = True + break + if found: + continue + # no xref table found at specified location + raise utils.PdfReadError("Could not find xref table at specified location") + #if not zero-indexed, verify that the table is correct; change it if necessary + if self.xrefIndex and not self.strict: + loc = stream.tell() + for gen in self.xref: + if gen == 65535: continue + for id in self.xref[gen]: + stream.seek(self.xref[gen][id], 0) + try: + pid, pgen = self.readObjectHeader(stream) + except ValueError: + break + if pid == id - self.xrefIndex: + self._zeroXref(gen) + break + #if not, then either it's just plain wrong, or the non-zero-index is actually correct + stream.seek(loc, 0) #return to where it was + + def _zeroXref(self, generation): + self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) ) + + def _pairs(self, array): + i = 0 + while True: + yield array[i], array[i+1] + i += 2 + if (i+1) >= len(array): + break + + def readNextEndLine(self, stream): + debug = False + if debug: print(">>readNextEndLine") + line = b_("") + while True: + # Prevent infinite loops in malformed PDFs + if stream.tell() == 0: + raise utils.PdfReadError("Could not read malformed PDF file") + x = stream.read(1) + if debug: print((" x:", x, "%x"%ord(x))) + if stream.tell() < 2: + raise utils.PdfReadError("EOL marker not found") + stream.seek(-2, 1) + if x == b_('\n') or x == b_('\r'): ## \n = LF; \r = CR + crlf = False + while x == b_('\n') or x == b_('\r'): + if debug: + if ord(x) == 0x0D: print(" x is CR 0D") + elif ord(x) == 0x0A: print(" x is LF 0A") + x = stream.read(1) + if x == b_('\n') or x == b_('\r'): # account for CR+LF + stream.seek(-1, 1) + crlf = True + if stream.tell() < 2: + raise utils.PdfReadError("EOL marker not found") + stream.seek(-2, 1) + stream.seek(2 if crlf else 1, 1) #if using CR+LF, go back 2 bytes, else 1 + break + else: + if debug: print(" x is neither") + line = x + line + if debug: print((" RNEL line:", line)) + if debug: print("leaving RNEL") + return line + + def decrypt(self, password): + """ + When using an encrypted / secured PDF file with the PDF Standard + encryption handler, this function will allow the file to be decrypted. + It checks the given password against the document's user password and + owner password, and then stores the resulting decryption key if either + password is correct. + + It does not matter which password was matched. Both passwords provide + the correct decryption key that will allow the document to be used with + this library. + + :param str password: The password to match. + :return: ``0`` if the password failed, ``1`` if the password matched the user + password, and ``2`` if the password matched the owner password. + :rtype: int + :raises NotImplementedError: if document uses an unsupported encryption + method. + """ + + self._override_encryption = True + try: + return self._decrypt(password) + finally: + self._override_encryption = False + + def _decrypt(self, password): + encrypt = self.trailer['/Encrypt'].getObject() + if encrypt['/Filter'] != '/Standard': + raise NotImplementedError("only Standard PDF encryption handler is available") + if not (encrypt['/V'] in (1, 2)): + raise NotImplementedError("only algorithm code 1 and 2 are supported. 
This PDF uses code %s" % encrypt['/V']) + user_password, key = self._authenticateUserPassword(password) + if user_password: + self._decryption_key = key + return 1 + else: + rev = encrypt['/R'].getObject() + if rev == 2: + keylen = 5 + else: + keylen = encrypt['/Length'].getObject() // 8 + key = _alg33_1(password, rev, keylen) + real_O = encrypt["/O"].getObject() + if rev == 2: + userpass = utils.RC4_encrypt(key, real_O) + else: + val = real_O + for i in range(19, -1, -1): + new_key = b_('') + for l in range(len(key)): + new_key += b_(chr(utils.ord_(key[l]) ^ i)) + val = utils.RC4_encrypt(new_key, val) + userpass = val + owner_password, key = self._authenticateUserPassword(userpass) + if owner_password: + self._decryption_key = key + return 2 + return 0 + + def _authenticateUserPassword(self, password): + encrypt = self.trailer['/Encrypt'].getObject() + rev = encrypt['/R'].getObject() + owner_entry = encrypt['/O'].getObject() + p_entry = encrypt['/P'].getObject() + id_entry = self.trailer['/ID'].getObject() + id1_entry = id_entry[0].getObject() + real_U = encrypt['/U'].getObject().original_bytes + if rev == 2: + U, key = _alg34(password, owner_entry, p_entry, id1_entry) + elif rev >= 3: + U, key = _alg35(password, rev, + encrypt["/Length"].getObject() // 8, owner_entry, + p_entry, id1_entry, + encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject()) + U, real_U = U[:16], real_U[:16] + return U == real_U, key + + def getIsEncrypted(self): + return "/Encrypt" in self.trailer + + isEncrypted = property(lambda self: self.getIsEncrypted(), None, None) + """ + Read-only boolean property showing whether this PDF file is encrypted. + Note that this property, if true, will remain true even after the + :meth:`decrypt()<PdfFileReader.decrypt>` method is called. + """ + + +def getRectangle(self, name, defaults): + retval = self.get(name) + if isinstance(retval, RectangleObject): + return retval + if retval == None: + for d in defaults: + retval = self.get(d) + if retval != None: + break + if isinstance(retval, IndirectObject): + retval = self.pdf.getObject(retval) + retval = RectangleObject(retval) + setRectangle(self, name, retval) + return retval + + +def setRectangle(self, name, value): + if not isinstance(name, NameObject): + name = NameObject(name) + self[name] = value + + +def deleteRectangle(self, name): + del self[name] + + +def createRectangleAccessor(name, fallback): + return \ + property( + lambda self: getRectangle(self, name, fallback), + lambda self, value: setRectangle(self, name, value), + lambda self: deleteRectangle(self, name) + ) + + +class PageObject(DictionaryObject): + """ + This class represents a single page within a PDF file. Typically this + object will be created by accessing the + :meth:`getPage()<PyPDF2.PdfFileReader.getPage>` method of the + :class:`PdfFileReader<PyPDF2.PdfFileReader>` class, but it is + also possible to create an empty page with the + :meth:`createBlankPage()<PageObject.createBlankPage>` static method. + + :param pdf: PDF file the page belongs to. + :param indirectRef: Stores the original indirect reference to + this object in its source PDF + """ + def __init__(self, pdf=None, indirectRef=None): + DictionaryObject.__init__(self) + self.pdf = pdf + self.indirectRef = indirectRef + + def createBlankPage(pdf=None, width=None, height=None): + """ + Returns a new blank page. + If ``width`` or ``height`` is ``None``, try to get the page size + from the last page of *pdf*. 
+ + :param pdf: PDF file the page belongs to + :param float width: The width of the new page expressed in default user + space units. + :param float height: The height of the new page expressed in default user + space units. + :return: the new blank page: + :rtype: :class:`PageObject<PageObject>` + :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains + no page + """ + page = PageObject(pdf) + + # Creates a new page (cf PDF Reference 7.7.3.3) + page.__setitem__(NameObject('/Type'), NameObject('/Page')) + page.__setitem__(NameObject('/Parent'), NullObject()) + page.__setitem__(NameObject('/Resources'), DictionaryObject()) + if width is None or height is None: + if pdf is not None and pdf.getNumPages() > 0: + lastpage = pdf.getPage(pdf.getNumPages() - 1) + width = lastpage.mediaBox.getWidth() + height = lastpage.mediaBox.getHeight() + else: + raise utils.PageSizeNotDefinedError() + page.__setitem__(NameObject('/MediaBox'), + RectangleObject([0, 0, width, height])) + + return page + createBlankPage = staticmethod(createBlankPage) + + def rotateClockwise(self, angle): + """ + Rotates a page clockwise by increments of 90 degrees. + + :param int angle: Angle to rotate the page. Must be an increment + of 90 deg. + """ + assert angle % 90 == 0 + self._rotate(angle) + return self + + def rotateCounterClockwise(self, angle): + """ + Rotates a page counter-clockwise by increments of 90 degrees. + + :param int angle: Angle to rotate the page. Must be an increment + of 90 deg. + """ + assert angle % 90 == 0 + self._rotate(-angle) + return self + + def _rotate(self, angle): + rotateObj = self.get("/Rotate", 0) + currentAngle = rotateObj if isinstance(rotateObj, int) else rotateObj.getObject() + self[NameObject("/Rotate")] = NumberObject(currentAngle + angle) + + def _mergeResources(res1, res2, resource): + newRes = DictionaryObject() + newRes.update(res1.get(resource, DictionaryObject()).getObject()) + page2Res = res2.get(resource, DictionaryObject()).getObject() + renameRes = {} + for key in list(page2Res.keys()): + if key in newRes and newRes.raw_get(key) != page2Res.raw_get(key): + newname = NameObject(key + str(uuid.uuid4())) + renameRes[key] = newname + newRes[newname] = page2Res[key] + elif key not in newRes: + newRes[key] = page2Res.raw_get(key) + return newRes, renameRes + _mergeResources = staticmethod(_mergeResources) + + def _contentStreamRename(stream, rename, pdf): + if not rename: + return stream + stream = ContentStream(stream, pdf) + for operands, operator in stream.operations: + for i in range(len(operands)): + op = operands[i] + if isinstance(op, NameObject): + operands[i] = rename.get(op,op) + return stream + _contentStreamRename = staticmethod(_contentStreamRename) + + def _pushPopGS(contents, pdf): + # adds a graphics state "push" and "pop" to the beginning and end + # of a content stream. This isolates it from changes such as + # transformation matricies. + stream = ContentStream(contents, pdf) + stream.operations.insert(0, [[], "q"]) + stream.operations.append([[], "Q"]) + return stream + _pushPopGS = staticmethod(_pushPopGS) + + def _addTransformationMatrix(contents, pdf, ctm): + # adds transformation matrix at the beginning of the given + # contents stream. 
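+        # The 6-tuple (a, b, c, d, e, f) is a standard PDF transformation
+        # matrix: it maps x' = a*x + c*y + e and y' = b*x + d*y + f.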
+ a, b, c, d, e, f = ctm + contents = ContentStream(contents, pdf) + contents.operations.insert(0, [[FloatObject(a), FloatObject(b), + FloatObject(c), FloatObject(d), FloatObject(e), + FloatObject(f)], " cm"]) + return contents + _addTransformationMatrix = staticmethod(_addTransformationMatrix) + + def getContents(self): + """ + Accesses the page contents. + + :return: the ``/Contents`` object, or ``None`` if it doesn't exist. + ``/Contents`` is optional, as described in PDF Reference 7.7.3.3 + """ + if "/Contents" in self: + return self["/Contents"].getObject() + else: + return None + + def mergePage(self, page2): + """ + Merges the content streams of two pages into one. Resource references + (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc + of this page are not altered. The parameter page's content stream will + be added to the end of this page's content stream, meaning that it will + be drawn after, or "on top" of this page. + + :param PageObject page2: The page to be merged into this one. Should be + an instance of :class:`PageObject<PageObject>`. + """ + self._mergePage(page2) + + def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False): + # First we work on merging the resource dictionaries. This allows us + # to find out what symbols in the content streams we might need to + # rename. + + newResources = DictionaryObject() + rename = {} + originalResources = self["/Resources"].getObject() + page2Resources = page2["/Resources"].getObject() + newAnnots = ArrayObject() + + for page in (self, page2): + if "/Annots" in page: + annots = page["/Annots"] + if isinstance(annots, ArrayObject): + for ref in annots: + newAnnots.append(ref) + + for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading", "/Properties": + new, newrename = PageObject._mergeResources(originalResources, page2Resources, res) + if new: + newResources[NameObject(res)] = new + rename.update(newrename) + + # Combine /ProcSet sets. 
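+        # (/ProcSet is a flat array of procedure-set names, so a simple set
+        # union suffices; no renaming is needed as for the dictionaries above.)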
+        newResources[NameObject("/ProcSet")] = ArrayObject(
+            frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union(
+                frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject())
+            )
+        )
+
+        newContentArray = ArrayObject()
+
+        originalContent = self.getContents()
+        if originalContent is not None:
+            newContentArray.append(PageObject._pushPopGS(
+                originalContent, self.pdf))
+
+        page2Content = page2.getContents()
+        if page2Content is not None:
+            if page2transformation is not None:
+                page2Content = page2transformation(page2Content)
+            page2Content = PageObject._contentStreamRename(
+                page2Content, rename, self.pdf)
+            page2Content = PageObject._pushPopGS(page2Content, self.pdf)
+            newContentArray.append(page2Content)
+
+        # if expanding the page to fit a new page, calculate the new media box size
+        if expand:
+            corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(),
+                        self.mediaBox.getUpperRight_x().as_numeric(), self.mediaBox.getUpperRight_y().as_numeric()]
+            corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(),
+                        page2.mediaBox.getUpperLeft_x().as_numeric(), page2.mediaBox.getUpperLeft_y().as_numeric(),
+                        page2.mediaBox.getUpperRight_x().as_numeric(), page2.mediaBox.getUpperRight_y().as_numeric(),
+                        page2.mediaBox.getLowerRight_x().as_numeric(), page2.mediaBox.getLowerRight_y().as_numeric()]
+            if ctm is not None:
+                ctm = [float(x) for x in ctm]
+                new_x = [ctm[0]*corners2[i] + ctm[2]*corners2[i+1] + ctm[4] for i in range(0, 8, 2)]
+                new_y = [ctm[1]*corners2[i] + ctm[3]*corners2[i+1] + ctm[5] for i in range(0, 8, 2)]
+            else:
+                new_x = corners2[0:8:2]
+                new_y = corners2[1:8:2]
+            lowerleft = [min(new_x), min(new_y)]
+            upperright = [max(new_x), max(new_y)]
+            lowerleft = [min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])]
+            upperright = [max(corners1[2], upperright[0]), max(corners1[3], upperright[1])]
+
+            self.mediaBox.setLowerLeft(lowerleft)
+            self.mediaBox.setUpperRight(upperright)
+
+        self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
+        self[NameObject('/Resources')] = newResources
+        self[NameObject('/Annots')] = newAnnots
+
+    def mergeTransformedPage(self, page2, ctm, expand=False):
+        """
+        This is similar to mergePage, but a transformation matrix is
+        applied to the merged stream.
+
+        :param PageObject page2: The page to be merged into this one. Should be
+            an instance of :class:`PageObject<PageObject>`.
+        :param tuple ctm: a 6-element tuple containing the operands of the
+            transformation matrix
+        :param bool expand: Whether the page should be expanded to fit the dimensions
+            of the page to be merged.
+        """
+        self._mergePage(page2, lambda page2Content:
+            PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm), ctm, expand)
+
+    def mergeScaledPage(self, page2, scale, expand=False):
+        """
+        This is similar to mergePage, but the stream to be merged is scaled
+        by applying a transformation matrix.
+
+        :param PageObject page2: The page to be merged into this one. Should be
+            an instance of :class:`PageObject<PageObject>`.
+        :param float scale: The scaling factor
+        :param bool expand: Whether the page should be expanded to fit the
+            dimensions of the page to be merged.
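+
+        A short sketch of typical use (``watermark`` is an illustrative
+        :class:`PageObject<PageObject>` taken from another reader)::
+
+            page.mergeScaledPage(watermark, 0.5, expand=False)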
+ """ + # CTM to scale : [ sx 0 0 sy 0 0 ] + return self.mergeTransformedPage(page2, [scale, 0, + 0, scale, + 0, 0], expand) + + def mergeRotatedPage(self, page2, rotation, expand=False): + """ + This is similar to mergePage, but the stream to be merged is rotated + by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject<PageObject>`. + :param float rotation: The angle of the rotation, in degrees + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + """ + rotation = math.radians(rotation) + return self.mergeTransformedPage(page2, + [math.cos(rotation), math.sin(rotation), + -math.sin(rotation), math.cos(rotation), + 0, 0], expand) + + def mergeTranslatedPage(self, page2, tx, ty, expand=False): + """ + This is similar to mergePage, but the stream to be merged is translated + by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject<PageObject>`. + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + """ + return self.mergeTransformedPage(page2, [1, 0, + 0, 1, + tx, ty], expand) + + def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False): + """ + This is similar to mergePage, but the stream to be merged is rotated + and translated by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject<PageObject>`. + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param float rotation: The angle of the rotation, in degrees + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + """ + + translation = [[1, 0, 0], + [0, 1, 0], + [-tx, -ty, 1]] + rotation = math.radians(rotation) + rotating = [[math.cos(rotation), math.sin(rotation), 0], + [-math.sin(rotation), math.cos(rotation), 0], + [0, 0, 1]] + rtranslation = [[1, 0, 0], + [0, 1, 0], + [tx, ty, 1]] + ctm = utils.matrixMultiply(translation, rotating) + ctm = utils.matrixMultiply(ctm, rtranslation) + + return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], + ctm[1][0], ctm[1][1], + ctm[2][0], ctm[2][1]], expand) + + def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False): + """ + This is similar to mergePage, but the stream to be merged is rotated + and scaled by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject<PageObject>`. + :param float rotation: The angle of the rotation, in degrees + :param float scale: The scaling factor + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. 
+ """ + rotation = math.radians(rotation) + rotating = [[math.cos(rotation), math.sin(rotation), 0], + [-math.sin(rotation), math.cos(rotation), 0], + [0, 0, 1]] + scaling = [[scale, 0, 0], + [0, scale, 0], + [0, 0, 1]] + ctm = utils.matrixMultiply(rotating, scaling) + + return self.mergeTransformedPage(page2, + [ctm[0][0], ctm[0][1], + ctm[1][0], ctm[1][1], + ctm[2][0], ctm[2][1]], expand) + + def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False): + """ + This is similar to mergePage, but the stream to be merged is translated + and scaled by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject<PageObject>`. + :param float scale: The scaling factor + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + """ + + translation = [[1, 0, 0], + [0, 1, 0], + [tx, ty, 1]] + scaling = [[scale, 0, 0], + [0, scale, 0], + [0, 0, 1]] + ctm = utils.matrixMultiply(scaling, translation) + + return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], + ctm[1][0], ctm[1][1], + ctm[2][0], ctm[2][1]], expand) + + def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False): + """ + This is similar to mergePage, but the stream to be merged is translated, + rotated and scaled by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject<PageObject>`. + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param float rotation: The angle of the rotation, in degrees + :param float scale: The scaling factor + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + """ + translation = [[1, 0, 0], + [0, 1, 0], + [tx, ty, 1]] + rotation = math.radians(rotation) + rotating = [[math.cos(rotation), math.sin(rotation), 0], + [-math.sin(rotation), math.cos(rotation), 0], + [0, 0, 1]] + scaling = [[scale, 0, 0], + [0, scale, 0], + [0, 0, 1]] + ctm = utils.matrixMultiply(rotating, scaling) + ctm = utils.matrixMultiply(ctm, translation) + + return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], + ctm[1][0], ctm[1][1], + ctm[2][0], ctm[2][1]], expand) + + ## + # Applys a transformation matrix the page. + # + # @param ctm A 6 elements tuple containing the operands of the + # transformation matrix + def addTransformation(self, ctm): + """ + Applies a transformation matrix to the page. + + :param tuple ctm: A 6-element tuple containing the operands of the + transformation matrix. + """ + originalContent = self.getContents() + if originalContent is not None: + newContent = PageObject._addTransformationMatrix( + originalContent, self.pdf, ctm) + newContent = PageObject._pushPopGS(newContent, self.pdf) + self[NameObject('/Contents')] = newContent + + def scale(self, sx, sy): + """ + Scales a page by the given factors by appling a transformation + matrix to its content and updating the page size. + + :param float sx: The scaling factor on horizontal axis. + :param float sy: The scaling factor on vertical axis. 
+ """ + self.addTransformation([sx, 0, + 0, sy, + 0, 0]) + self.mediaBox = RectangleObject([ + float(self.mediaBox.getLowerLeft_x()) * sx, + float(self.mediaBox.getLowerLeft_y()) * sy, + float(self.mediaBox.getUpperRight_x()) * sx, + float(self.mediaBox.getUpperRight_y()) * sy]) + if "/VP" in self: + viewport = self["/VP"] + if isinstance(viewport, ArrayObject): + bbox = viewport[0]["/BBox"] + else: + bbox = viewport["/BBox"] + scaled_bbox = RectangleObject([ + float(bbox[0]) * sx, + float(bbox[1]) * sy, + float(bbox[2]) * sx, + float(bbox[3]) * sy]) + if isinstance(viewport, ArrayObject): + self[NameObject("/VP")][NumberObject(0)][NameObject("/BBox")] = scaled_bbox + else: + self[NameObject("/VP")][NameObject("/BBox")] = scaled_bbox + + def scaleBy(self, factor): + """ + Scales a page by the given factor by appling a transformation + matrix to its content and updating the page size. + + :param float factor: The scaling factor (for both X and Y axis). + """ + self.scale(factor, factor) + + def scaleTo(self, width, height): + """ + Scales a page to the specified dimentions by appling a + transformation matrix to its content and updating the page size. + + :param float width: The new width. + :param float height: The new heigth. + """ + sx = width / float(self.mediaBox.getUpperRight_x() - + self.mediaBox.getLowerLeft_x ()) + sy = height / float(self.mediaBox.getUpperRight_y() - + self.mediaBox.getLowerLeft_y ()) + self.scale(sx, sy) + + def compressContentStreams(self): + """ + Compresses the size of this page by joining all content streams and + applying a FlateDecode filter. + + However, it is possible that this function will perform no action if + content stream compression becomes "automatic" for some reason. + """ + content = self.getContents() + if content is not None: + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf) + self[NameObject("/Contents")] = content.flateEncode() + + def extractText(self): + """ + Locate all text drawing commands, in the order they are provided in the + content stream, and extract the text. This works well for some PDF + files, but poorly for others, depending on the generator used. This will + be refined in the future. Do not rely on the order of text coming out of + this function, as it will change if this function is made more + sophisticated. + + :return: a unicode string object. + """ + text = u_("") + content = self["/Contents"].getObject() + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf) + # Note: we check all strings are TextStringObjects. ByteStringObjects + # are strings where the byte->string encoding was unknown, so adding + # them to the text here would be gibberish. 
+ for operands, operator in content.operations: + if operator == b_("Tj"): + _text = operands[0] + if isinstance(_text, TextStringObject): + text += _text + text += "\n" + elif operator == b_("T*"): + text += "\n" + elif operator == b_("'"): + text += "\n" + _text = operands[0] + if isinstance(_text, TextStringObject): + text += operands[0] + elif operator == b_('"'): + _text = operands[2] + if isinstance(_text, TextStringObject): + text += "\n" + text += _text + elif operator == b_("TJ"): + for i in operands[0]: + if isinstance(i, TextStringObject): + text += i + text += "\n" + return text + + mediaBox = createRectangleAccessor("/MediaBox", ()) + """ + A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, + defining the boundaries of the physical medium on which the page is + intended to be displayed or printed. + """ + + cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",)) + """ + A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, + defining the visible region of default user space. When the page is + displayed or printed, its contents are to be clipped (cropped) to this + rectangle and then imposed on the output medium in some + implementation-defined manner. Default value: same as :attr:`mediaBox<mediaBox>`. + """ + + bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox")) + """ + A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, + defining the region to which the contents of the page should be clipped + when output in a production enviroment. + """ + + trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox")) + """ + A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, + defining the intended dimensions of the finished page after trimming. + """ + + artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox")) + """ + A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, + defining the extent of the page's meaningful content as intended by the + page's creator. + """ + + +class ContentStream(DecodedStreamObject): + def __init__(self, stream, pdf): + self.pdf = pdf + self.operations = [] + # stream may be a StreamObject or an ArrayObject containing + # multiple StreamObjects to be cat'd together. + stream = stream.getObject() + if isinstance(stream, ArrayObject): + data = b_("") + for s in stream: + data += b_(s.getObject().getData()) + stream = BytesIO(b_(data)) + else: + stream = BytesIO(b_(stream.getData())) + self.__parseContentStream(stream) + + def __parseContentStream(self, stream): + # file("f:\\tmp.txt", "w").write(stream.read()) + stream.seek(0, 0) + operands = [] + while True: + peek = readNonWhitespace(stream) + if peek == b_('') or ord_(peek) == 0: + break + stream.seek(-1, 1) + if peek.isalpha() or peek == b_("'") or peek == b_('"'): + operator = utils.readUntilRegex(stream, + NameObject.delimiterPattern, True) + if operator == b_("BI"): + # begin inline image - a completely different parsing + # mechanism is required, of course... thanks buddy... + assert operands == [] + ii = self._readInlineImage(stream) + self.operations.append((ii, b_("INLINE IMAGE"))) + else: + self.operations.append((operands, operator)) + operands = [] + elif peek == b_('%'): + # If we encounter a comment in the content stream, we have to + # handle it here. 
Typically, readObject will handle + # encountering a comment -- but readObject assumes that + # following the comment must be the object we're trying to + # read. In this case, it could be an operator instead. + while peek not in (b_('\r'), b_('\n')): + peek = stream.read(1) + else: + operands.append(readObject(stream, None)) + + def _readInlineImage(self, stream): + # begin reading just after the "BI" - begin image + # first read the dictionary of settings. + settings = DictionaryObject() + while True: + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + if tok == b_("I"): + # "ID" - begin of image data + break + key = readObject(stream, self.pdf) + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + value = readObject(stream, self.pdf) + settings[key] = value + # left at beginning of ID + tmp = stream.read(3) + assert tmp[:2] == b_("ID") + data = b_("") + while True: + # Read the inline image, while checking for EI (End Image) operator. + tok = stream.read(1) + if tok == b_("E"): + # Check for End Image + tok2 = stream.read(1) + if tok2 == b_("I"): + # Data can contain EI, so check for the Q operator. + tok3 = stream.read(1) + info = tok + tok2 + # We need to find whitespace between EI and Q. + has_q_whitespace = False + while tok3 in utils.WHITESPACES: + has_q_whitespace = True + info += tok3 + tok3 = stream.read(1) + if tok3 == b_("Q") and has_q_whitespace: + stream.seek(-1, 1) + break + else: + stream.seek(-1,1) + data += info + else: + stream.seek(-1, 1) + data += tok + else: + data += tok + return {"settings": settings, "data": data} + + def _getData(self): + newdata = BytesIO() + for operands, operator in self.operations: + if operator == b_("INLINE IMAGE"): + newdata.write(b_("BI")) + dicttext = BytesIO() + operands["settings"].writeToStream(dicttext, None) + newdata.write(dicttext.getvalue()[2:-2]) + newdata.write(b_("ID ")) + newdata.write(operands["data"]) + newdata.write(b_("EI")) + else: + for op in operands: + op.writeToStream(newdata, None) + newdata.write(b_(" ")) + newdata.write(b_(operator)) + newdata.write(b_("\n")) + return newdata.getvalue() + + def _setData(self, value): + self.__parseContentStream(BytesIO(b_(value))) + + _data = property(_getData, _setData) + + +class DocumentInformation(DictionaryObject): + """ + A class representing the basic document metadata provided in a PDF File. + This class is accessible through + :meth:`getDocumentInfo()<PyPDF2.PdfFileReader.getDocumentInfo()>` + + All text properties of the document metadata have + *two* properties, eg. author and author_raw. The non-raw property will + always return a ``TextStringObject``, making it ideal for a case where + the metadata is being displayed. The raw property can sometimes return + a ``ByteStringObject``, if PyPDF2 was unable to decode the string's + text encoding; this requires additional safety in the caller and + therefore is not as commonly accessed. + """ + + def __init__(self): + DictionaryObject.__init__(self) + + def getText(self, key): + retval = self.get(key, None) + if isinstance(retval, TextStringObject): + return retval + return None + + title = property(lambda self: self.getText("/Title")) + """Read-only property accessing the document's **title**. 
+ Returns a unicode string (``TextStringObject``) or ``None`` + if the title is not specified.""" + title_raw = property(lambda self: self.get("/Title")) + """The "raw" version of title; can return a ``ByteStringObject``.""" + + author = property(lambda self: self.getText("/Author")) + """Read-only property accessing the document's **author**. + Returns a unicode string (``TextStringObject``) or ``None`` + if the author is not specified.""" + author_raw = property(lambda self: self.get("/Author")) + """The "raw" version of author; can return a ``ByteStringObject``.""" + + subject = property(lambda self: self.getText("/Subject")) + """Read-only property accessing the document's **subject**. + Returns a unicode string (``TextStringObject``) or ``None`` + if the subject is not specified.""" + subject_raw = property(lambda self: self.get("/Subject")) + """The "raw" version of subject; can return a ``ByteStringObject``.""" + + creator = property(lambda self: self.getText("/Creator")) + """Read-only property accessing the document's **creator**. If the + document was converted to PDF from another format, this is the name of the + application (e.g. OpenOffice) that created the original document from + which it was converted. Returns a unicode string (``TextStringObject``) + or ``None`` if the creator is not specified.""" + creator_raw = property(lambda self: self.get("/Creator")) + """The "raw" version of creator; can return a ``ByteStringObject``.""" + + producer = property(lambda self: self.getText("/Producer")) + """Read-only property accessing the document's **producer**. + If the document was converted to PDF from another format, this is + the name of the application (for example, OSX Quartz) that converted + it to PDF. Returns a unicode string (``TextStringObject``) + or ``None`` if the producer is not specified.""" + producer_raw = property(lambda self: self.get("/Producer")) + """The "raw" version of producer; can return a ``ByteStringObject``.""" + + +def convertToInt(d, size): + if size > 8: + raise utils.PdfReadError("invalid size in convertToInt") + d = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d) + d = d[-8:] + return struct.unpack(">q", d)[0] + +# ref: pdf1.8 spec section 3.5.2 algorithm 3.2 +_encryption_padding = b_('\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56') + \ + b_('\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c') + \ + b_('\xa9\xfe\x64\x53\x69\x7a') + + +# Implementation of algorithm 3.2 of the PDF standard security handler, +# section 3.5.2 of the PDF 1.6 reference. +def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True): + # 1. Pad or truncate the password string to exactly 32 bytes. If the + # password string is more than 32 bytes long, use only its first 32 bytes; + # if it is less than 32 bytes long, pad it by appending the required number + # of additional bytes from the beginning of the padding string + # (_encryption_padding). + password = b_((str_(password) + str_(_encryption_padding))[:32]) + # 2. Initialize the MD5 hash function and pass the result of step 1 as + # input to this function. + import struct + m = md5(password) + # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash + # function. + m.update(owner_entry.original_bytes) + # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass + # these bytes to the MD5 hash function, low-order byte first. + p_entry = struct.pack('<i', p_entry) + m.update(p_entry) + # 5. 
Pass the first element of the file's file identifier array to the MD5 + # hash function. + m.update(id1_entry.original_bytes) + # 6. (Revision 3 or greater) If document metadata is not being encrypted, + # pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function. + if rev >= 3 and not metadata_encrypt: + m.update(b_("\xff\xff\xff\xff")) + # 7. Finish the hash. + md5_hash = m.digest() + # 8. (Revision 3 or greater) Do the following 50 times: Take the output + # from the previous MD5 hash and pass the first n bytes of the output as + # input into a new MD5 hash, where n is the number of bytes of the + # encryption key as defined by the value of the encryption dictionary's + # /Length entry. + if rev >= 3: + for i in range(50): + md5_hash = md5(md5_hash[:keylen]).digest() + # 9. Set the encryption key to the first n bytes of the output from the + # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or + # greater, depends on the value of the encryption dictionary's /Length + # entry. + return md5_hash[:keylen] + + +# Implementation of algorithm 3.3 of the PDF standard security handler, +# section 3.5.2 of the PDF 1.6 reference. +def _alg33(owner_pwd, user_pwd, rev, keylen): + # steps 1 - 4 + key = _alg33_1(owner_pwd, rev, keylen) + # 5. Pad or truncate the user password string as described in step 1 of + # algorithm 3.2. + user_pwd = b_((user_pwd + str_(_encryption_padding))[:32]) + # 6. Encrypt the result of step 5, using an RC4 encryption function with + # the encryption key obtained in step 4. + val = utils.RC4_encrypt(key, user_pwd) + # 7. (Revision 3 or greater) Do the following 19 times: Take the output + # from the previous invocation of the RC4 function and pass it as input to + # a new invocation of the function; use an encryption key generated by + # taking each byte of the encryption key obtained in step 4 and performing + # an XOR operation between that byte and the single-byte value of the + # iteration counter (from 1 to 19). + if rev >= 3: + for i in range(1, 20): + new_key = '' + for l in range(len(key)): + new_key += chr(ord_(key[l]) ^ i) + val = utils.RC4_encrypt(new_key, val) + # 8. Store the output from the final invocation of the RC4 as the value of + # the /O entry in the encryption dictionary. + return val + + +# Steps 1-4 of algorithm 3.3 +def _alg33_1(password, rev, keylen): + # 1. Pad or truncate the owner password string as described in step 1 of + # algorithm 3.2. If there is no owner password, use the user password + # instead. + password = b_((password + str_(_encryption_padding))[:32]) + # 2. Initialize the MD5 hash function and pass the result of step 1 as + # input to this function. + m = md5(password) + # 3. (Revision 3 or greater) Do the following 50 times: Take the output + # from the previous MD5 hash and pass it as input into a new MD5 hash. + md5_hash = m.digest() + if rev >= 3: + for i in range(50): + md5_hash = md5(md5_hash).digest() + # 4. Create an RC4 encryption key using the first n bytes of the output + # from the final MD5 hash, where n is always 5 for revision 2 but, for + # revision 3 or greater, depends on the value of the encryption + # dictionary's /Length entry. + key = md5_hash[:keylen] + return key + + +# Implementation of algorithm 3.4 of the PDF standard security handler, +# section 3.5.2 of the PDF 1.6 reference. +def _alg34(password, owner_entry, p_entry, id1_entry): + # 1. Create an encryption key based on the user password string, as + # described in algorithm 3.2. 
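+    # (Revision 2 always uses a 40-bit key, hence rev=2 and keylen=5 here.)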
+ key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry) + # 2. Encrypt the 32-byte padding string shown in step 1 of algorithm 3.2, + # using an RC4 encryption function with the encryption key from the + # preceding step. + U = utils.RC4_encrypt(key, _encryption_padding) + # 3. Store the result of step 2 as the value of the /U entry in the + # encryption dictionary. + return U, key + + +# Implementation of algorithm 3.4 of the PDF standard security handler, +# section 3.5.2 of the PDF 1.6 reference. +def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt): + # 1. Create an encryption key based on the user password string, as + # described in Algorithm 3.2. + key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) + # 2. Initialize the MD5 hash function and pass the 32-byte padding string + # shown in step 1 of Algorithm 3.2 as input to this function. + m = md5() + m.update(_encryption_padding) + # 3. Pass the first element of the file's file identifier array (the value + # of the ID entry in the document's trailer dictionary; see Table 3.13 on + # page 73) to the hash function and finish the hash. (See implementation + # note 25 in Appendix H.) + m.update(id1_entry.original_bytes) + md5_hash = m.digest() + # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption + # function with the encryption key from step 1. + val = utils.RC4_encrypt(key, md5_hash) + # 5. Do the following 19 times: Take the output from the previous + # invocation of the RC4 function and pass it as input to a new invocation + # of the function; use an encryption key generated by taking each byte of + # the original encryption key (obtained in step 2) and performing an XOR + # operation between that byte and the single-byte value of the iteration + # counter (from 1 to 19). + for i in range(1, 20): + new_key = b_('') + for l in range(len(key)): + new_key += b_(chr(ord_(key[l]) ^ i)) + val = utils.RC4_encrypt(new_key, val) + # 6. Append 16 bytes of arbitrary padding to the output from the final + # invocation of the RC4 function and store the 32-byte result as the value + # of the U entry in the encryption dictionary. + # (implementator note: I don't know what "arbitrary padding" is supposed to + # mean, so I have used null bytes. This seems to match a few other + # people's implementations) + return val + (b_('\x00') * 16), key diff --git a/PdfFileTransformer/PyPDF2/utils.py b/PdfFileTransformer/PyPDF2/utils.py new file mode 100644 index 0000000..2120c70 --- /dev/null +++ b/PdfFileTransformer/PyPDF2/utils.py @@ -0,0 +1,309 @@ +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +""" +Utility functions for PDF library. +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + + +import sys + +try: + import __builtin__ as builtins +except ImportError: # Py3 + import builtins + + +xrange_fn = getattr(builtins, "xrange", range) +_basestring = getattr(builtins, "basestring", str) + +bytes_type = type(bytes()) # Works the same in Python 2.X and 3.X +string_type = getattr(builtins, "unicode", str) +int_types = (int, long) if sys.version_info[0] < 3 else (int,) + + +# Make basic type tests more consistent +def isString(s): + """Test if arg is a string. Compatible with Python 2 and 3.""" + return isinstance(s, _basestring) + + +def isInt(n): + """Test if arg is an int. Compatible with Python 2 and 3.""" + return isinstance(n, int_types) + + +def isBytes(b): + """Test if arg is a bytes instance. Compatible with Python 2 and 3.""" + return isinstance(b, bytes_type) + + +#custom implementation of warnings.formatwarning +def formatWarning(message, category, filename, lineno, line=None): + file = filename.replace("/", "\\").rsplit("\\", 1)[1] # find the file name + return "%s: %s [%s:%s]\n" % (category.__name__, message, file, lineno) + + +def readUntilWhitespace(stream, maxchars=None): + """ + Reads non-whitespace characters and returns them. + Stops upon encountering whitespace or when maxchars is reached. + """ + txt = b_("") + while True: + tok = stream.read(1) + if tok.isspace() or not tok: + break + txt += tok + if len(txt) == maxchars: + break + return txt + + +def readNonWhitespace(stream): + """ + Finds and reads the next non-whitespace character (ignores whitespace). + """ + tok = WHITESPACES[0] + while tok in WHITESPACES: + tok = stream.read(1) + return tok + + +def skipOverWhitespace(stream): + """ + Similar to readNonWhitespace, but returns a Boolean if more than + one whitespace character was read. + """ + tok = WHITESPACES[0] + cnt = 0; + while tok in WHITESPACES: + tok = stream.read(1) + cnt+=1 + return (cnt > 1) + + +def skipOverComment(stream): + tok = stream.read(1) + stream.seek(-1, 1) + if tok == b_('%'): + while tok not in (b_('\n'), b_('\r')): + tok = stream.read(1) + + +def readUntilRegex(stream, regex, ignore_eof=False): + """ + Reads until the regular expression pattern matched (ignore the match) + Raise PdfStreamError on premature end-of-file. 
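+    :param stream: stream to read from; must support read() and seek()
+    :param regex: compiled regular expression object to match against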
+ :param bool ignore_eof: If true, ignore end-of-line and return immediately + """ + name = b_('') + while True: + tok = stream.read(16) + if not tok: + # stream has truncated prematurely + if ignore_eof == True: + return name + else: + raise PdfStreamError("Stream has ended unexpectedly") + m = regex.search(tok) + if m is not None: + name += tok[:m.start()] + stream.seek(m.start()-len(tok), 1) + break + name += tok + return name + + +class ConvertFunctionsToVirtualList(object): + def __init__(self, lengthFunction, getFunction): + self.lengthFunction = lengthFunction + self.getFunction = getFunction + + def __len__(self): + return self.lengthFunction() + + def __getitem__(self, index): + if isinstance(index, slice): + indices = xrange_fn(*index.indices(len(self))) + cls = type(self) + return cls(indices.__len__, lambda idx: self[indices[idx]]) + if not isInt(index): + raise TypeError("sequence indices must be integers") + len_self = len(self) + if index < 0: + # support negative indexes + index = len_self + index + if index < 0 or index >= len_self: + raise IndexError("sequence index out of range") + return self.getFunction(index) + + +def RC4_encrypt(key, plaintext): + S = [i for i in range(256)] + j = 0 + for i in range(256): + j = (j + S[i] + ord_(key[i % len(key)])) % 256 + S[i], S[j] = S[j], S[i] + i, j = 0, 0 + retval = [] + for x in range(len(plaintext)): + i = (i + 1) % 256 + j = (j + S[i]) % 256 + S[i], S[j] = S[j], S[i] + t = S[(S[i] + S[j]) % 256] + retval.append(b_(chr(ord_(plaintext[x]) ^ t))) + return b_("").join(retval) + + +def matrixMultiply(a, b): + return [[sum([float(i)*float(j) + for i, j in zip(row, col)] + ) for col in zip(*b)] + for row in a] + + +def markLocation(stream): + """Creates text file showing current location in context.""" + # Mainly for debugging + RADIUS = 5000 + stream.seek(-RADIUS, 1) + outputDoc = open('PyPDF2_pdfLocation.txt', 'w') + outputDoc.write(stream.read(RADIUS)) + outputDoc.write('HERE') + outputDoc.write(stream.read(RADIUS)) + outputDoc.close() + stream.seek(-RADIUS, 1) + + +class PyPdfError(Exception): + pass + + +class PdfReadError(PyPdfError): + pass + + +class PageSizeNotDefinedError(PyPdfError): + pass + + +class PdfReadWarning(UserWarning): + pass + + +class PdfStreamError(PdfReadError): + pass + + +if sys.version_info[0] < 3: + def b_(s): + return s +else: + B_CACHE = {} + + def b_(s): + bc = B_CACHE + if s in bc: + return bc[s] + if type(s) == bytes: + return s + else: + r = s.encode('latin-1') + if len(s) < 2: + bc[s] = r + return r + + +def u_(s): + if sys.version_info[0] < 3: + return unicode(s, 'unicode_escape') + else: + return s + + +def str_(b): + if sys.version_info[0] < 3: + return b + else: + if type(b) == bytes: + return b.decode('latin-1') + else: + return b + + +def ord_(b): + if sys.version_info[0] < 3 or type(b) == str: + return ord(b) + else: + return b + + +def chr_(c): + if sys.version_info[0] < 3: + return c + else: + return chr(c) + + +def barray(b): + if sys.version_info[0] < 3: + return b + else: + return bytearray(b) + + +def hexencode(b): + if sys.version_info[0] < 3: + return b.encode('hex') + else: + import codecs + coder = codecs.getencoder('hex_codec') + return coder(b)[0] + + +def hexStr(num): + return hex(num).replace('L', '') + + +WHITESPACES = [b_(x) for x in [' ', '\n', '\r', '\t', '\x00']] + + +def paethPredictor(left, up, up_left): + p = left + up - up_left + dist_left = abs(p - left) + dist_up = abs(p - up) + dist_up_left = abs(p - up_left) + + if dist_left <= dist_up and dist_left <= 
dist_up_left: + return left + elif dist_up <= dist_up_left: + return up + else: + return up_left diff --git a/PdfFileTransformer/PyPDF2/xmp.py b/PdfFileTransformer/PyPDF2/xmp.py new file mode 100644 index 0000000..7ba62f0 --- /dev/null +++ b/PdfFileTransformer/PyPDF2/xmp.py @@ -0,0 +1,358 @@ +import re +import datetime +import decimal +from .generic import PdfObject +from xml.dom import getDOMImplementation +from xml.dom.minidom import parseString +from .utils import u_ + +RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" +DC_NAMESPACE = "http://purl.org/dc/elements/1.1/" +XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/" +PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/" +XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/" + +# What is the PDFX namespace, you might ask? I might ask that too. It's +# a completely undocumented namespace used to place "custom metadata" +# properties, which are arbitrary metadata properties with no semantic or +# documented meaning. Elements in the namespace are key/value-style storage, +# where the element name is the key and the content is the value. The keys +# are transformed into valid XML identifiers by substituting an invalid +# identifier character with \u2182 followed by the unicode hex ID of the +# original character. A key like "my car" is therefore "my\u21820020car". +# +# \u2182, in case you're wondering, is the unicode character +# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for +# escaping characters. +# +# Intentional users of the pdfx namespace should be shot on sight. A +# custom data schema and sensical XML elements could be used instead, as is +# suggested by Adobe's own documentation on XMP (under "Extensibility of +# Schemas"). +# +# Information presented here on the /pdfx/ schema is a result of limited +# reverse engineering, and does not constitute a full specification. +PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/" + +iso8601 = re.compile(""" + (?P<year>[0-9]{4}) + (- + (?P<month>[0-9]{2}) + (- + (?P<day>[0-9]+) + (T + (?P<hour>[0-9]{2}): + (?P<minute>[0-9]{2}) + (:(?P<second>[0-9]{2}(.[0-9]+)?))? + (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2}) + )? + )? + )? + """, re.VERBOSE) + + +class XmpInformation(PdfObject): + """ + An object that represents Adobe XMP metadata. 
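+    XMP is an RDF/XML based metadata format; this class parses the raw XML
+    stream and exposes the common namespaces (Dublin Core, Adobe PDF, XMP
+    Basic, XMP Media Management) as read-only properties.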
+ Usually accessed by :meth:`getXmpMetadata()<PyPDF2.PdfFileReader.getXmpMetadata>` + """ + + def __init__(self, stream): + self.stream = stream + docRoot = parseString(self.stream.getData()) + self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0] + self.cache = {} + + def writeToStream(self, stream, encryption_key): + self.stream.writeToStream(stream, encryption_key) + + def getElement(self, aboutUri, namespace, name): + for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"): + if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri: + attr = desc.getAttributeNodeNS(namespace, name) + if attr != None: + yield attr + for element in desc.getElementsByTagNameNS(namespace, name): + yield element + + def getNodesInNamespace(self, aboutUri, namespace): + for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"): + if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri: + for i in range(desc.attributes.length): + attr = desc.attributes.item(i) + if attr.namespaceURI == namespace: + yield attr + for child in desc.childNodes: + if child.namespaceURI == namespace: + yield child + + def _getText(self, element): + text = "" + for child in element.childNodes: + if child.nodeType == child.TEXT_NODE: + text += child.data + return text + + def _converter_string(value): + return value + + def _converter_date(value): + m = iso8601.match(value) + year = int(m.group("year")) + month = int(m.group("month") or "1") + day = int(m.group("day") or "1") + hour = int(m.group("hour") or "0") + minute = int(m.group("minute") or "0") + second = decimal.Decimal(m.group("second") or "0") + seconds = second.to_integral(decimal.ROUND_FLOOR) + milliseconds = (second - seconds) * 1000000 + tzd = m.group("tzd") or "Z" + dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds) + if tzd != "Z": + tzd_hours, tzd_minutes = [int(x) for x in tzd.split(":")] + tzd_hours *= -1 + if tzd_hours < 0: + tzd_minutes *= -1 + dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes) + return dt + _test_converter_date = staticmethod(_converter_date) + + def _getter_bag(namespace, name, converter): + def get(self): + cached = self.cache.get(namespace, {}).get(name) + if cached: + return cached + retval = [] + for element in self.getElement("", namespace, name): + bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag") + if len(bags): + for bag in bags: + for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"): + value = self._getText(item) + value = converter(value) + retval.append(value) + ns_cache = self.cache.setdefault(namespace, {}) + ns_cache[name] = retval + return retval + return get + + def _getter_seq(namespace, name, converter): + def get(self): + cached = self.cache.get(namespace, {}).get(name) + if cached: + return cached + retval = [] + for element in self.getElement("", namespace, name): + seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq") + if len(seqs): + for seq in seqs: + for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"): + value = self._getText(item) + value = converter(value) + retval.append(value) + else: + value = converter(self._getText(element)) + retval.append(value) + ns_cache = self.cache.setdefault(namespace, {}) + ns_cache[name] = retval + return retval + return get + + def _getter_langalt(namespace, name, converter): + def get(self): + cached = self.cache.get(namespace, {}).get(name) + if cached: + return cached + retval = {} + for element in self.getElement("", namespace, name): + alts = 
element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt") + if len(alts): + for alt in alts: + for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"): + value = self._getText(item) + value = converter(value) + retval[item.getAttribute("xml:lang")] = value + else: + retval["x-default"] = converter(self._getText(element)) + ns_cache = self.cache.setdefault(namespace, {}) + ns_cache[name] = retval + return retval + return get + + def _getter_single(namespace, name, converter): + def get(self): + cached = self.cache.get(namespace, {}).get(name) + if cached: + return cached + value = None + for element in self.getElement("", namespace, name): + if element.nodeType == element.ATTRIBUTE_NODE: + value = element.nodeValue + else: + value = self._getText(element) + break + if value != None: + value = converter(value) + ns_cache = self.cache.setdefault(namespace, {}) + ns_cache[name] = value + return value + return get + + dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string)) + """ + Contributors to the resource (other than the authors). An unsorted + array of names. + """ + + dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string)) + """ + Text describing the extent or scope of the resource. + """ + + dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string)) + """ + A sorted array of names of the authors of the resource, listed in order + of precedence. + """ + + dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date)) + """ + A sorted array of dates (datetime.datetime instances) of signifigance to + the resource. The dates and times are in UTC. + """ + + dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string)) + """ + A language-keyed dictionary of textual descriptions of the content of the + resource. + """ + + dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string)) + """ + The mime-type of the resource. + """ + + dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string)) + """ + Unique identifier of the resource. + """ + + dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string)) + """ + An unordered array specifying the languages used in the resource. + """ + + dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string)) + """ + An unordered array of publisher names. + """ + + dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string)) + """ + An unordered array of text descriptions of relationships to other + documents. + """ + + dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string)) + """ + A language-keyed dictionary of textual descriptions of the rights the + user has to this resource. + """ + + dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string)) + """ + Unique identifier of the work from which this resource was derived. + """ + + dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string)) + """ + An unordered array of descriptive phrases or keywrods that specify the + topic of the content of the resource. + """ + + dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string)) + """ + A language-keyed dictionary of the title of the resource. + """ + + dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string)) + """ + An unordered array of textual descriptions of the document type. 
+ """ + + pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string)) + """ + An unformatted text string representing document keywords. + """ + + pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string)) + """ + The PDF file version, for example 1.0, 1.3. + """ + + pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string)) + """ + The name of the tool that created the PDF document. + """ + + xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date)) + """ + The date and time the resource was originally created. The date and + time are returned as a UTC datetime.datetime object. + """ + + xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date)) + """ + The date and time the resource was last modified. The date and time + are returned as a UTC datetime.datetime object. + """ + + xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date)) + """ + The date and time that any metadata for this resource was last + changed. The date and time are returned as a UTC datetime.datetime + object. + """ + + xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string)) + """ + The name of the first known tool used to create the resource. + """ + + xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string)) + """ + The common identifier for all versions and renditions of this resource. + """ + + xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string)) + """ + An identifier for a specific incarnation of a document, updated each + time a file is saved. + """ + + def custom_properties(self): + if not hasattr(self, "_custom_properties"): + self._custom_properties = {} + for node in self.getNodesInNamespace("", PDFX_NAMESPACE): + key = node.localName + while True: + # see documentation about PDFX_NAMESPACE earlier in file + idx = key.find(u_("\u2182")) + if idx == -1: + break + key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:] + if node.nodeType == node.ATTRIBUTE_NODE: + value = node.nodeValue + else: + value = self._getText(node) + self._custom_properties[key] = value + return self._custom_properties + + custom_properties = property(custom_properties) + """ + Retrieves custom metadata properties defined in the undocumented pdfx + metadata schema. + + :return: a dictionary of key/value items for custom metadata properties. + :rtype: dict + """ diff --git a/PdfFileTransformer/__init__.py b/PdfFileTransformer/__init__.py new file mode 100644 index 0000000..f6d0a4d --- /dev/null +++ b/PdfFileTransformer/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- + +from .PyPDF2 import PdfFileReader, PdfFileWriter +from .pdf import Pdf
\ No newline at end of file diff --git a/PdfFileTransformer/pdf.py b/PdfFileTransformer/pdf.py new file mode 100644 index 0000000..c93fb61 --- /dev/null +++ b/PdfFileTransformer/pdf.py @@ -0,0 +1,352 @@ +# -*- coding: utf-8 -*- + +import logging +import re +import tempfile +from .PyPDF2 import PdfFileWriter, PdfFileReader + + +class Pdf: + + def __init__(self, filename): + self.filename = filename + self.buffer = bytearray() + self.objects = [] # [(7,0,b"data"), (8,0,b"data2"), ..] + self.trailer = {} # {Root: (7, 0), Info: (5, 0)} + self.translation_table = {} # {(6,0):7, (5,0): 8}, ..] + self.original_xref_offset = 0 + self.original_first_obj_offset = 0 + self.file_offset = 0 + + self.clean_and_read_pdf() + self.check_pdf_header() + self.parse_xref_offset() + self.parse_xref_table() + self.parse_objects() + self.parse_trailer() + + def clean_and_read_pdf(self): + f_input = open(self.filename, "rb") + pdf_header = f_input.read(8) + f_input.seek(0) + filename_output = tempfile.mktemp() + logging.info("Use " + filename_output + " for normalisation output") + f_ouput = open(filename_output, "wb") + writer = PdfFileWriter() + reader = PdfFileReader(f_input) + info = reader.getDocumentInfo() + if info.producer is not None: + writer.addMetadata({u'/Producer': info.producer}) + else: + writer.addMetadata({u'/Producer': u'TruePolyglot'}) + if info.creator is not None: + writer.addMetadata({u'/Creator': info.creator}) + else: + writer.addMetadata({u'/Creator': u'TruePolyglot'}) + writer.appendPagesFromReader(reader) + writer.setHeader(pdf_header) + writer.write(f_ouput) + f_input.close() + f_ouput.close() + f_norm = open(filename_output, "rb") + self.buffer = bytearray(f_norm.read()) + self.size = len(self.buffer) + f_norm.close() + + def check_pdf_header(self): + if self.buffer[0:5] == b"%PDF-": + pdf_version = self.buffer[5:8].decode("utf-8") + logging.info("PDF Header found: " + pdf_version) + else: + raise Exception("PDF Header not found") + + def parse_xref_offset(self): + r = re.compile(b'startxref\n([0-9]+)') + m = r.search(self.buffer) + if m is None: + raise Exception('Unable to find xref offset') + self.original_xref_offset = int(m.group(1)) + logging.info("Xref offset found at: " + hex(self.original_xref_offset)) + + def parse_xref_table(self): + xref_table = [] + r = re.compile(b'xref\n([0-9]+) ([0-9]+)') + offset = self.original_xref_offset + s = r.search(self.buffer[offset:offset + 32]) + nb_xtable_object = int(s.group(2)) + logging.info("Nb objects in Xref table: " + str(nb_xtable_object)) + xref_header_size = s.end() + r = re.compile(b'([0-9]+) ([0-9]+) ([f|n])') + x = 0 + for i in range(nb_xtable_object): + s = r.search( + self.buffer[self.original_xref_offset + xref_header_size + x:]) + if s is not None: + x = x + s.end() + xref_table.append((int(s.group(1)), + int(s.group(2)), + s.group(3))) + logging.debug("Xref table:") + for i in xref_table: + logging.debug(str(i[0]) + " " + + str(i[1]) + " " + + i[2].decode("utf-8")) + + def parse_objects(self): + r_begin = re.compile(b'([0-9]+) ([0-9]+) obj\n') + r_end = re.compile(b'\nendobj\n') + + offset_buffer = 0 + obj = () + while offset_buffer < self.size: + m_begin = r_begin.match( + self.buffer[offset_buffer:offset_buffer + 32]) + obj_nb_index = 0 + obj_nb_offset = 0 + obj_offset_start = 0 + obj_offset_end = 0 + if m_begin is not None: + if self.original_first_obj_offset == 0: + self.original_first_obj_offset = (offset_buffer + + m_begin.start()) + obj_nb_index = int(m_begin.group(1)) + obj_nb_offset = int(m_begin.group(2)) + 
obj_data_start = m_begin.end() + obj_offset_start = offset_buffer + m_begin.start() + while offset_buffer < self.size: + m_end = r_end.match( + self.buffer[offset_buffer:offset_buffer + 8]) + if m_end is not None: + obj_offset_end = offset_buffer + m_end.end() - 2 + break + else: + offset_buffer = offset_buffer + 1 + else: + offset_buffer = offset_buffer + 1 + + if (obj_offset_start != 0 and + obj_offset_end != 0): + a = obj_offset_start + obj_data_start + b = obj_offset_end - 6 + obj = (obj_nb_index, obj_nb_offset, + self.buffer[a:b]) + logging.debug("Objects: (" + str(obj_nb_index) + + ", " + str(obj_nb_offset) + + ", " + hex(obj_offset_start) + + ", " + hex(obj_offset_end)) + self.objects.append(obj) + + def parse_trailer(self): + r_begin = re.compile(b'trailer\n') + s_begin = r_begin.search(self.buffer[self.original_xref_offset:]) + start = self.original_xref_offset + s_begin.start() + logging.info("Trailer found at:" + hex(start)) + + r_root = re.compile(b'/Root ([0-9]+) ([0-9]+) R') + s_root = r_root.search(self.buffer[self.original_xref_offset:]) + if s_root is None: + raise Exception('Root not found') + else: + self.trailer["Root"] = (int(s_root.group(1)), int(s_root.group(2))) + + r_info = re.compile(b'/Info ([0-9]+) ([0-9]+) R') + s_info = r_info.search(self.buffer[self.original_xref_offset:]) + if s_info is not None: + self.trailer["Info"] = (int(s_info.group(1)), int(s_info.group(2))) + + def get_file_header(self): + return self.buffer[:self.original_first_obj_offset] + + def get_xref_table(self): + offset_xref = 0 + buf = (b'xref\n' + + str(offset_xref).encode('utf-8') + b' ' + + str(len(self.objects) + 1).encode('utf-8') + b'\n' + + str(0).zfill(10).encode('utf-8') + b' ' + + str(65535).zfill(5).encode('utf-8') + b' f \n') + + for i in range(len(self.objects)): + obj_start = self.get_object_offset(i) + logging.info("Obj %d at %d" % (self.objects[i][0], obj_start)) + buf = (buf + + (str(obj_start).zfill(10)).encode('utf-8') + b' ' + + str(0).zfill(5).encode('utf-8') + b' ' + + b'n' + b' \n') + return buf + + def get_trailer(self): + trailer_data = (b"trailer\n<<\n/Size " + + str(len(self.objects) + 1).encode("utf-8") + + b"\n/Root " + + str(self.trailer["Root"][0]).encode("utf-8") + + b" " + + str(self.trailer["Root"][1]).encode("utf-8") + + b" R\n") + if "Info" in self.trailer: + trailer_data = (trailer_data + + b"/Info " + + str(self.trailer["Info"][0]).encode("utf-8") + + b" " + + str(self.trailer["Info"][1]).encode("utf-8") + + b" R\n") + trailer_data = trailer_data + b">>" + return trailer_data + + def get_xref_offset(self): + return self.get_end_of_last_object() + 1 + + def get_eof(self): + s = (b'startxref\n' + + str(self.get_xref_offset()).encode("utf-8") + + b'\n%%EOF\n') + return s + + def build_object(self, obj): + buf = (str(obj[0]).encode("utf-8") + + b' ' + + str(obj[1]).encode("utf-8") + + b' obj\n' + + obj[2] + + b'\nendobj') + return buf + + def get_build_buffer(self): + b_buffer = bytearray() + b_buffer = b_buffer + self.get_file_header() + for obj in self.objects: + b_buffer = b_buffer + self.build_object(obj) + b'\n' + b_buffer = b_buffer + self.get_xref_table() + b_buffer = b_buffer + self.get_trailer() + b'\n' + b_buffer = b_buffer + self.get_eof() + return b_buffer + + def get_obj(self, nb): + for obj in self.objects: + if obj[0] == nb: + return obj + + def get_end_of_last_object(self): + offset = self.get_last_object_offset() + offset = offset + len(self.build_object(self.objects[-1])) + return offset + + def generate_stream_obj_data(self, data): + buf 
= (b'<<\n/Filter /FlateDecode\n/Length ' +
+               str(len(data)).encode("utf-8") +
+               b'\n>>\nstream\n' +
+               data +
+               b'\nendstream')
+        return buf
+
+    def insert_new_obj_stream_at(self, position, stream_data):
+        '''
+        Insert a new stream object at the given position
+        '''
+        logging.info("Insert obj at %d" % position)
+        obj_nb = position
+        obj_off = 0
+        data = self.generate_stream_obj_data(stream_data)
+        obj = (obj_nb, obj_off, data)
+
+        obj_data = self.build_object(obj)
+        full_obj_size = len(obj_data)
+        logging.info("New object full size is: " + str(full_obj_size))
+
+        obj = (obj_nb, obj_off, data)
+        self.objects.insert(position, obj)
+
+        self.reorder_objects()
+        self.fix_trailer_ref()
+
+    def get_first_stream_offset(self):
+        offset = self.file_offset + len(self.get_file_header())
+        r = re.compile(b'stream\n')
+        m = r.search(self.objects[0][2])
+        offset = offset + len(b"1 0 obj\n") + m.end()
+        return offset
+
+    def get_last_stream_offset(self):
+        offset = self.file_offset + self.get_last_object_offset()
+        r = re.compile(b'stream\n')
+        m = r.search(self.build_object(self.objects[-1]))
+        return offset + m.end()
+
+    def get_object_offset(self, index):
+        offset = self.file_offset + len(self.get_file_header())
+        for obj in self.objects[:index]:
+            offset = offset + len(self.build_object(obj)) + 1
+        return offset
+
+    def get_last_object_offset(self):
+        offset = self.get_object_offset(len(self.objects) - 1)
+        return offset
+
+    def insert_new_obj_stream_at_start(self, data):
+        return self.insert_new_obj_stream_at(0, data)
+
+    def insert_new_obj_stream_at_end(self, data):
+        return self.insert_new_obj_stream_at(len(self.objects) + 1,
+                                             data)
+
+    def generate_translation_table(self):
+        for i in range(len(self.objects)):
+            self.translation_table[(self.objects[i][0],
+                                    self.objects[i][1])] = i + 1
+        logging.info(self.translation_table)
+
+    def replace_ref(self, ibuffer):
+        '''
+        Example:
+        in: AZERTY 6 0 R -- BGT 88 0 R HYT
+        out: AZERTY 77 0 R -- BGT 9 0 R HYT
+        '''
+        index = 0
+        obuffer = bytearray()
+        while True:
+            r = re.compile(b'([0-9]+) ([0-9]+) R')
+            s = r.search(ibuffer[index:])
+            if s is None:
+                obuffer = obuffer + ibuffer[index:]
+                break
+            o_old = int(s.group(1))
+            p_old = int(s.group(2))
+            o_new = self.translation_table[(o_old, p_old)]
+            p_new = p_old
+
+            newref = (str(o_new).encode("utf-8") +
+                      b" " +
+                      str(p_new).encode("utf-8") +
+                      b" R")
+
+            nbuffer = ibuffer[index:index + s.start()] + newref
+            obuffer = obuffer + nbuffer
+            index = index + s.end()
+        return obuffer
+
+    def reorder_objects(self):
+        self.generate_translation_table()
+        offset_obj = len(self.get_file_header())
+        for i in range(len(self.objects)):
+            buf = self.objects[i][2]
+            new_buf = self.replace_ref(buf)
+            obj_nb = self.objects[i][0]
+            new_obj_nb = self.translation_table[(obj_nb, 0)]
+            new_obj_start = offset_obj
+            size_obj = len(self.build_object((new_obj_nb,
+                                              0,
+                                              new_buf)))
+            new_obj_end = new_obj_start + size_obj
+
+            offset_obj = new_obj_end + 1
+            obj = (new_obj_nb,
+                   0,
+                   new_buf)
+            self.objects[i] = obj
+
+    def fix_trailer_ref(self):
+        new_obj_nb = self.translation_table[self.trailer["Root"]]
+        self.trailer["Root"] = (new_obj_nb, 0)
+
+        if "Info" in self.trailer:
+            new_obj_nb = self.translation_table[self.trailer["Info"]]
+            self.trailer["Info"] = (new_obj_nb, 0)
diff --git a/PolyglotFile/__init__.py b/PolyglotFile/__init__.py
new file mode 100644
index 0000000..4261a1a
--- /dev/null
+++ b/PolyglotFile/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from .polyglotpdfzip import PolyglotPdfZip
+from .polyglotzippdf import PolyglotZipPdf
+from .polyglotszippdf import PolyglotSZipPdf
+
diff --git a/PolyglotFile/polyglotpdfzip.py b/PolyglotFile/polyglotpdfzip.py
new file mode 100644
index 0000000..81c3f06
--- /dev/null
+++ b/PolyglotFile/polyglotpdfzip.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+import logging
+
+'''
+ |-------------------------------| -
+ |--------- PDF Header ----------K1 | J1
+ |-------------------------------| -
+ |----- PDF OBJ 1 = ZIP Data ----K2 |
+ |-------------------------------| -
+ |---- Original PDF Objects -----K3 | J2
+ |-------------------------------| -
+ |--- Last OBJ = End Zip Data ---K4 |
+ |-------------------------------| |
+ |---------- Xref Table ---------| |
+ |-------------------------------K5 |
+ |----------- Trailer -----------| |
+ |-------------------------------| |
+'''
+
+
+class PolyglotPdfZip():
+    from PdfFileTransformer import Pdf
+    from ZipFileTransformer import Zip
+
+    def __init__(self, Pdf, Zip):
+        self.buffer = bytearray()
+        self.pdf = Pdf
+        self.zip = Zip
+
+    def generate(self):
+        k2_stream = self.zip.buffer[:self.zip.end_of_data]
+        size_k2_stream = len(k2_stream)
+        self.pdf.insert_new_obj_stream_at_start(k2_stream)
+        offset_k2_stream = self.pdf.get_first_stream_offset()
+
+        k4_stream = self.zip.buffer[self.zip.central_dir_file_header:]
+        size_k4_stream = len(k4_stream)
+        self.pdf.insert_new_obj_stream_at_end(k4_stream)
+        offset_k4_stream = self.pdf.get_last_stream_offset()
+
+        pdf_buffer = self.pdf.get_build_buffer()
+
+        j1 = pdf_buffer[0:offset_k2_stream]
+        j2 = pdf_buffer[offset_k2_stream + size_k2_stream:offset_k4_stream]
+        self.zip.add_data_to_file(j1, j2, True)
+
+        k5 = pdf_buffer[offset_k4_stream + size_k4_stream:]
+        self.buffer = self.zip.buffer + k5
+
+    def write(self, filename):
+        fd = open(filename, "wb")
+        fd.write(self.buffer)
+        fd.close()
diff --git a/PolyglotFile/polyglotszippdf.py b/PolyglotFile/polyglotszippdf.py
new file mode 100644
index 0000000..0796946
--- /dev/null
+++ b/PolyglotFile/polyglotszippdf.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+
+from .polyglotpdfzip import PolyglotPdfZip
+import logging
+import tempfile
+from ZipFileTransformer import ZipFile
+from ZipFileTransformer import Zip
+from PdfFileTransformer import Pdf
+
+'''
+ |-----------------------------------| -
+ |--------- ZIP Data[0] = -----------| |
+ |- PDF Header + PDF Obj[0] Header --| |
+ |-----------------------------------| | K2
+ |------- PDF Obj[0] stream = ------| |
+ |--------- ZIP Data LF [1:] --------| |
+ |-----------------------------------| -
+ |------ Original PDF Objects -------| |
+ |-----------------------------------| |
+ |------------ Xref Table -----------| |
+ |-----------------------------------| | J2
+ |------------- Trailer -------------| |
+ |-----------------------------------| -
+ |---------- End Zip Data -----------|
+ |-----------------------------------|
+'''
+
+
+class PolyglotSZipPdf(PolyglotPdfZip):
+
+    def __init__(self, Pdf, Zip):
+        super().__init__(Pdf, Zip)
+
+    def get_rebuild_zip_first_part_size(self):
+
+        zo_path = tempfile.mkstemp()[1]
+        logging.info("use tmp file zip: " + zo_path)
+        zo = ZipFile(zo_path, 'a')
+        zi = ZipFile(self.zip.filename, 'r')
+        for zipinfo in zi.infolist():
+            zo.writestr(zipinfo, zi.read(zipinfo))
+        zi.close()
+        zo.close()
+
+        rebuild_zip = Zip(zo_path)
+
+        p = rebuild_zip.end_of_data
+        k2_stream = rebuild_zip.buffer[:p]
+
+        size_k2_stream = len(k2_stream)
+
+        return size_k2_stream
+
+    def get_pdf_header(self):
+        return self.pdf.get_file_header()
+
+    def generate_zip_with_pdf_part(self, filename, pdf_data):
+
+        zo = ZipFile(filename, 'a')
+        zi = ZipFile(self.zip.filename, 'r')
+        zo.writestr(' ', pdf_data, 0)
+        for zipinfo in zi.infolist():
+            zo.writestr(zipinfo, zi.read(zipinfo))
+        zi.close()
+        zo.close()
+
+    def get_rebuild_pdf(self, zo_path, offset):
+        '''
+        Generate polyglot with final zip.
+        '''
+        new_zip = Zip(zo_path)
+        new_pdf = Pdf(self.pdf.filename)
+
+        p1 = new_zip.end_of_first_local_file_header
+        p2 = new_zip.end_of_data
+        k2_stream = new_zip.buffer[p1:p2]
+
+        size_k2_stream = len(k2_stream)
+        new_pdf.insert_new_obj_stream_at_start(k2_stream)
+        k2_stream_offset = new_pdf.get_first_stream_offset()
+
+        new_pdf.file_offset = offset
+        pdf_buffer = new_pdf.get_build_buffer()
+        j2 = pdf_buffer[k2_stream_offset + size_k2_stream:]
+        new_zip.add_data_to_file(b'', j2, True)
+
+        return new_zip.buffer
+
+    def get_pdf_offset(self, zipfile):
+
+        f = open(zipfile, "rb")
+        data = f.read()
+        return data.find(b"%PDF")
+
+    def generate(self):
+
+        zip_stream_size = self.get_rebuild_zip_first_part_size()
+        pdf_header = self.get_pdf_header()
+        pdf_header = (pdf_header +
+                      b'1 0 obj\n<<\n/Filter /FlateDecode\n/Length ' +
+                      str(zip_stream_size).encode("utf-8") +
+                      b'\n>>\nstream\n')
+
+        filename = tempfile.mkstemp()[1]
+        logging.info("use tmp file for new zip: " + filename)
+        self.generate_zip_with_pdf_part(filename, pdf_header)
+
+        pdf_offset = self.get_pdf_offset(filename)
+
+        self.buffer = self.get_rebuild_pdf(filename, pdf_offset)
diff --git a/PolyglotFile/polyglotzippdf.py b/PolyglotFile/polyglotzippdf.py
new file mode 100644
index 0000000..2493663
--- /dev/null
+++ b/PolyglotFile/polyglotzippdf.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+
+from .polyglotpdfzip import PolyglotPdfZip
+
+
+'''
+ |-------------------------------| -
+ |--------- PDF Header ----------K1 | J1
+ |-------------------------------| -
+ |----- PDF OBJ 1 = ZIP Data ----K2 |
+ |-------------------------------| -
+ |---- Original PDF Objects -----K3 |
+ |-------------------------------| |
+ |---------- Xref Table ---------| |
+ |-------------------------------K4 | J2
+ |----------- Trailer -----------| |
+ |-------------------------------| -
+ |-------- End Zip Data ---------| |
+ |-------------------------------| |
+'''
+
+
+class PolyglotZipPdf(PolyglotPdfZip):
+
+    def generate(self):
+        k2_stream = self.zip.buffer[:self.zip.end_of_data]
+        size_k2_stream = len(k2_stream)
+        self.pdf.insert_new_obj_stream_at_start(k2_stream)
+        offset_k2_stream = self.pdf.get_first_stream_offset()
+
+        pdf_buffer = self.pdf.get_build_buffer()
+
+        j1 = pdf_buffer[0:offset_k2_stream]
+        j2 = pdf_buffer[offset_k2_stream + size_k2_stream:]
+
+        self.zip.add_data_to_file(j1, j2, True)
+        self.buffer = self.zip.buffer
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..97224cf
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# TruePolyglot
+
+See the website at https://truepolyglot.hackade.org
diff --git a/ZipFileTransformer/__init__.py b/ZipFileTransformer/__init__.py
new file mode 100644
index 0000000..0b53e27
--- /dev/null
+++ b/ZipFileTransformer/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+
+from .zip import Zip
+from .zipfile import *
diff --git a/ZipFileTransformer/zip.py b/ZipFileTransformer/zip.py
new file mode 100644
index 0000000..91ff4c5
--- /dev/null
+++ b/ZipFileTransformer/zip.py
@@ -0,0 +1,227 @@
+# -*- coding: utf-8 -*-
+
+import logging
+import re
+
+
+class Zip:
+
+    def __init__(self, filename):
+        self.filename = filename
+        self.buffer = bytearray()
+        self.size = 
0 + self.end_central_dir = 0 + self.first_local_file_header = 0 + self.offset_local_file = [] + self.offset_central_directory = [] + self.end_of_data = 0 + self.end_of_first_local_file_header = 0 + + self.read() + self.check_header() + self.call_all_parsers() + self.check_central_directory() + self.parse_central_directories() + self.parse_local_file_headers() + + def call_all_parsers(self): + self.parse_offset_end_central_dir() + self.parse_nb_of_disk() + self.parse_start_disk() + self.parse_nb_of_central_dir() + self.parse_nb_total_of_central_dir() + self.parse_size_central_dir() + self.parse_central_dir_file_header() + self.parse_comment_length() + + def read(self): + with open(self.filename, 'rb') as fd: + self.buffer = bytearray(fd.read()) + self.size = len(self.buffer) + logging.info("read " + str(self.size) + " bytes from Zip file") + + def check_header(self): + if self.buffer[0:4] != b"PK\x03\x04": + raise Exception("Zip header not found") + + def parse_offset_end_central_dir(self): + r = re.compile(b'\x06\x05KP') + s = r.search(self.buffer[::-1]) + if s is None: + raise Exception("Unable to find end of central directory") + self.end_central_dir = self.size - s.end() + logging.info("Offset end of central directory: " + + hex(self.end_central_dir)) + + def parse_nb_of_disk(self): + self.nb_of_disk = int.from_bytes( + self.buffer[self.end_central_dir + 4:self.end_central_dir + 6], + "little") + logging.debug("Nb of disk: " + str(self.nb_of_disk)) + + def parse_start_disk(self): + self.start_disk = int.from_bytes( + self.buffer[self.end_central_dir + 6:self.end_central_dir + 8], + "little") + logging.debug("Start disk: " + str(self.start_disk)) + + def parse_nb_of_central_dir(self): + self.nb_of_central_dir = int.from_bytes( + self.buffer[self.end_central_dir + 8:self.end_central_dir + 10], + "little") + logging.info("Nb of central directory record: " + + str(self.nb_of_central_dir)) + + def parse_nb_total_of_central_dir(self): + self.nb_total_of_central_dir = int.from_bytes( + self.buffer[self.end_central_dir + 10:self.end_central_dir + 12], + "little") + logging.info("Nb of total central directory record: " + + str(self.nb_total_of_central_dir)) + + def parse_size_central_dir(self): + self.size_central_dir = int.from_bytes( + self.buffer[self.end_central_dir + 12:self.end_central_dir + 14], + "little") + logging.info("Size of central directory: " + + str(self.size_central_dir)) + + def parse_central_dir_file_header(self): + self.central_dir_file_header = int.from_bytes( + self.buffer[self.end_central_dir + 16:self.end_central_dir + 20], + "little") + logging.info("Central directory file header: " + + hex(self.central_dir_file_header)) + + def parse_comment_length(self): + self.comment_length = int.from_bytes( + self.buffer[self.end_central_dir + 20:self.end_central_dir + 22], + "little") + logging.info("Comment length: " + + str(self.comment_length)) + + def check_central_directory(self): + offset = self.central_dir_file_header + if (self.buffer[offset:offset + 4] != + b'PK\x01\x02'): + raise Exception("Unable to find central directory") + logging.info("Found central directory") + + def parse_central_directories(self): + if (self.buffer[self.central_dir_file_header: + self.central_dir_file_header + 4] != + b'PK\x01\x02'): + raise Exception("Unable to find first central directory") + logging.info("Found first central directory") + + i = 0 + size = 0 + offset = self.central_dir_file_header + + while (self.buffer[size + offset: + size + offset + 4] == + b'PK\x01\x02'): + + 
logging.info("Parse central directory n°" + str(i)) + logging.info("Offset: " + hex(offset + size)) + self.offset_central_directory.append(offset + size) + filename_length = int.from_bytes( + self.buffer[size + offset + 28:size + offset + 30], + "little") + logging.info("filename length:" + str(filename_length)) + extra_field_length = int.from_bytes( + self.buffer[size + offset + 30:size + offset + 32], + "little") + logging.info("extra field length:" + str(extra_field_length)) + comment_length = int.from_bytes( + self.buffer[size + offset + 32:size + offset + 34], + "little") + logging.info("comment length:" + str(comment_length)) + local_file_header = int.from_bytes( + self.buffer[size + offset + 42:size + offset + 46], + "little") + if i == 0: + self.first_local_file_header = local_file_header + logging.info("local file header:" + hex(local_file_header)) + + i = i + 1 + size = (size + filename_length + + extra_field_length + comment_length + 46) + + logging.debug("parse header at:" + hex(offset + size)) + + def parse_local_file_headers(self): + size = 0 + offset = self.first_local_file_header + for i in range(self.nb_of_central_dir): + logging.info("Parse local file n°" + str(i)) + compressed_data_lenght = int.from_bytes( + self.buffer[size + offset + 18:size + offset + 22], + "little") + logging.info("compressed data length:" + + str(compressed_data_lenght)) + filename_length = int.from_bytes( + self.buffer[size + offset + 26:size + offset + 28], + "little") + logging.info("filename length:" + str(filename_length)) + extra_field_length = int.from_bytes( + self.buffer[size + offset + 28:size + offset + 30], + "little") + logging.info("extra field length:" + str(extra_field_length)) + local_file_size = (compressed_data_lenght + + filename_length + extra_field_length + 30) + logging.info("local file length:" + hex(local_file_size)) + size = size + local_file_size + logging.debug("parse header at:" + hex(offset + size)) + self.offset_local_file.append(offset + size) + self.end_of_data = offset + size + if i == 0: + self.end_of_first_local_file_header = self.end_of_data + + def add_data_to_file(self, data_before_local, data_after_local, + write_buffer=False): + logging.info("Add data before local lenght:" + + str(len(data_before_local))) + new_buffer = self.buffer + for i in self.offset_central_directory: + logging.info("parse central directory at: " + hex(i)) + local_file_header = int.from_bytes( + self.buffer[i + 42:i + 46], + "little") + logging.info("old local file header: " + hex(local_file_header)) + local_file_header = local_file_header + len(data_before_local) + logging.info("new local file header: " + hex(local_file_header)) + bytes_local_file_header = local_file_header.to_bytes(4, "little") + logging.info("change value at:" + hex(i + 42)) + new_buffer[i + 42:i + 46] = bytes_local_file_header + + logging.info("old central directory header: " + + hex(self.central_dir_file_header)) + new_central_dir_file_header = (self.central_dir_file_header + + len(data_after_local) + + len(data_before_local)) + logging.info("new central directory header: " + + hex(new_central_dir_file_header)) + bytes_offset = new_central_dir_file_header.to_bytes(4, "little") + new_buffer[self.end_central_dir + 16: + self.end_central_dir + 20] = bytes_offset + self.buffer = new_buffer + + if write_buffer: + new_buffer = (data_before_local + + new_buffer[:self.end_of_data] + + data_after_local + + new_buffer[self.central_dir_file_header:]) + self.buffer = new_buffer + + def get_local_file_data(self): + return 
self.buffer[:self.end_of_data] + + def get_data_after_central_directory(self): + return self.buffer[self.central_dir_file_header:] + + def get_first_part_length(self): + return len(self.get_local_file_data()) + + def get_second_part_length(self): + return len(self.get_data_after_central_directory()) diff --git a/ZipFileTransformer/zipfile.py b/ZipFileTransformer/zipfile.py new file mode 100644 index 0000000..2757ce9 --- /dev/null +++ b/ZipFileTransformer/zipfile.py @@ -0,0 +1,2133 @@ +""" +Read and write ZIP files. + +XXX references to utf-8 need further investigation. +""" +import io +import os +import importlib.util +import sys +import time +import stat +import shutil +import struct +import binascii +import threading + +try: + import zlib # We may need its compression method + crc32 = zlib.crc32 +except ImportError: + zlib = None + crc32 = binascii.crc32 + +try: + import bz2 # We may need its compression method +except ImportError: + bz2 = None + +try: + import lzma # We may need its compression method +except ImportError: + lzma = None + +__all__ = ["BadZipFile", "BadZipfile", "error", + "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA", + "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile"] + +class BadZipFile(Exception): + pass + + +class LargeZipFile(Exception): + """ + Raised when writing a zipfile, the zipfile requires ZIP64 extensions + and those extensions are disabled. + """ + +error = BadZipfile = BadZipFile # Pre-3.2 compatibility names + + +ZIP64_LIMIT = (1 << 31) - 1 +ZIP_FILECOUNT_LIMIT = (1 << 16) - 1 +ZIP_MAX_COMMENT = (1 << 16) - 1 + +# constants for Zip file compression methods +ZIP_STORED = 0 +ZIP_DEFLATED = 8 +ZIP_BZIP2 = 12 +ZIP_LZMA = 14 +# Other ZIP compression methods not supported + +DEFAULT_VERSION = 20 +ZIP64_VERSION = 45 +BZIP2_VERSION = 46 +LZMA_VERSION = 63 +# we recognize (but not necessarily support) all features up to that version +MAX_EXTRACT_VERSION = 63 + +# Below are some formats and associated data for reading/writing headers using +# the struct module. 
The names and structures of headers/records are those used +# in the PKWARE description of the ZIP file format: +# http://www.pkware.com/documents/casestudies/APPNOTE.TXT +# (URL valid as of January 2008) + +# The "end of central directory" structure, magic number, size, and indices +# (section V.I in the format document) +structEndArchive = b"<4s4H2LH" +stringEndArchive = b"PK\005\006" +sizeEndCentDir = struct.calcsize(structEndArchive) + +_ECD_SIGNATURE = 0 +_ECD_DISK_NUMBER = 1 +_ECD_DISK_START = 2 +_ECD_ENTRIES_THIS_DISK = 3 +_ECD_ENTRIES_TOTAL = 4 +_ECD_SIZE = 5 +_ECD_OFFSET = 6 +_ECD_COMMENT_SIZE = 7 +# These last two indices are not part of the structure as defined in the +# spec, but they are used internally by this module as a convenience +_ECD_COMMENT = 8 +_ECD_LOCATION = 9 + +# The "central directory" structure, magic number, size, and indices +# of entries in the structure (section V.F in the format document) +structCentralDir = "<4s4B4HL2L5H2L" +stringCentralDir = b"PK\001\002" +sizeCentralDir = struct.calcsize(structCentralDir) + +# indexes of entries in the central directory structure +_CD_SIGNATURE = 0 +_CD_CREATE_VERSION = 1 +_CD_CREATE_SYSTEM = 2 +_CD_EXTRACT_VERSION = 3 +_CD_EXTRACT_SYSTEM = 4 +_CD_FLAG_BITS = 5 +_CD_COMPRESS_TYPE = 6 +_CD_TIME = 7 +_CD_DATE = 8 +_CD_CRC = 9 +_CD_COMPRESSED_SIZE = 10 +_CD_UNCOMPRESSED_SIZE = 11 +_CD_FILENAME_LENGTH = 12 +_CD_EXTRA_FIELD_LENGTH = 13 +_CD_COMMENT_LENGTH = 14 +_CD_DISK_NUMBER_START = 15 +_CD_INTERNAL_FILE_ATTRIBUTES = 16 +_CD_EXTERNAL_FILE_ATTRIBUTES = 17 +_CD_LOCAL_HEADER_OFFSET = 18 + +# The "local file header" structure, magic number, size, and indices +# (section V.A in the format document) +structFileHeader = "<4s2B4HL2L2H" +stringFileHeader = b"PK\003\004" +sizeFileHeader = struct.calcsize(structFileHeader) + +_FH_SIGNATURE = 0 +_FH_EXTRACT_VERSION = 1 +_FH_EXTRACT_SYSTEM = 2 +_FH_GENERAL_PURPOSE_FLAG_BITS = 3 +_FH_COMPRESSION_METHOD = 4 +_FH_LAST_MOD_TIME = 5 +_FH_LAST_MOD_DATE = 6 +_FH_CRC = 7 +_FH_COMPRESSED_SIZE = 8 +_FH_UNCOMPRESSED_SIZE = 9 +_FH_FILENAME_LENGTH = 10 +_FH_EXTRA_FIELD_LENGTH = 11 + +# The "Zip64 end of central directory locator" structure, magic number, and size +structEndArchive64Locator = "<4sLQL" +stringEndArchive64Locator = b"PK\x06\x07" +sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator) + +# The "Zip64 end of central directory" record, magic number, size, and indices +# (section V.G in the format document) +structEndArchive64 = "<4sQ2H2L4Q" +stringEndArchive64 = b"PK\x06\x06" +sizeEndCentDir64 = struct.calcsize(structEndArchive64) + +_CD64_SIGNATURE = 0 +_CD64_DIRECTORY_RECSIZE = 1 +_CD64_CREATE_VERSION = 2 +_CD64_EXTRACT_VERSION = 3 +_CD64_DISK_NUMBER = 4 +_CD64_DISK_NUMBER_START = 5 +_CD64_NUMBER_ENTRIES_THIS_DISK = 6 +_CD64_NUMBER_ENTRIES_TOTAL = 7 +_CD64_DIRECTORY_SIZE = 8 +_CD64_OFFSET_START_CENTDIR = 9 + +def _check_zipfile(fp): + try: + if _EndRecData(fp): + return True # file has correct magic number + except OSError: + pass + return False + +def is_zipfile(filename): + """Quickly see if a file is a ZIP file by checking the magic number. + + The filename argument may be a file or file-like object too. 
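+
+    Usage (illustrative; ``archive.zip`` is a placeholder path)::
+
+        if is_zipfile("archive.zip"):
+            print("looks like a ZIP archive")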
+ """ + result = False + try: + if hasattr(filename, "read"): + result = _check_zipfile(fp=filename) + else: + with open(filename, "rb") as fp: + result = _check_zipfile(fp) + except OSError: + pass + return result + +def _EndRecData64(fpin, offset, endrec): + """ + Read the ZIP64 end-of-archive records and use that to update endrec + """ + try: + fpin.seek(offset - sizeEndCentDir64Locator, 2) + except OSError: + # If the seek fails, the file is not large enough to contain a ZIP64 + # end-of-archive record, so just return the end record we were given. + return endrec + + data = fpin.read(sizeEndCentDir64Locator) + if len(data) != sizeEndCentDir64Locator: + return endrec + sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) + if sig != stringEndArchive64Locator: + return endrec + + if diskno != 0 or disks != 1: + raise BadZipFile("zipfiles that span multiple disks are not supported") + + # Assume no 'zip64 extensible data' + fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) + data = fpin.read(sizeEndCentDir64) + if len(data) != sizeEndCentDir64: + return endrec + sig, sz, create_version, read_version, disk_num, disk_dir, \ + dircount, dircount2, dirsize, diroffset = \ + struct.unpack(structEndArchive64, data) + if sig != stringEndArchive64: + return endrec + + # Update the original endrec using data from the ZIP64 record + endrec[_ECD_SIGNATURE] = sig + endrec[_ECD_DISK_NUMBER] = disk_num + endrec[_ECD_DISK_START] = disk_dir + endrec[_ECD_ENTRIES_THIS_DISK] = dircount + endrec[_ECD_ENTRIES_TOTAL] = dircount2 + endrec[_ECD_SIZE] = dirsize + endrec[_ECD_OFFSET] = diroffset + return endrec + + +def _EndRecData(fpin): + """Return data from the "End of Central Directory" record, or None. + + The data is a list of the nine items in the ZIP "End of central dir" + record followed by a tenth item, the file seek offset of this record.""" + + # Determine file size + fpin.seek(0, 2) + filesize = fpin.tell() + + # Check to see if this is ZIP file with no archive comment (the + # "end of central directory" structure should be the last item in the + # file if this is the case). + try: + fpin.seek(-sizeEndCentDir, 2) + except OSError: + return None + data = fpin.read() + if (len(data) == sizeEndCentDir and + data[0:4] == stringEndArchive and + data[-2:] == b"\000\000"): + # the signature is correct and there's no comment, unpack structure + endrec = struct.unpack(structEndArchive, data) + endrec=list(endrec) + + # Append a blank comment and record start offset + endrec.append(b"") + endrec.append(filesize - sizeEndCentDir) + + # Try to read the "Zip64 end of central directory" structure + return _EndRecData64(fpin, -sizeEndCentDir, endrec) + + # Either this is not a ZIP file, or it is a ZIP file with an archive + # comment. Search the end of the file for the "end of central directory" + # record signature. The comment is the last item in the ZIP file and may be + # up to 64K long. It is assumed that the "end of central directory" magic + # number does not appear in the comment. + maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0) + fpin.seek(maxCommentStart, 0) + data = fpin.read() + start = data.rfind(stringEndArchive) + if start >= 0: + # found the magic number; attempt to unpack and interpret + recData = data[start:start+sizeEndCentDir] + if len(recData) != sizeEndCentDir: + # Zip file is corrupted. 
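+                # (the signature was found too close to the end of the file
+                # to hold the full 22-byte end-of-central-directory record)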
+ return None + endrec = list(struct.unpack(structEndArchive, recData)) + commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file + comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize] + endrec.append(comment) + endrec.append(maxCommentStart + start) + + # Try to read the "Zip64 end of central directory" structure + return _EndRecData64(fpin, maxCommentStart + start - filesize, + endrec) + + # Unable to find a valid end of central directory structure + return None + + +class ZipInfo (object): + """Class with attributes describing each file in the ZIP archive.""" + + __slots__ = ( + 'orig_filename', + 'filename', + 'date_time', + 'compress_type', + '_compresslevel', + 'comment', + 'extra', + 'create_system', + 'create_version', + 'extract_version', + 'reserved', + 'flag_bits', + 'volume', + 'internal_attr', + 'external_attr', + 'header_offset', + 'CRC', + 'compress_size', + 'file_size', + '_raw_time', + ) + + def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): + self.orig_filename = filename # Original file name in archive + + # Terminate the file name at the first null byte. Null bytes in file + # names are used as tricks by viruses in archives. + null_byte = filename.find(chr(0)) + if null_byte >= 0: + filename = filename[0:null_byte] + # This is used to ensure paths in generated ZIP files always use + # forward slashes as the directory separator, as required by the + # ZIP format specification. + if os.sep != "/" and os.sep in filename: + filename = filename.replace(os.sep, "/") + + self.filename = filename # Normalized file name + self.date_time = date_time # year, month, day, hour, min, sec + + if date_time[0] < 1980: + raise ValueError('ZIP does not support timestamps before 1980') + + # Standard values: + self.compress_type = ZIP_STORED # Type of compression for the file + self._compresslevel = None # Level for the compressor + self.comment = b"" # Comment for each file + self.extra = b"" # ZIP extra data + if sys.platform == 'win32': + self.create_system = 0 # System which created ZIP archive + else: + # Assume everything else is unix-y + self.create_system = 3 # System which created ZIP archive + self.create_version = DEFAULT_VERSION # Version which created ZIP archive + self.extract_version = DEFAULT_VERSION # Version needed to extract archive + self.reserved = 0 # Must be zero + self.flag_bits = 0 # ZIP flag bits + self.volume = 0 # Volume number of file header + self.internal_attr = 0 # Internal attributes + self.external_attr = 0 # External file attributes + # Other attributes are set by class ZipFile: + # header_offset Byte offset to the file header + # CRC CRC-32 of the uncompressed file + # compress_size Size of the compressed file + # file_size Size of the uncompressed file + + def __repr__(self): + result = ['<%s filename=%r' % (self.__class__.__name__, self.filename)] + if self.compress_type != ZIP_STORED: + result.append(' compress_type=%s' % + compressor_names.get(self.compress_type, + self.compress_type)) + hi = self.external_attr >> 16 + lo = self.external_attr & 0xFFFF + if hi: + result.append(' filemode=%r' % stat.filemode(hi)) + if lo: + result.append(' external_attr=%#x' % lo) + isdir = self.is_dir() + if not isdir or self.file_size: + result.append(' file_size=%r' % self.file_size) + if ((not isdir or self.compress_size) and + (self.compress_type != ZIP_STORED or + self.file_size != self.compress_size)): + result.append(' compress_size=%r' % self.compress_size) + result.append('>') + return ''.join(result) + + def 
FileHeader(self, zip64=None): + """Return the per-file header as a string.""" + dt = self.date_time + dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] + dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) + if self.flag_bits & 0x08: + # Set these to zero because we write them after the file data + CRC = compress_size = file_size = 0 + else: + CRC = self.CRC + compress_size = self.compress_size + file_size = self.file_size + + extra = self.extra + + min_version = 0 + if zip64 is None: + zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT + if zip64: + fmt = '<HHQQ' + extra = extra + struct.pack(fmt, + 1, struct.calcsize(fmt)-4, file_size, compress_size) + if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT: + if not zip64: + raise LargeZipFile("Filesize would require ZIP64 extensions") + # File is larger than what fits into a 4 byte integer, + # fall back to the ZIP64 extension + file_size = 0xffffffff + compress_size = 0xffffffff + min_version = ZIP64_VERSION + + if self.compress_type == ZIP_BZIP2: + min_version = max(BZIP2_VERSION, min_version) + elif self.compress_type == ZIP_LZMA: + min_version = max(LZMA_VERSION, min_version) + + self.extract_version = max(min_version, self.extract_version) + self.create_version = max(min_version, self.create_version) + filename, flag_bits = self._encodeFilenameFlags() + header = struct.pack(structFileHeader, stringFileHeader, + self.extract_version, self.reserved, flag_bits, + self.compress_type, dostime, dosdate, CRC, + compress_size, file_size, + len(filename), len(extra)) + return header + filename + extra + + def _encodeFilenameFlags(self): + try: + return self.filename.encode('ascii'), self.flag_bits + except UnicodeEncodeError: + return self.filename.encode('utf-8'), self.flag_bits | 0x800 + + def _decodeExtra(self): + # Try to decode the extra field. + extra = self.extra + unpack = struct.unpack + while len(extra) >= 4: + tp, ln = unpack('<HH', extra[:4]) + if ln+4 > len(extra): + raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln)) + if tp == 0x0001: + if ln >= 24: + counts = unpack('<QQQ', extra[4:28]) + elif ln == 16: + counts = unpack('<QQ', extra[4:20]) + elif ln == 8: + counts = unpack('<Q', extra[4:12]) + elif ln == 0: + counts = () + else: + raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln)) + + idx = 0 + + # ZIP64 extension (large files and/or large archives) + if self.file_size in (0xffffffffffffffff, 0xffffffff): + self.file_size = counts[idx] + idx += 1 + + if self.compress_size == 0xFFFFFFFF: + self.compress_size = counts[idx] + idx += 1 + + if self.header_offset == 0xffffffff: + old = self.header_offset + self.header_offset = counts[idx] + idx+=1 + + extra = extra[ln+4:] + + @classmethod + def from_file(cls, filename, arcname=None): + """Construct an appropriate ZipInfo for a file on the filesystem. + + filename should be the path to a file or directory on the filesystem. + + arcname is the name which it will have within the archive (by default, + this will be the same as filename, but without a drive letter and with + leading path separators removed). 
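+
+        For example (illustrative), ZipInfo.from_file("notes.txt") would
+        return an entry named "notes.txt" whose date_time mirrors the file's
+        mtime, with the Unix mode bits stored in the high word of
+        external_attr.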
+ """ + if isinstance(filename, os.PathLike): + filename = os.fspath(filename) + st = os.stat(filename) + isdir = stat.S_ISDIR(st.st_mode) + mtime = time.localtime(st.st_mtime) + date_time = mtime[0:6] + # Create ZipInfo instance to store file information + if arcname is None: + arcname = filename + arcname = os.path.normpath(os.path.splitdrive(arcname)[1]) + while arcname[0] in (os.sep, os.altsep): + arcname = arcname[1:] + if isdir: + arcname += '/' + zinfo = cls(arcname, date_time) + zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes + if isdir: + zinfo.file_size = 0 + zinfo.external_attr |= 0x10 # MS-DOS directory flag + else: + zinfo.file_size = st.st_size + + return zinfo + + def is_dir(self): + """Return True if this archive member is a directory.""" + return self.filename[-1] == '/' + + +# ZIP encryption uses the CRC32 one-byte primitive for scrambling some +# internal keys. We noticed that a direct implementation is faster than +# relying on binascii.crc32(). + +_crctable = None +def _gen_crc(crc): + for j in range(8): + if crc & 1: + crc = (crc >> 1) ^ 0xEDB88320 + else: + crc >>= 1 + return crc + +# ZIP supports a password-based form of encryption. Even though known +# plaintext attacks have been found against it, it is still useful +# to be able to get data out of such a file. +# +# Usage: +# zd = _ZipDecrypter(mypwd) +# plain_bytes = zd(cypher_bytes) + +def _ZipDecrypter(pwd): + key0 = 305419896 + key1 = 591751049 + key2 = 878082192 + + global _crctable + if _crctable is None: + _crctable = list(map(_gen_crc, range(256))) + crctable = _crctable + + def crc32(ch, crc): + """Compute the CRC32 primitive on one byte.""" + return (crc >> 8) ^ crctable[(crc ^ ch) & 0xFF] + + def update_keys(c): + nonlocal key0, key1, key2 + key0 = crc32(c, key0) + key1 = (key1 + (key0 & 0xFF)) & 0xFFFFFFFF + key1 = (key1 * 134775813 + 1) & 0xFFFFFFFF + key2 = crc32(key1 >> 24, key2) + + for p in pwd: + update_keys(p) + + def decrypter(data): + """Decrypt a bytes object.""" + result = bytearray() + append = result.append + for c in data: + k = key2 | 2 + c ^= ((k * (k^1)) >> 8) & 0xFF + update_keys(c) + append(c) + return bytes(result) + + return decrypter + + +class LZMACompressor: + + def __init__(self): + self._comp = None + + def _init(self): + props = lzma._encode_filter_properties({'id': lzma.FILTER_LZMA1}) + self._comp = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=[ + lzma._decode_filter_properties(lzma.FILTER_LZMA1, props) + ]) + return struct.pack('<BBH', 9, 4, len(props)) + props + + def compress(self, data): + if self._comp is None: + return self._init() + self._comp.compress(data) + return self._comp.compress(data) + + def flush(self): + if self._comp is None: + return self._init() + self._comp.flush() + return self._comp.flush() + + +class LZMADecompressor: + + def __init__(self): + self._decomp = None + self._unconsumed = b'' + self.eof = False + + def decompress(self, data): + if self._decomp is None: + self._unconsumed += data + if len(self._unconsumed) <= 4: + return b'' + psize, = struct.unpack('<H', self._unconsumed[2:4]) + if len(self._unconsumed) <= 4 + psize: + return b'' + + self._decomp = lzma.LZMADecompressor(lzma.FORMAT_RAW, filters=[ + lzma._decode_filter_properties(lzma.FILTER_LZMA1, + self._unconsumed[4:4 + psize]) + ]) + data = self._unconsumed[4 + psize:] + del self._unconsumed + + result = self._decomp.decompress(data) + self.eof = self._decomp.eof + return result + + +compressor_names = { + 0: 'store', + 1: 'shrink', + 2: 'reduce', + 3: 'reduce', + 
4: 'reduce', + 5: 'reduce', + 6: 'implode', + 7: 'tokenize', + 8: 'deflate', + 9: 'deflate64', + 10: 'implode', + 12: 'bzip2', + 14: 'lzma', + 18: 'terse', + 19: 'lz77', + 97: 'wavpack', + 98: 'ppmd', +} + +def _check_compression(compression): + if compression == ZIP_STORED: + pass + elif compression == ZIP_DEFLATED: + if not zlib: + raise RuntimeError( + "Compression requires the (missing) zlib module") + elif compression == ZIP_BZIP2: + if not bz2: + raise RuntimeError( + "Compression requires the (missing) bz2 module") + elif compression == ZIP_LZMA: + if not lzma: + raise RuntimeError( + "Compression requires the (missing) lzma module") + else: + raise NotImplementedError("That compression method is not supported") + + +def _get_compressor(compress_type, compresslevel=None): + if compress_type == ZIP_DEFLATED: + if compresslevel is not None: + return zlib.compressobj(compresslevel, zlib.DEFLATED, -15) + return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15) + elif compress_type == ZIP_BZIP2: + if compresslevel is not None: + return bz2.BZ2Compressor(compresslevel) + return bz2.BZ2Compressor() + # compresslevel is ignored for ZIP_LZMA + elif compress_type == ZIP_LZMA: + return LZMACompressor() + else: + return None + + +def _get_decompressor(compress_type): + if compress_type == ZIP_STORED: + return None + elif compress_type == ZIP_DEFLATED: + return zlib.decompressobj(-15) + elif compress_type == ZIP_BZIP2: + return bz2.BZ2Decompressor() + elif compress_type == ZIP_LZMA: + return LZMADecompressor() + else: + descr = compressor_names.get(compress_type) + if descr: + raise NotImplementedError("compression type %d (%s)" % (compress_type, descr)) + else: + raise NotImplementedError("compression type %d" % (compress_type,)) + + +class _SharedFile: + def __init__(self, file, pos, close, lock, writing): + self._file = file + self._pos = pos + self._close = close + self._lock = lock + self._writing = writing + self.seekable = file.seekable + self.tell = file.tell + + def seek(self, offset, whence=0): + with self._lock: + if self._writing(): + raise ValueError("Can't reposition in the ZIP file while " + "there is an open writing handle on it. " + "Close the writing handle before trying to read.") + self._file.seek(offset, whence) + self._pos = self._file.tell() + return self._pos + + def read(self, n=-1): + with self._lock: + if self._writing(): + raise ValueError("Can't read from the ZIP file while there " + "is an open writing handle on it. " + "Close the writing handle before trying to read.") + self._file.seek(self._pos) + data = self._file.read(n) + self._pos = self._file.tell() + return data + + def close(self): + if self._file is not None: + fileobj = self._file + self._file = None + self._close(fileobj) + +# Provide the tell method for unseekable stream +class _Tellable: + def __init__(self, fp): + self.fp = fp + self.offset = 0 + + def write(self, data): + n = self.fp.write(data) + self.offset += n + return n + + def tell(self): + return self.offset + + def flush(self): + self.fp.flush() + + def close(self): + self.fp.close() + + +class ZipExtFile(io.BufferedIOBase): + """File-like object for reading an archive member. + Is returned by ZipFile.open(). + """ + + # Max size supported by decompressor. + MAX_N = 1 << 31 - 1 + + # Read from compressed files in 4k blocks. 
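+    # (_read2 clamps every read to at least this many bytes, so heavily
+    # compressed members still make progress instead of issuing one tiny
+    # read() per call.)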
+ MIN_READ_SIZE = 4096 + + # Chunk size to read during seek + MAX_SEEK_READ = 1 << 24 + + def __init__(self, fileobj, mode, zipinfo, decrypter=None, + close_fileobj=False): + self._fileobj = fileobj + self._decrypter = decrypter + self._close_fileobj = close_fileobj + + self._compress_type = zipinfo.compress_type + self._compress_left = zipinfo.compress_size + self._left = zipinfo.file_size + + self._decompressor = _get_decompressor(self._compress_type) + + self._eof = False + self._readbuffer = b'' + self._offset = 0 + + self.newlines = None + + # Adjust read size for encrypted files since the first 12 bytes + # are for the encryption/password information. + if self._decrypter is not None: + self._compress_left -= 12 + + self.mode = mode + self.name = zipinfo.filename + + if hasattr(zipinfo, 'CRC'): + self._expected_crc = zipinfo.CRC + self._running_crc = crc32(b'') + else: + self._expected_crc = None + + self._seekable = False + try: + if fileobj.seekable(): + self._orig_compress_start = fileobj.tell() + self._orig_compress_size = zipinfo.compress_size + self._orig_file_size = zipinfo.file_size + self._orig_start_crc = self._running_crc + self._seekable = True + except AttributeError: + pass + + def __repr__(self): + result = ['<%s.%s' % (self.__class__.__module__, + self.__class__.__qualname__)] + if not self.closed: + result.append(' name=%r mode=%r' % (self.name, self.mode)) + if self._compress_type != ZIP_STORED: + result.append(' compress_type=%s' % + compressor_names.get(self._compress_type, + self._compress_type)) + else: + result.append(' [closed]') + result.append('>') + return ''.join(result) + + def readline(self, limit=-1): + """Read and return a line from the stream. + + If limit is specified, at most limit bytes will be read. + """ + + if limit < 0: + # Shortcut common case - newline found in buffer. + i = self._readbuffer.find(b'\n', self._offset) + 1 + if i > 0: + line = self._readbuffer[self._offset: i] + self._offset = i + return line + + return io.BufferedIOBase.readline(self, limit) + + def peek(self, n=1): + """Returns buffered bytes without advancing the position.""" + if n > len(self._readbuffer) - self._offset: + chunk = self.read(n) + if len(chunk) > self._offset: + self._readbuffer = chunk + self._readbuffer[self._offset:] + self._offset = 0 + else: + self._offset -= len(chunk) + + # Return up to 512 bytes to reduce allocation overhead for tight loops. + return self._readbuffer[self._offset: self._offset + 512] + + def readable(self): + return True + + def read(self, n=-1): + """Read and return up to n bytes. + If the argument is omitted, None, or negative, data is read and returned until EOF is reached.. + """ + if n is None or n < 0: + buf = self._readbuffer[self._offset:] + self._readbuffer = b'' + self._offset = 0 + while not self._eof: + buf += self._read1(self.MAX_N) + return buf + + end = n + self._offset + if end < len(self._readbuffer): + buf = self._readbuffer[self._offset:end] + self._offset = end + return buf + + n = end - len(self._readbuffer) + buf = self._readbuffer[self._offset:] + self._readbuffer = b'' + self._offset = 0 + while n > 0 and not self._eof: + data = self._read1(n) + if n < len(data): + self._readbuffer = data + self._offset = n + buf += data[:n] + break + buf += data + n -= len(data) + return buf + + def _update_crc(self, newdata): + # Update the CRC using the given data. 
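+        # (crc32 is incremental: passing the previous value as the second
+        # argument extends the running checksum, so the member can be
+        # verified chunk by chunk while it is being read.)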
+ if self._expected_crc is None: + # No need to compute the CRC if we don't have a reference value + return + self._running_crc = crc32(newdata, self._running_crc) + # Check the CRC if we're at the end of the file + if self._eof and self._running_crc != self._expected_crc: + raise BadZipFile("Bad CRC-32 for file %r" % self.name) + + def read1(self, n): + """Read up to n bytes with at most one read() system call.""" + + if n is None or n < 0: + buf = self._readbuffer[self._offset:] + self._readbuffer = b'' + self._offset = 0 + while not self._eof: + data = self._read1(self.MAX_N) + if data: + buf += data + break + return buf + + end = n + self._offset + if end < len(self._readbuffer): + buf = self._readbuffer[self._offset:end] + self._offset = end + return buf + + n = end - len(self._readbuffer) + buf = self._readbuffer[self._offset:] + self._readbuffer = b'' + self._offset = 0 + if n > 0: + while not self._eof: + data = self._read1(n) + if n < len(data): + self._readbuffer = data + self._offset = n + buf += data[:n] + break + if data: + buf += data + break + return buf + + def _read1(self, n): + # Read up to n compressed bytes with at most one read() system call, + # decrypt and decompress them. + if self._eof or n <= 0: + return b'' + + # Read from file. + if self._compress_type == ZIP_DEFLATED: + ## Handle unconsumed data. + data = self._decompressor.unconsumed_tail + if n > len(data): + data += self._read2(n - len(data)) + else: + data = self._read2(n) + + if self._compress_type == ZIP_STORED: + self._eof = self._compress_left <= 0 + elif self._compress_type == ZIP_DEFLATED: + n = max(n, self.MIN_READ_SIZE) + data = self._decompressor.decompress(data, n) + self._eof = (self._decompressor.eof or + self._compress_left <= 0 and + not self._decompressor.unconsumed_tail) + if self._eof: + data += self._decompressor.flush() + else: + data = self._decompressor.decompress(data) + self._eof = self._decompressor.eof or self._compress_left <= 0 + + data = data[:self._left] + self._left -= len(data) + if self._left <= 0: + self._eof = True + self._update_crc(data) + return data + + def _read2(self, n): + if self._compress_left <= 0: + return b'' + + n = max(n, self.MIN_READ_SIZE) + n = min(n, self._compress_left) + + data = self._fileobj.read(n) + self._compress_left -= len(data) + if not data: + raise EOFError + + if self._decrypter is not None: + data = self._decrypter(data) + return data + + def close(self): + try: + if self._close_fileobj: + self._fileobj.close() + finally: + super().close() + + def seekable(self): + return self._seekable + + def seek(self, offset, whence=0): + if not self._seekable: + raise io.UnsupportedOperation("underlying stream is not seekable") + curr_pos = self.tell() + if whence == 0: # Seek from start of file + new_pos = offset + elif whence == 1: # Seek from current position + new_pos = curr_pos + offset + elif whence == 2: # Seek from EOF + new_pos = self._orig_file_size + offset + else: + raise ValueError("whence must be os.SEEK_SET (0), " + "os.SEEK_CUR (1), or os.SEEK_END (2)") + + if new_pos > self._orig_file_size: + new_pos = self._orig_file_size + + if new_pos < 0: + new_pos = 0 + + read_offset = new_pos - curr_pos + buff_offset = read_offset + self._offset + + if buff_offset >= 0 and buff_offset < len(self._readbuffer): + # Just move the _offset index if the new position is in the _readbuffer + self._offset = buff_offset + read_offset = 0 + elif read_offset < 0: + # Position is before the current position. 
Reset the ZipExtFile + self._fileobj.seek(self._orig_compress_start) + self._running_crc = self._orig_start_crc + self._compress_left = self._orig_compress_size + self._left = self._orig_file_size + self._readbuffer = b'' + self._offset = 0 + self._decompressor = _get_decompressor(self._compress_type) + self._eof = False + read_offset = new_pos + + while read_offset > 0: + read_len = min(self.MAX_SEEK_READ, read_offset) + self.read(read_len) + read_offset -= read_len + + return self.tell() + + def tell(self): + if not self._seekable: + raise io.UnsupportedOperation("underlying stream is not seekable") + filepos = self._orig_file_size - self._left - len(self._readbuffer) + self._offset + return filepos + + +class _ZipWriteFile(io.BufferedIOBase): + def __init__(self, zf, zinfo, zip64): + self._zinfo = zinfo + self._zip64 = zip64 + self._zipfile = zf + self._compressor = _get_compressor(zinfo.compress_type, + zinfo._compresslevel) + self._file_size = 0 + self._compress_size = 0 + self._crc = 0 + + @property + def _fileobj(self): + return self._zipfile.fp + + def writable(self): + return True + + def write(self, data): + if self.closed: + raise ValueError('I/O operation on closed file.') + nbytes = len(data) + self._file_size += nbytes + self._crc = crc32(data, self._crc) + if self._compressor: + data = self._compressor.compress(data) + self._compress_size += len(data) + self._fileobj.write(data) + return nbytes + + def close(self): + if self.closed: + return + super().close() + # Flush any data from the compressor, and update header info + if self._compressor: + buf = self._compressor.flush() + self._compress_size += len(buf) + self._fileobj.write(buf) + self._zinfo.compress_size = self._compress_size + else: + self._zinfo.compress_size = self._file_size + self._zinfo.CRC = self._crc + self._zinfo.file_size = self._file_size + + # Write updated header info + if self._zinfo.flag_bits & 0x08: + # Write CRC and file sizes after the file data + fmt = '<LQQ' if self._zip64 else '<LLL' + self._fileobj.write(struct.pack(fmt, self._zinfo.CRC, + self._zinfo.compress_size, self._zinfo.file_size)) + self._zipfile.start_dir = self._fileobj.tell() + else: + if not self._zip64: + if self._file_size > ZIP64_LIMIT: + raise RuntimeError('File size unexpectedly exceeded ZIP64 ' + 'limit') + if self._compress_size > ZIP64_LIMIT: + raise RuntimeError('Compressed size unexpectedly exceeded ' + 'ZIP64 limit') + # Seek backwards and write file header (which will now include + # correct CRC and file sizes) + + # Preserve current position in file + self._zipfile.start_dir = self._fileobj.tell() + self._fileobj.seek(self._zinfo.header_offset) + self._fileobj.write(self._zinfo.FileHeader(self._zip64)) + self._fileobj.seek(self._zipfile.start_dir) + + self._zipfile._writing = False + + # Successfully written: Add file to our caches + self._zipfile.filelist.append(self._zinfo) + self._zipfile.NameToInfo[self._zinfo.filename] = self._zinfo + +class ZipFile: + """ Class with methods to open, read, write, close, list zip files. + + z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True, + compresslevel=None) + + file: Either the path to the file, or a file-like object. + If it is a path, the file will be opened and closed by ZipFile. + mode: The mode can be either read 'r', write 'w', exclusive create 'x', + or append 'a'. + compression: ZIP_STORED (no compression), ZIP_DEFLATED (requires zlib), + ZIP_BZIP2 (requires bz2) or ZIP_LZMA (requires lzma). 
+ allowZip64: if True ZipFile will create files with ZIP64 extensions when + needed, otherwise it will raise an exception when this would + be necessary. + compresslevel: None (default for the given compression type) or an integer + specifying the level to pass to the compressor. + When using ZIP_STORED or ZIP_LZMA this keyword has no effect. + When using ZIP_DEFLATED integers 0 through 9 are accepted. + When using ZIP_BZIP2 integers 1 through 9 are accepted. + + """ + + fp = None # Set here since __del__ checks it + _windows_illegal_name_trans_table = None + + def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, + compresslevel=None): + """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x', + or append 'a'.""" + if mode not in ('r', 'w', 'x', 'a'): + raise ValueError("ZipFile requires mode 'r', 'w', 'x', or 'a'") + + _check_compression(compression) + + self._allowZip64 = allowZip64 + self._didModify = False + self.debug = 0 # Level of printing: 0 through 3 + self.NameToInfo = {} # Find file info given name + self.filelist = [] # List of ZipInfo instances for archive + self.compression = compression # Method of compression + self.compresslevel = compresslevel + self.mode = mode + self.pwd = None + self._comment = b'' + + # Check if we were passed a file-like object + if isinstance(file, os.PathLike): + file = os.fspath(file) + if isinstance(file, str): + # No, it's a filename + self._filePassed = 0 + self.filename = file + modeDict = {'r' : 'rb', 'w': 'w+b', 'x': 'x+b', 'a' : 'r+b', + 'r+b': 'w+b', 'w+b': 'wb', 'x+b': 'xb'} + filemode = modeDict[mode] + while True: + try: + self.fp = io.open(file, filemode) + except OSError: + if filemode in modeDict: + filemode = modeDict[filemode] + continue + raise + break + else: + self._filePassed = 1 + self.fp = file + self.filename = getattr(file, 'name', None) + self._fileRefCnt = 1 + self._lock = threading.RLock() + self._seekable = True + self._writing = False + + try: + if mode == 'r': + self._RealGetContents() + elif mode in ('w', 'x'): + # set the modified flag so central directory gets written + # even if no files are added to the archive + self._didModify = True + try: + self.start_dir = self.fp.tell() + except (AttributeError, OSError): + self.fp = _Tellable(self.fp) + self.start_dir = 0 + self._seekable = False + else: + # Some file-like objects can provide tell() but not seek() + try: + self.fp.seek(self.start_dir) + except (AttributeError, OSError): + self._seekable = False + elif mode == 'a': + try: + # See if file is a zip file + self._RealGetContents() + # seek to start of directory and overwrite + self.fp.seek(self.start_dir) + except BadZipFile: + # file is not a zip file, just append + self.fp.seek(0, 2) + + # set the modified flag so central directory gets written + # even if no files are added to the archive + self._didModify = True + self.start_dir = self.fp.tell() + else: + raise ValueError("Mode must be 'r', 'w', 'x', or 'a'") + except: + fp = self.fp + self.fp = None + self._fpclose(fp) + raise + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.close() + + def __repr__(self): + result = ['<%s.%s' % (self.__class__.__module__, + self.__class__.__qualname__)] + if self.fp is not None: + if self._filePassed: + result.append(' file=%r' % self.fp) + elif self.filename is not None: + result.append(' filename=%r' % self.filename) + result.append(' mode=%r' % self.mode) + else: + result.append(' [closed]') + result.append('>') + return 
''.join(result) + + def _RealGetContents(self): + """Read in the table of contents for the ZIP file.""" + fp = self.fp + try: + endrec = _EndRecData(fp) + except OSError: + raise BadZipFile("File is not a zip file") + if not endrec: + raise BadZipFile("File is not a zip file") + if self.debug > 1: + print(endrec) + size_cd = endrec[_ECD_SIZE] # bytes in central directory + offset_cd = endrec[_ECD_OFFSET] # offset of central directory + self._comment = endrec[_ECD_COMMENT] # archive comment + + # "concat" is zero, unless zip was concatenated to another file + concat = endrec[_ECD_LOCATION] - size_cd - offset_cd + if endrec[_ECD_SIGNATURE] == stringEndArchive64: + # If Zip64 extension structures are present, account for them + concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) + + if self.debug > 2: + inferred = concat + offset_cd + print("given, inferred, offset", offset_cd, inferred, concat) + # self.start_dir: Position of start of central directory + self.start_dir = offset_cd + concat + fp.seek(self.start_dir, 0) + data = fp.read(size_cd) + fp = io.BytesIO(data) + total = 0 + while total < size_cd: + centdir = fp.read(sizeCentralDir) + if len(centdir) != sizeCentralDir: + raise BadZipFile("Truncated central directory") + centdir = struct.unpack(structCentralDir, centdir) + if centdir[_CD_SIGNATURE] != stringCentralDir: + raise BadZipFile("Bad magic number for central directory") + if self.debug > 2: + print(centdir) + filename = fp.read(centdir[_CD_FILENAME_LENGTH]) + flags = centdir[5] + if flags & 0x800: + # UTF-8 file names extension + filename = filename.decode('utf-8') + else: + # Historical ZIP filename encoding + filename = filename.decode('cp437') + # Create ZipInfo instance to store file information + x = ZipInfo(filename) + x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) + x.comment = fp.read(centdir[_CD_COMMENT_LENGTH]) + x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] + (x.create_version, x.create_system, x.extract_version, x.reserved, + x.flag_bits, x.compress_type, t, d, + x.CRC, x.compress_size, x.file_size) = centdir[1:12] + if x.extract_version > MAX_EXTRACT_VERSION: + raise NotImplementedError("zip file version %.1f" % + (x.extract_version / 10)) + x.volume, x.internal_attr, x.external_attr = centdir[15:18] + # Convert date/time code to (year, month, day, hour, min, sec) + x._raw_time = t + x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, + t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) + + x._decodeExtra() + x.header_offset = x.header_offset + concat + self.filelist.append(x) + self.NameToInfo[x.filename] = x + + # update total bytes read from central directory + total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH] + + centdir[_CD_EXTRA_FIELD_LENGTH] + + centdir[_CD_COMMENT_LENGTH]) + + if self.debug > 2: + print("total", total) + + + def namelist(self): + """Return a list of file names in the archive.""" + return [data.filename for data in self.filelist] + + def infolist(self): + """Return a list of class ZipInfo instances for files in the + archive.""" + return self.filelist + + def printdir(self, file=None): + """Print a table of contents for the zip file.""" + print("%-46s %19s %12s" % ("File Name", "Modified ", "Size"), + file=file) + for zinfo in self.filelist: + date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6] + print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size), + file=file) + + def testzip(self): + """Read all the files and check the CRC.""" + chunk_size = 2 ** 20 + for zinfo in self.filelist: + try: + # Read by chunks, to avoid 
an OverflowError or a + # MemoryError with very large embedded files. + with self.open(zinfo.filename, "r") as f: + while f.read(chunk_size): # Check CRC-32 + pass + except BadZipFile: + return zinfo.filename + + def getinfo(self, name): + """Return the instance of ZipInfo given 'name'.""" + info = self.NameToInfo.get(name) + if info is None: + raise KeyError( + 'There is no item named %r in the archive' % name) + + return info + + def setpassword(self, pwd): + """Set default password for encrypted files.""" + if pwd and not isinstance(pwd, bytes): + raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__) + if pwd: + self.pwd = pwd + else: + self.pwd = None + + @property + def comment(self): + """The comment text associated with the ZIP file.""" + return self._comment + + @comment.setter + def comment(self, comment): + if not isinstance(comment, bytes): + raise TypeError("comment: expected bytes, got %s" % type(comment).__name__) + # check for valid comment length + if len(comment) > ZIP_MAX_COMMENT: + import warnings + warnings.warn('Archive comment is too long; truncating to %d bytes' + % ZIP_MAX_COMMENT, stacklevel=2) + comment = comment[:ZIP_MAX_COMMENT] + self._comment = comment + self._didModify = True + + def read(self, name, pwd=None): + """Return file bytes (as a string) for name.""" + with self.open(name, "r", pwd) as fp: + return fp.read() + + def open(self, name, mode="r", pwd=None, *, force_zip64=False): + """Return file-like object for 'name'. + + name is a string for the file name within the ZIP file, or a ZipInfo + object. + + mode should be 'r' to read a file already in the ZIP file, or 'w' to + write to a file newly added to the archive. + + pwd is the password to decrypt files (only used for reading). + + When writing, if the file size is not known in advance but may exceed + 2 GiB, pass force_zip64 to use the ZIP64 format, which can handle large + files. If the size is known in advance, it is best to pass a ZipInfo + instance for name, with zinfo.file_size set. + """ + if mode not in {"r", "w"}: + raise ValueError('open() requires mode "r" or "w"') + if pwd and not isinstance(pwd, bytes): + raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__) + if pwd and (mode == "w"): + raise ValueError("pwd is only supported for reading files") + if not self.fp: + raise ValueError( + "Attempt to use ZIP archive that was already closed") + + # Make sure we have an info object + if isinstance(name, ZipInfo): + # 'name' is already an info object + zinfo = name + elif mode == 'w': + zinfo = ZipInfo(name) + zinfo.compress_type = self.compression + zinfo._compresslevel = self.compresslevel + else: + # Get info object for name + zinfo = self.getinfo(name) + + if mode == 'w': + return self._open_to_write(zinfo, force_zip64=force_zip64) + + if self._writing: + raise ValueError("Can't read from the ZIP file while there " + "is an open writing handle on it. 
" + "Close the writing handle before trying to read.") + + # Open for reading: + self._fileRefCnt += 1 + zef_file = _SharedFile(self.fp, zinfo.header_offset, + self._fpclose, self._lock, lambda: self._writing) + try: + # Skip the file header: + fheader = zef_file.read(sizeFileHeader) + if len(fheader) != sizeFileHeader: + raise BadZipFile("Truncated file header") + fheader = struct.unpack(structFileHeader, fheader) + if fheader[_FH_SIGNATURE] != stringFileHeader: + raise BadZipFile("Bad magic number for file header") + + fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) + if fheader[_FH_EXTRA_FIELD_LENGTH]: + zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) + + if zinfo.flag_bits & 0x20: + # Zip 2.7: compressed patched data + raise NotImplementedError("compressed patched data (flag bit 5)") + + if zinfo.flag_bits & 0x40: + # strong encryption + raise NotImplementedError("strong encryption (flag bit 6)") + + if zinfo.flag_bits & 0x800: + # UTF-8 filename + fname_str = fname.decode("utf-8") + else: + fname_str = fname.decode("cp437") + + if fname_str != zinfo.orig_filename: + raise BadZipFile( + 'File name in directory %r and header %r differ.' + % (zinfo.orig_filename, fname)) + + # check for encrypted flag & handle password + is_encrypted = zinfo.flag_bits & 0x1 + zd = None + if is_encrypted: + if not pwd: + pwd = self.pwd + if not pwd: + raise RuntimeError("File %r is encrypted, password " + "required for extraction" % name) + + zd = _ZipDecrypter(pwd) + # The first 12 bytes in the cypher stream is an encryption header + # used to strengthen the algorithm. The first 11 bytes are + # completely random, while the 12th contains the MSB of the CRC, + # or the MSB of the file time depending on the header type + # and is used to check the correctness of the password. + header = zef_file.read(12) + h = zd(header[0:12]) + if zinfo.flag_bits & 0x8: + # compare against the file type from extended local headers + check_byte = (zinfo._raw_time >> 8) & 0xff + else: + # compare against the CRC otherwise + check_byte = (zinfo.CRC >> 24) & 0xff + if h[11] != check_byte: + raise RuntimeError("Bad password for file %r" % name) + + return ZipExtFile(zef_file, mode, zinfo, zd, True) + except: + zef_file.close() + raise + + def _open_to_write(self, zinfo, force_zip64=False): + if force_zip64 and not self._allowZip64: + raise ValueError( + "force_zip64 is True, but allowZip64 was False when opening " + "the ZIP file." + ) + if self._writing: + raise ValueError("Can't write to the ZIP file while there is " + "another write handle open on it. 
" + "Close the first handle before opening another.") + + # Sizes and CRC are overwritten with correct data after processing the file + if not hasattr(zinfo, 'file_size'): + zinfo.file_size = 0 + zinfo.compress_size = 0 + zinfo.CRC = 0 + + zinfo.flag_bits = 0x00 + if zinfo.compress_type == ZIP_LZMA: + # Compressed data includes an end-of-stream (EOS) marker + zinfo.flag_bits |= 0x02 + if not self._seekable: + zinfo.flag_bits |= 0x08 + + if not zinfo.external_attr: + zinfo.external_attr = 0o600 << 16 # permissions: ?rw------- + + # Compressed size can be larger than uncompressed size + zip64 = self._allowZip64 and \ + (force_zip64 or zinfo.file_size * 1.05 > ZIP64_LIMIT) + + if self._seekable: + self.fp.seek(self.start_dir) + zinfo.header_offset = self.fp.tell() + + self._writecheck(zinfo) + self._didModify = True + + self.fp.write(zinfo.FileHeader(zip64)) + + self._writing = True + return _ZipWriteFile(self, zinfo, zip64) + + def extract(self, member, path=None, pwd=None): + """Extract a member from the archive to the current working directory, + using its full name. Its file information is extracted as accurately + as possible. `member' may be a filename or a ZipInfo object. You can + specify a different directory using `path'. + """ + if path is None: + path = os.getcwd() + else: + path = os.fspath(path) + + return self._extract_member(member, path, pwd) + + def extractall(self, path=None, members=None, pwd=None): + """Extract all members from the archive to the current working + directory. `path' specifies a different directory to extract to. + `members' is optional and must be a subset of the list returned + by namelist(). + """ + if members is None: + members = self.namelist() + + if path is None: + path = os.getcwd() + else: + path = os.fspath(path) + + for zipinfo in members: + self._extract_member(zipinfo, path, pwd) + + @classmethod + def _sanitize_windows_name(cls, arcname, pathsep): + """Replace bad characters and remove trailing dots from parts.""" + table = cls._windows_illegal_name_trans_table + if not table: + illegal = ':<>|"?*' + table = str.maketrans(illegal, '_' * len(illegal)) + cls._windows_illegal_name_trans_table = table + arcname = arcname.translate(table) + # remove trailing dots + arcname = (x.rstrip('.') for x in arcname.split(pathsep)) + # rejoin, removing empty parts. + arcname = pathsep.join(x for x in arcname if x) + return arcname + + def _extract_member(self, member, targetpath, pwd): + """Extract the ZipInfo object 'member' to a physical + file on the path targetpath. + """ + if not isinstance(member, ZipInfo): + member = self.getinfo(member) + + # build the destination pathname, replacing + # forward slashes to platform specific separators. + arcname = member.filename.replace('/', os.path.sep) + + if os.path.altsep: + arcname = arcname.replace(os.path.altsep, os.path.sep) + # interpret absolute pathname as relative, remove drive letter or + # UNC path, redundant separators, "." and ".." components. + arcname = os.path.splitdrive(arcname)[1] + invalid_path_parts = ('', os.path.curdir, os.path.pardir) + arcname = os.path.sep.join(x for x in arcname.split(os.path.sep) + if x not in invalid_path_parts) + if os.path.sep == '\\': + # filter illegal characters on Windows + arcname = self._sanitize_windows_name(arcname, os.path.sep) + + targetpath = os.path.join(targetpath, arcname) + targetpath = os.path.normpath(targetpath) + + # Create all upper directories if necessary. 
+ upperdirs = os.path.dirname(targetpath) + if upperdirs and not os.path.exists(upperdirs): + os.makedirs(upperdirs) + + if member.is_dir(): + if not os.path.isdir(targetpath): + os.mkdir(targetpath) + return targetpath + + with self.open(member, pwd=pwd) as source, \ + open(targetpath, "wb") as target: + shutil.copyfileobj(source, target) + + return targetpath + + def _writecheck(self, zinfo): + """Check for errors before writing a file to the archive.""" + if zinfo.filename in self.NameToInfo: + import warnings + warnings.warn('Duplicate name: %r' % zinfo.filename, stacklevel=3) + if self.mode not in ('w', 'x', 'a'): + raise ValueError("write() requires mode 'w', 'x', or 'a'") + if not self.fp: + raise ValueError( + "Attempt to write ZIP archive that was already closed") + _check_compression(zinfo.compress_type) + if not self._allowZip64: + requires_zip64 = None + if len(self.filelist) >= ZIP_FILECOUNT_LIMIT: + requires_zip64 = "Files count" + elif zinfo.file_size > ZIP64_LIMIT: + requires_zip64 = "Filesize" + elif zinfo.header_offset > ZIP64_LIMIT: + requires_zip64 = "Zipfile size" + if requires_zip64: + raise LargeZipFile(requires_zip64 + + " would require ZIP64 extensions") + + def write(self, filename, arcname=None, + compress_type=None, compresslevel=None): + """Put the bytes from filename into the archive under the name + arcname.""" + if not self.fp: + raise ValueError( + "Attempt to write to ZIP archive that was already closed") + if self._writing: + raise ValueError( + "Can't write to ZIP archive while an open writing handle exists" + ) + + zinfo = ZipInfo.from_file(filename, arcname) + + if zinfo.is_dir(): + zinfo.compress_size = 0 + zinfo.CRC = 0 + else: + if compress_type is not None: + zinfo.compress_type = compress_type + else: + zinfo.compress_type = self.compression + + if compresslevel is not None: + zinfo._compresslevel = compresslevel + else: + zinfo._compresslevel = self.compresslevel + + if zinfo.is_dir(): + with self._lock: + if self._seekable: + self.fp.seek(self.start_dir) + zinfo.header_offset = self.fp.tell() # Start of header bytes + if zinfo.compress_type == ZIP_LZMA: + # Compressed data includes an end-of-stream (EOS) marker + zinfo.flag_bits |= 0x02 + + self._writecheck(zinfo) + self._didModify = True + + self.filelist.append(zinfo) + self.NameToInfo[zinfo.filename] = zinfo + self.fp.write(zinfo.FileHeader(False)) + self.start_dir = self.fp.tell() + else: + with open(filename, "rb") as src, self.open(zinfo, 'w') as dest: + shutil.copyfileobj(src, dest, 1024*8) + + def writestr(self, zinfo_or_arcname, data, + compress_type=None, compresslevel=None): + """Write a file into the archive. The contents is 'data', which + may be either a 'str' or a 'bytes' instance; if it is a 'str', + it is encoded as UTF-8 first. 
+ 'zinfo_or_arcname' is either a ZipInfo instance or + the name of the file in the archive.""" + if isinstance(data, str): + data = data.encode("utf-8") + if not isinstance(zinfo_or_arcname, ZipInfo): + zinfo = ZipInfo(filename=zinfo_or_arcname, + date_time=time.localtime(time.time())[:6]) + zinfo.compress_type = self.compression + zinfo._compresslevel = self.compresslevel + if zinfo.filename[-1] == '/': + zinfo.external_attr = 0o40775 << 16 # drwxrwxr-x + zinfo.external_attr |= 0x10 # MS-DOS directory flag + else: + zinfo.external_attr = 0o600 << 16 # ?rw------- + else: + zinfo = zinfo_or_arcname + + if not self.fp: + raise ValueError( + "Attempt to write to ZIP archive that was already closed") + if self._writing: + raise ValueError( + "Can't write to ZIP archive while an open writing handle exists." + ) + + if compress_type is not None: + zinfo.compress_type = compress_type + + if compresslevel is not None: + zinfo._compresslevel = compresslevel + + zinfo.file_size = len(data) # Uncompressed size + with self._lock: + with self.open(zinfo, mode='w') as dest: + dest.write(data) + + def __del__(self): + """Call the "close()" method in case the user forgot.""" + self.close() + + def close(self): + """Close the file, and for mode 'w', 'x' and 'a' write the ending + records.""" + if self.fp is None: + return + + if self._writing: + raise ValueError("Can't close the ZIP file while there is " + "an open writing handle on it. " + "Close the writing handle before closing the zip.") + + try: + if self.mode in ('w', 'x', 'a') and self._didModify: # write ending records + with self._lock: + if self._seekable: + self.fp.seek(self.start_dir) + self._write_end_record() + finally: + fp = self.fp + self.fp = None + self._fpclose(fp) + + def _write_end_record(self): + for zinfo in self.filelist: # write central directory + dt = zinfo.date_time + dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] + dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) + extra = [] + if zinfo.file_size > ZIP64_LIMIT \ + or zinfo.compress_size > ZIP64_LIMIT: + extra.append(zinfo.file_size) + extra.append(zinfo.compress_size) + file_size = 0xffffffff + compress_size = 0xffffffff + else: + file_size = zinfo.file_size + compress_size = zinfo.compress_size + + if zinfo.header_offset > ZIP64_LIMIT: + extra.append(zinfo.header_offset) + header_offset = 0xffffffff + else: + header_offset = zinfo.header_offset + + extra_data = zinfo.extra + min_version = 0 + if extra: + # Append a ZIP64 field to the extra's + extra_data = struct.pack( + '<HH' + 'Q'*len(extra), + 1, 8*len(extra), *extra) + extra_data + + min_version = ZIP64_VERSION + + if zinfo.compress_type == ZIP_BZIP2: + min_version = max(BZIP2_VERSION, min_version) + elif zinfo.compress_type == ZIP_LZMA: + min_version = max(LZMA_VERSION, min_version) + + extract_version = max(min_version, zinfo.extract_version) + create_version = max(min_version, zinfo.create_version) + try: + filename, flag_bits = zinfo._encodeFilenameFlags() + centdir = struct.pack(structCentralDir, + stringCentralDir, create_version, + zinfo.create_system, extract_version, zinfo.reserved, + flag_bits, zinfo.compress_type, dostime, dosdate, + zinfo.CRC, compress_size, file_size, + len(filename), len(extra_data), len(zinfo.comment), + 0, zinfo.internal_attr, zinfo.external_attr, + header_offset) + except DeprecationWarning: + print((structCentralDir, stringCentralDir, create_version, + zinfo.create_system, extract_version, zinfo.reserved, + zinfo.flag_bits, zinfo.compress_type, dostime, dosdate, + zinfo.CRC, 
compress_size, file_size, + len(zinfo.filename), len(extra_data), len(zinfo.comment), + 0, zinfo.internal_attr, zinfo.external_attr, + header_offset), file=sys.stderr) + raise + self.fp.write(centdir) + self.fp.write(filename) + self.fp.write(extra_data) + self.fp.write(zinfo.comment) + + pos2 = self.fp.tell() + # Write end-of-zip-archive record + centDirCount = len(self.filelist) + centDirSize = pos2 - self.start_dir + centDirOffset = self.start_dir + requires_zip64 = None + if centDirCount > ZIP_FILECOUNT_LIMIT: + requires_zip64 = "Files count" + elif centDirOffset > ZIP64_LIMIT: + requires_zip64 = "Central directory offset" + elif centDirSize > ZIP64_LIMIT: + requires_zip64 = "Central directory size" + if requires_zip64: + # Need to write the ZIP64 end-of-archive records + if not self._allowZip64: + raise LargeZipFile(requires_zip64 + + " would require ZIP64 extensions") + zip64endrec = struct.pack( + structEndArchive64, stringEndArchive64, + 44, 45, 45, 0, 0, centDirCount, centDirCount, + centDirSize, centDirOffset) + self.fp.write(zip64endrec) + + zip64locrec = struct.pack( + structEndArchive64Locator, + stringEndArchive64Locator, 0, pos2, 1) + self.fp.write(zip64locrec) + centDirCount = min(centDirCount, 0xFFFF) + centDirSize = min(centDirSize, 0xFFFFFFFF) + centDirOffset = min(centDirOffset, 0xFFFFFFFF) + + endrec = struct.pack(structEndArchive, stringEndArchive, + 0, 0, centDirCount, centDirCount, + centDirSize, centDirOffset, len(self._comment)) + self.fp.write(endrec) + self.fp.write(self._comment) + self.fp.flush() + + def _fpclose(self, fp): + assert self._fileRefCnt > 0 + self._fileRefCnt -= 1 + if not self._fileRefCnt and not self._filePassed: + fp.close() + + +class PyZipFile(ZipFile): + """Class to create ZIP archives with Python library files and packages.""" + + def __init__(self, file, mode="r", compression=ZIP_STORED, + allowZip64=True, optimize=-1): + ZipFile.__init__(self, file, mode=mode, compression=compression, + allowZip64=allowZip64) + self._optimize = optimize + + def writepy(self, pathname, basename="", filterfunc=None): + """Add all files from "pathname" to the ZIP archive. + + If pathname is a package directory, search the directory and + all package subdirectories recursively for all *.py and enter + the modules into the archive. If pathname is a plain + directory, listdir *.py and enter all modules. Else, pathname + must be a Python *.py file and the module will be put into the + archive. Added modules are always module.pyc. + This method will compile the module.py into module.pyc if + necessary. + If filterfunc(pathname) is given, it is called with every argument. + When it is False, the file or directory is skipped. 
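+
+        Example (illustrative):
+            with PyZipFile("lib.zip", mode="w") as zf:
+                zf.writepy("mypkg")   # adds mypkg/__init__.pyc, mypkg/*.pyc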
+ """ + pathname = os.fspath(pathname) + if filterfunc and not filterfunc(pathname): + if self.debug: + label = 'path' if os.path.isdir(pathname) else 'file' + print('%s %r skipped by filterfunc' % (label, pathname)) + return + dir, name = os.path.split(pathname) + if os.path.isdir(pathname): + initname = os.path.join(pathname, "__init__.py") + if os.path.isfile(initname): + # This is a package directory, add it + if basename: + basename = "%s/%s" % (basename, name) + else: + basename = name + if self.debug: + print("Adding package in", pathname, "as", basename) + fname, arcname = self._get_codename(initname[0:-3], basename) + if self.debug: + print("Adding", arcname) + self.write(fname, arcname) + dirlist = sorted(os.listdir(pathname)) + dirlist.remove("__init__.py") + # Add all *.py files and package subdirectories + for filename in dirlist: + path = os.path.join(pathname, filename) + root, ext = os.path.splitext(filename) + if os.path.isdir(path): + if os.path.isfile(os.path.join(path, "__init__.py")): + # This is a package directory, add it + self.writepy(path, basename, + filterfunc=filterfunc) # Recursive call + elif ext == ".py": + if filterfunc and not filterfunc(path): + if self.debug: + print('file %r skipped by filterfunc' % path) + continue + fname, arcname = self._get_codename(path[0:-3], + basename) + if self.debug: + print("Adding", arcname) + self.write(fname, arcname) + else: + # This is NOT a package directory, add its files at top level + if self.debug: + print("Adding files from directory", pathname) + for filename in sorted(os.listdir(pathname)): + path = os.path.join(pathname, filename) + root, ext = os.path.splitext(filename) + if ext == ".py": + if filterfunc and not filterfunc(path): + if self.debug: + print('file %r skipped by filterfunc' % path) + continue + fname, arcname = self._get_codename(path[0:-3], + basename) + if self.debug: + print("Adding", arcname) + self.write(fname, arcname) + else: + if pathname[-3:] != ".py": + raise RuntimeError( + 'Files added with writepy() must end with ".py"') + fname, arcname = self._get_codename(pathname[0:-3], basename) + if self.debug: + print("Adding file", arcname) + self.write(fname, arcname) + + def _get_codename(self, pathname, basename): + """Return (filename, archivename) for the path. + + Given a module name path, return the correct file path and + archive name, compiling if necessary. For example, given + /python/lib/string, return (/python/lib/string.pyc, string). + """ + def _compile(file, optimize=-1): + import py_compile + if self.debug: + print("Compiling", file) + try: + py_compile.compile(file, doraise=True, optimize=optimize) + except py_compile.PyCompileError as err: + print(err.msg) + return False + return True + + file_py = pathname + ".py" + file_pyc = pathname + ".pyc" + pycache_opt0 = importlib.util.cache_from_source(file_py, optimization='') + pycache_opt1 = importlib.util.cache_from_source(file_py, optimization=1) + pycache_opt2 = importlib.util.cache_from_source(file_py, optimization=2) + if self._optimize == -1: + # legacy mode: use whatever file is present + if (os.path.isfile(file_pyc) and + os.stat(file_pyc).st_mtime >= os.stat(file_py).st_mtime): + # Use .pyc file. + arcname = fname = file_pyc + elif (os.path.isfile(pycache_opt0) and + os.stat(pycache_opt0).st_mtime >= os.stat(file_py).st_mtime): + # Use the __pycache__/*.pyc file, but write it to the legacy pyc + # file name in the archive. 
+                fname = pycache_opt0
+                arcname = file_pyc
+            elif (os.path.isfile(pycache_opt1) and
+                  os.stat(pycache_opt1).st_mtime >= os.stat(file_py).st_mtime):
+                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
+                # file name in the archive.
+                fname = pycache_opt1
+                arcname = file_pyc
+            elif (os.path.isfile(pycache_opt2) and
+                  os.stat(pycache_opt2).st_mtime >= os.stat(file_py).st_mtime):
+                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
+                # file name in the archive.
+                fname = pycache_opt2
+                arcname = file_pyc
+            else:
+                # Compile py into PEP 3147 pyc file.
+                if _compile(file_py):
+                    if sys.flags.optimize == 0:
+                        fname = pycache_opt0
+                    elif sys.flags.optimize == 1:
+                        fname = pycache_opt1
+                    else:
+                        fname = pycache_opt2
+                    arcname = file_pyc
+                else:
+                    fname = arcname = file_py
+        else:
+            # new mode: use given optimization level
+            if self._optimize == 0:
+                fname = pycache_opt0
+                arcname = file_pyc
+            else:
+                arcname = file_pyc
+                if self._optimize == 1:
+                    fname = pycache_opt1
+                elif self._optimize == 2:
+                    fname = pycache_opt2
+                else:
+                    msg = "invalid value for 'optimize': {!r}".format(self._optimize)
+                    raise ValueError(msg)
+            if not (os.path.isfile(fname) and
+                    os.stat(fname).st_mtime >= os.stat(file_py).st_mtime):
+                if not _compile(file_py, optimize=self._optimize):
+                    fname = arcname = file_py
+        archivename = os.path.split(arcname)[1]
+        if basename:
+            archivename = "%s/%s" % (basename, archivename)
+        return (fname, archivename)
+
+
+def main(args=None):
+    import argparse
+
+    description = 'A simple command-line interface for zipfile module.'
+    parser = argparse.ArgumentParser(description=description)
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument('-l', '--list', metavar='<zipfile>',
+                       help='Show listing of a zipfile')
+    group.add_argument('-e', '--extract', nargs=2,
+                       metavar=('<zipfile>', '<output_dir>'),
+                       help='Extract zipfile into target dir')
+    group.add_argument('-c', '--create', nargs='+',
+                       metavar=('<name>', '<file>'),
+                       help='Create zipfile from sources')
+    group.add_argument('-t', '--test', metavar='<zipfile>',
+                       help='Test if a zipfile is valid')
+    args = parser.parse_args(args)
+
+    if args.test is not None:
+        src = args.test
+        with ZipFile(src, 'r') as zf:
+            badfile = zf.testzip()
+        if badfile:
+            print("The following enclosed file is corrupted: {!r}".format(badfile))
+        print("Done testing")
+
+    elif args.list is not None:
+        src = args.list
+        with ZipFile(src, 'r') as zf:
+            zf.printdir()
+
+    elif args.extract is not None:
+        src, curdir = args.extract
+        with ZipFile(src, 'r') as zf:
+            zf.extractall(curdir)
+
+    elif args.create is not None:
+        zip_name = args.create.pop(0)
+        files = args.create
+
+        def addToZip(zf, path, zippath):
+            if os.path.isfile(path):
+                zf.write(path, zippath, ZIP_DEFLATED)
+            elif os.path.isdir(path):
+                if zippath:
+                    zf.write(path, zippath)
+                for nm in sorted(os.listdir(path)):
+                    addToZip(zf,
+                             os.path.join(path, nm), os.path.join(zippath, nm))
+            # else: ignore
+
+        with ZipFile(zip_name, 'w') as zf:
+            for path in files:
+                zippath = os.path.basename(path)
+                if not zippath:
+                    zippath = os.path.basename(os.path.dirname(path))
+                if zippath in ('', os.curdir, os.pardir):
+                    zippath = ''
+                addToZip(zf, path, zippath)
+
+if __name__ == "__main__":
+    main()
Binary files differ
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+"""
+Concatenate pages from pdf files into a single pdf file.
+
+Page ranges refer to the previously-named file.
+A file not followed by a page range means all the pages of the file.
+
+PAGE RANGES are like Python slices.
+    {page_range_help}
+EXAMPLES
+    pdfcat -o output.pdf head.pdf content.pdf :6 7: tail.pdf -1
+        Concatenate all of head.pdf, all but page seven of content.pdf,
+        and the last page of tail.pdf, producing output.pdf.
+
+    pdfcat chapter*.pdf >book.pdf
+        You can specify the output file by redirection.
+
+    pdfcat chapter?.pdf chapter10.pdf >book.pdf
+        In case you don't want chapter 10 before chapter 2.
+"""
+# Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.
+# All rights reserved. This software is available under a BSD license;
+# see https://github.com/mstamy2/PyPDF2/LICENSE
+
+from __future__ import print_function
+import argparse
+from PdfFileTransformer.PyPDF2.pagerange import PAGE_RANGE_HELP
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=__doc__.format(page_range_help=PAGE_RANGE_HELP),
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("-o", "--output",
+                        metavar="output_file")
+    parser.add_argument("-v", "--verbose", action="store_true",
+                        help="show page ranges as they are being read")
+    parser.add_argument("first_filename", nargs=1,
+                        metavar="filename [page range...]")
+    # argparse chokes on page ranges like "-2:" unless caught like this:
+    parser.add_argument("fn_pgrgs", nargs=argparse.REMAINDER,
+                        metavar="filenames and/or page ranges")
+    args = parser.parse_args()
+    args.fn_pgrgs.insert(0, args.first_filename[0])
+    return args
+
+
+from sys import stderr, stdout, exit
+import os
+import traceback
+from collections import defaultdict
+
+from PdfFileTransformer.PyPDF2 import PdfFileMerger, parse_filename_page_ranges
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    filename_page_ranges = parse_filename_page_ranges(args.fn_pgrgs)
+    if args.output:
+        output = open(args.output, "wb")
+    else:
+        stdout.flush()
+        output = os.fdopen(stdout.fileno(), "wb")
+
+    merger = PdfFileMerger()
+    in_fs = dict()
+    try:
+        for (filename, page_range) in filename_page_ranges:
+            if args.verbose:
+                print(filename, page_range, file=stderr)
+            if filename not in in_fs:
+                in_fs[filename] = open(filename, "rb")
+            merger.append(in_fs[filename], pages=page_range)
+    except:
+        print(traceback.format_exc(), file=stderr)
+        print("Error while reading " + filename, file=stderr)
+        exit(1)
+    merger.write(output)
+    # In 3.0, input files must stay open until output is written.
+    # Not closing the in_fs because this script exits now.
diff --git a/tests/samples/descriptions.txt b/tests/samples/descriptions.txt
new file mode 100644
index 0000000..6d57430
--- /dev/null
+++ b/tests/samples/descriptions.txt
@@ -0,0 +1,7 @@
+== Zip files ==
+
+test1.zip: two files and a global archive comment.
+
+== Pdf files ==
+
+test1.pdf: a tax document.
\ No newline at end of file
diff --git a/tests/samples/test1.pdf b/tests/samples/test1.pdf
Binary files differ
new file mode 100644
index 0000000..3e79ae1
--- /dev/null
+++ b/tests/samples/test1.pdf
diff --git a/tests/samples/test1.zip b/tests/samples/test1.zip
Binary files differ
new file mode 100644
index 0000000..cd93f9d
--- /dev/null
+++ b/tests/samples/test1.zip
diff --git a/tests/samples/test1_normalized.pdf b/tests/samples/test1_normalized.pdf
Binary files differ
new file mode 100644
index 0000000..e955cb1
--- /dev/null
+++ b/tests/samples/test1_normalized.pdf
diff --git a/tests/test_pdf_add_data.py b/tests/test_pdf_add_data.py
new file mode 100755
index 0000000..eb12d93
--- /dev/null
+++ b/tests/test_pdf_add_data.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import sys
+sys.path.append("../")
+
+import logging
+from PdfFileTransformer import Pdf
+
+
+input_file = "./samples/test1.pdf"
+output_file = "./samples/test1_out.pdf"
+
+logging.basicConfig(level=logging.DEBUG)
+
+p = Pdf(input_file)
+p.insert_new_obj_stream_at_start(b'A' * 140)
+p.insert_new_obj_stream_at_end(b'B' * 120)
+f = open(output_file, 'wb')
+f.write(p.get_build_buffer())
+f.close()
diff --git a/tests/test_pdf_normalisation.py b/tests/test_pdf_normalisation.py
new file mode 100755
index 0000000..aba197e
--- /dev/null
+++ b/tests/test_pdf_normalisation.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import sys
+sys.path.append("../")
+import logging
+from PdfFileTransformer.PyPDF2 import PdfFileReader, PdfFileWriter
+
+
+input_file = "./samples/test1.pdf"
+output_file = "./samples/test1_out.pdf"
+
+logging.basicConfig(level=logging.DEBUG)
+
+f_input = open(input_file, "rb")
+reader = PdfFileReader(f_input)
+
+f_output = open(output_file, "wb")
+writer = PdfFileWriter()
+
+writer.appendPagesFromReader(reader)
+writer.setHeader(b"%PDF-1.5")
+writer.write(f_output)
+
+f_input.close()
+f_output.close()
diff --git a/tests/test_pdf_rebuild.py b/tests/test_pdf_rebuild.py
new file mode 100755
index 0000000..29b31c3
--- /dev/null
+++ b/tests/test_pdf_rebuild.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import sys
+sys.path.append("../")
+
+from PdfFileTransformer import Pdf
+import logging
+
+input_file = "./samples/test1.pdf"
+output_file = "./samples/test1_out.pdf"
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+p = Pdf(input_file)
+f = open(output_file, 'wb')
+f.write(p.get_build_buffer())
+f.close()
diff --git a/tests/test_polyglot_pdfzip.py b/tests/test_polyglot_pdfzip.py
new file mode 100755
index 0000000..f4a3ff2
--- /dev/null
+++ b/tests/test_polyglot_pdfzip.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import sys
+sys.path.append("../")
+
+from PdfFileTransformer import Pdf
+from ZipFileTransformer import Zip
+from PolyglotFile import PolyglotPdfZip
+import logging
+
+input_file_pdf = "./samples/test1.pdf"
+input_file_zip = "./samples/test1.zip"
+output_file = "./samples/test1_out.pdf"
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+p = Pdf(input_file_pdf)
+z = Zip(input_file_zip)
+a = PolyglotPdfZip(p, z)
+a.generate()
+a.write(output_file)
diff --git a/tests/test_rebuild_zip.py b/tests/test_rebuild_zip.py
new file mode 100755
index 0000000..ceb04a1
--- /dev/null
+++ b/tests/test_rebuild_zip.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import sys
+sys.path.append("../")
+
+import tempfile
+
+from ZipFileTransformer import Zip, ZipFile
+
+input_file = "./samples/test1.zip"
+output_file = "./samples/test1_out.zip"
+
+zi = ZipFile(input_file, "r")
+zo = ZipFile(output_file, "w")
+zo.writestr(' ', b'AAAAAAAAAAAAAAAAAAAAAA', 0)
+for zipinfo in zi.infolist():
+    zo.writestr(zipinfo, zi.read(zipinfo))
+zi.close()
+zo.close()
diff --git a/tests/test_zip.py b/tests/test_zip.py
new file mode 100755
index 0000000..81b8f6e
--- /dev/null
+++ b/tests/test_zip.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import sys
+sys.path.append("../")
+
+import tempfile
+
+from ZipFileTransformer import Zip
+
+input_file = "./samples/test1.zip"
+output_file = tempfile.mktemp()
+print("Output: " + output_file)
+
+z = Zip(input_file)
+a = bytearray(b'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
+b = bytearray(b'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB')
+z.add_data_to_file(a, b, False)
+g = open(output_file, "wb")
+g.write(a + z.get_local_file_data() + b + z.get_data_after_central_directory())
+g.close()
diff --git a/truepolyglot b/truepolyglot
new file mode 100755
index 0000000..2ff9269
--- /dev/null
+++ b/truepolyglot
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import logging
+
+from PdfFileTransformer import Pdf
+from ZipFileTransformer import Zip
+from PolyglotFile import PolyglotZipPdf, PolyglotPdfZip, PolyglotSZipPdf
+
+
+def main():
+    description_str = ('Generate a polyglot file.\n\nAvailable formats:\n' +
+                       '* pdfzip: Generate a file valid as PDF and ZIP.' +
+                       ' The format is closest to PDF.\n' +
+                       '* zippdf: Generate a file valid as ZIP and PDF.' +
+                       ' The format is closest to ZIP.\n' +
+                       '* szippdf: Generate a file valid as ZIP and PDF.' +
+                       ' The format is strictly a ZIP.' +
+                       ' The archive is modified.')
+    usage_str = '%(prog)s format [options] output-file'
+    epilog_str = 'TruePolyglot v1.3'
+    frm = argparse.RawTextHelpFormatter
+    parser = argparse.ArgumentParser(description=description_str,
+                                     epilog=epilog_str,
+                                     usage=usage_str,
+                                     formatter_class=frm)
+    parser.add_argument('format', nargs='+', choices=["pdfzip",
+                                                      "zippdf",
+                                                      "szippdf"],
+                        help='Output polyglot format')
+    parser.add_argument('--pdffile', dest='pdffile',
+                        help='PDF input file')
+    parser.add_argument('--zipfile', dest='zipfile',
+                        help='ZIP input file')
+    parser.add_argument('--verbose', dest='verbose',
+                        help='Verbosity level (default: info)',
+                        default="info",
+                        choices=["none", "error", "info", "debug"])
+    parser.add_argument('output_file', nargs='+',
+                        help='Output polyglot file path')
+
+    args = parser.parse_args()
+
+    formats = ["pdfzip", "zippdf", "szippdf"]
+    if args.format[0] in formats:
+        if args.pdffile is None:
+            parser.error('pdffile is required')
+        if args.zipfile is None:
+            parser.error('zipfile is required')
+
+    if args.verbose == "none":
+        logging.basicConfig(level=logging.CRITICAL)
+    if args.verbose == "error":
+        logging.basicConfig(level=logging.ERROR)
+    if args.verbose == "info":
+        logging.basicConfig(level=logging.INFO)
+    if args.verbose == "debug":
+        logging.basicConfig(level=logging.DEBUG)
+
+    p = Pdf(args.pdffile)
+    z = Zip(args.zipfile)
+    if args.format[0] == "pdfzip":
+        a = PolyglotPdfZip(p, z)
+    if args.format[0] == "zippdf":
+        a = PolyglotZipPdf(p, z)
+    if args.format[0] == "szippdf":
+        a = PolyglotSZipPdf(p, z)
+    a.generate()
+    a.write(args.output_file[0])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/website/css/styles.css b/website/css/styles.css
new file mode 100644
index 0000000..adc9728
--- /dev/null
+++ b/website/css/styles.css
@@ -0,0 +1,63 @@
+html {
+    background-color: black;
+    font-family: Consolas,monaco,monospace;
+    color: #92D050;
+}
+body {
+    background-color: black;
+    font-family: Consolas,monaco,monospace;
+    color: #92D050;
+}
+td {
+    background-color: black;
+    font-family: Consolas,monaco,monospace;
+    color: #92D050;
+}
+th {
+    background-color: black;
+    font-family: Consolas,monaco,monospace;
+    color: #92D050;
+}
+h1 {
+    color: white;
+}
+a:link {
+    color: #47B8C7;
+}
+a:visited {
+    color: #47B8C7;
+}
+a:active {
+    color: #47B8C7;
+}
+table {
+    border-collapse: collapse;
+}
+table, th, td {
+    border: 1px solid white;
+}
+th {
+    background-color: #92D050;
+    color: black;
+}
+th {
+    padding-left: 0.5em;
+    padding-right: 0.5em;
+    padding-top: 0.5em;
+    padding-bottom: 0.5em;
+}
+td {
+    padding-left: 0.5em;
+    padding-right: 0.5em;
+    padding-bottom: 0.5em;
+    padding-top: 0.5em;
+    text-align: left;
+}
+
+.font_reduce {
+    font-size: 75%;
+}
+
+.warning {
+    color: #ffb833;
+}
diff --git a/website/css/styles2.css b/website/css/styles2.css
new file mode 100644
index 0000000..56ef7e5
--- /dev/null
+++ b/website/css/styles2.css
@@ -0,0 +1,61 @@
+html {
+    background-color: black;
+    font-family: Consolas,monaco,monospace;
+    color: #92D050;
+}
+body {
+    background-color: black;
+    font-family: Consolas,monaco,monospace;
+    color: #92D050;
+}
+td {
+    background-color: black;
+    font-family: Consolas,monaco,monospace;
+    color: #92D050;
+}
+th {
+    background-color: black;
+    font-family: Consolas,monaco,monospace;
+    color: #92D050;
+}
+a:link {
+    color: #47B8C7;
+}
+a:visited {
+    color: #47B8C7;
+}
+a:active {
+    color: #47B8C7;
+}
+table {
+    border-collapse: collapse;
+}
+table, th, td {
+    border: 1px solid white;
+}
+th {
+    background-color: #92D050;
+    color: black;
+}
+th {
+    padding-left: 0.5em;
+    padding-right: 0.5em;
+    padding-top: 0.5em;
+    padding-bottom: 0.5em;
+}
+td {
+    padding-left: 0.5em;
+    padding-right: 0.5em;
+    padding-bottom: 0.5em;
+    padding-top: 0.5em;
+    text-align: left;
+}
+th a:link {
+    color: black;
+}
+th a:visited {
+    color: black;
+}
+th a:active {
+    color: black;
+}
diff --git a/website/favicon.ico b/website/favicon.ico
Binary files differ
new file mode 100644
index 0000000..0ea93ea
--- /dev/null
+++ b/website/favicon.ico
diff --git a/website/gen_pocs.sh b/website/gen_pocs.sh
new file mode 100755
index 0000000..a62e112
--- /dev/null
+++ b/website/gen_pocs.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+find . -type f -name 'polyglot.pdf' -delete
+
+mkdir -p ./samples/pdfzip/poc1/
+../truepolyglot pdfzip --pdffile ./samples/pdfzip/poc1/doc.pdf --zipfile ./samples/pdfzip/poc1/archive.zip ./samples/pdfzip/poc1/polyglot.pdf
+
+mkdir -p ./samples/pdfzip/poc2/
+../truepolyglot pdfzip --pdffile ./samples/pdfzip/poc2/orwell_1984.pdf --zipfile ./samples/pdfzip/poc2/file-FILE5_32.zip ./samples/pdfzip/poc2/polyglot.pdf
+
+mkdir -p ./samples/pdfzip/poc3/
+../truepolyglot pdfzip --pdffile ./samples/pdfzip/poc3/x86asm.pdf --zipfile ./samples/pdfzip/poc3/fasmw17304.zip ./samples/pdfzip/poc3/polyglot.pdf
+
+mkdir -p ./samples/zippdf/poc4/
+../truepolyglot zippdf --pdffile ./samples/zippdf/poc4/doc.pdf --zipfile ./samples/zippdf/poc4/archive.zip ./samples/zippdf/poc4/polyglot.pdf
+
+mkdir -p ./samples/szippdf/poc5/
+../truepolyglot szippdf --pdffile ./samples/szippdf/poc5/electronics.pdf --zipfile ./samples/szippdf/poc5/hello_world.jar ./samples/szippdf/poc5/polyglot.pdf
+
+mkdir -p ./samples/pdfzip/poc6/
+../truepolyglot pdfzip --pdffile ./samples/pdfzip/poc6/hexinator.pdf --zipfile ./samples/pdfzip/poc6/eicar.zip ./samples/pdfzip/poc6/polyglot.pdf
diff --git a/website/index.html b/website/index.html
new file mode 100644
index 0000000..c19a2ab
--- /dev/null
+++ b/website/index.html
@@ -0,0 +1,256 @@
+<!doctype html>
+
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+
+  <title>TruePolyglot</title>
+  <meta name="description" content="TruePolyglot project website">
name="description" content="TruePolyglot project website"> + <meta name="author" content="hackade"> + <link rel="stylesheet" href="css/styles.css"> + <link rel="shortcut icon" href="/favicon.ico"> + + +</head> + +<body> + <h1>TruePolyglot</h1> +Truepolyglot is polyglot file generator project. +This means that the generated file is composed of several file formats. The same file can be opened as a ZIP file and as a PDF file for example. +The idea of this project comes from work of <a href="https://github.com/corkami">Ange Albertini</a>, <a href="https://www.alchemistowl.org/pocorgtfo/pocorgtfo07.pdf">International Journal of Proof-of-Concept or Get The Fuck Out</a> and <a href="https://www.troopers.de/wp-content/uploads/2011/04/TR11_Wolf_OMG_PDF.pdf">Julia Wolf</a> that explain how we can build a polyglot file.<br> +Polyglot file can be fastidious to build, even more if you want to respect correctly file format. That's why I decided to build a tool to generate them.<br> +My main motivation was the technical challenge. +<br> + + <h2>Features and changelog</h2> + <div class="font_reduce"> + <table> + <tr> + <th>Description</th> + <th>Version</th> + </tr> + <tr> + <td>Build a polyglot file valid as PDF and ZIP format and that can be opened with 7Zip and Windows Explorer</td> + <td>POC</td> + </tr> + <tr> + <td>Add a stream object in PDF part</td> + <td>POC</td> + </tr> + <tr> + <td>Polyglot file checked without warning with <a href="https://poppler.freedesktop.org/">pdftocairo</a></td> + <td> >= 1.0</td> + </tr> + <tr> + <td>Polyglot file checked without warning with <a href="https://github.com/ANSSI-FR/caradoc">caradoc</a></td> + <td> >= 1.0</td> + </tr> + <tr> + <td>Rebuild PDF Xref Table</td> + <td>>= 1.0</td> + </tr> + <tr> + <td>Stream object with correct length header value</td> + <td>>= 1.0</td> + </tr> + <tr> + <td>Format "zippdf", file without offset after Zip data</td> + <td>>= 1.1</td> + </tr> + <tr> + <td>Polyglot file keep original PDF version</td> + <td>>= 1.1.1</td> + </tr> + <tr> + <td>Add "szippdf" format without offset before and after Zip data</td> + <td>>= 1.2</td> + </tr> + <tr> + <td>Fix /Length stream object value and PDF offset for szippdf format</td> + <td>>= 1.2.1</td> + </tr> + <tr> + <td>PDF object numbers reorder after insertion</td> + <td>>= 1.3</td> + </tr> + </table> + </div> + + <h2>Polyglot file compatibility</h2> + <div class="font_reduce"> + <table> + <tr> + <th>Software</th> + <th>Formats</th> + <th>status</th> + </tr> + <tr> + <td>Acrobat Reader</td> + <td>pdfzip, zippdf</td> + <td>OK</td> + </tr> + <tr> + <td>Acrobat Reader</td> + <td>szippdf</td> + <td><span class="warning">KO</span></td> + </tr> + <tr> + <td>Sumatra PDF</td> + <td>pdfzip, zippdf, szippdf</td> + <td>OK</td> + </tr> + <tr> + <td>Edge</td> + <td>pdfzip, zippdf, szippdf</td> + <td>OK</td> + </tr> + <tr> + <td>Firefox</td> + <td>pdfzip, zippdf, szippdf</td> + <td>OK</td> + </tr> + <tr> + <td>7zip</td> + <td>pdfzip, zippdf</td> + <td><span class="warning">OK with warning</span></td> + </tr> + <tr> + <td>7zip</td> + <td>szippdf</td> + <td>OK</td> + </tr> + <tr> + <td>Explorer Windows</td> + <td>pdfzip, zippdf, szippdf</td> + <td>OK</td> + </tr> + <tr> + <td>Info-ZIP (unzip)</td> + <td>pdfzip, zippdf, szippdf</td> + <td>OK</td> + </tr> + <tr> + <td>Evince</td> + <td>pdfzip, zippdf, szippdf</td> + <td>OK</td> + </tr> + <tr> + <td>pdftocairo -pdf</td> + <td>pdfzip, zippdf, szippdf</td> + <td>OK</td> + </tr> + <tr> + <td>caradoc stats</td> + <td>pdfzip</td> + <td>OK</td> + </tr> + <tr> + 
+      <td>java</td>
+      <td>szippdf</td>
+      <td>OK</td>
+    </tr>
+
+  </table>
+  </div>
+
+
+  <h2>Examples</h2>
+  <ul>
+    <li><a href="/samples/">Polyglot files repository</a></li>
+  </ul>
+  <div class="font_reduce">
+  <table>
+    <tr>
+      <th>PDF input file</th>
+      <th>Zip input file</th>
+      <th>Format</th>
+      <th>Polyglot</th>
+      <th>Comment</th>
+    </tr>
+    <tr>
+      <td><a href="/samples/pdfzip/poc1/doc.pdf">doc.pdf</a></td>
+      <td><a href="/samples/pdfzip/poc1/archive.zip">archive.zip</a></td>
+      <td>pdfzip</td>
+      <td><a href="/samples/pdfzip/poc1/polyglot.pdf">polyglot.pdf</a></td>
+      <td>PDF/ZIP polyglot - 122 KB</td>
+    </tr>
+    <tr>
+      <td><a href="/samples/pdfzip/poc2/orwell_1984.pdf">orwell_1984.pdf</a></td>
+      <td><a href="/samples/pdfzip/poc2/file-FILE5_32.zip">file-FILE5_32.zip</a></td>
+      <td>pdfzip</td>
+      <td><a href="/samples/pdfzip/poc2/polyglot.pdf">polyglot.pdf</a></td>
+      <td>PDF/ZIP polyglot - 1.3 MB</td>
+    </tr>
+    <tr>
+      <td><a href="/samples/pdfzip/poc3/x86asm.pdf">x86asm.pdf</a></td>
+      <td><a href="/samples/pdfzip/poc3/fasmw17304.zip">fasmw17304.zip</a></td>
+      <td>pdfzip</td>
+      <td><a href="/samples/pdfzip/poc3/polyglot.pdf">polyglot.pdf</a></td>
+      <td>PDF/ZIP polyglot - 1.8 MB</td>
+    </tr>
+    <tr>
+      <td><a href="/samples/zippdf/poc4/doc.pdf">doc.pdf</a></td>
+      <td><a href="/samples/zippdf/poc4/archive.zip">archive.zip</a></td>
+      <td>zippdf</td>
+      <td><a href="/samples/zippdf/poc4/polyglot.pdf">polyglot.pdf</a></td>
+      <td>PDF/ZIP polyglot - 112 KB</td>
+    </tr>
+    <tr>
+      <td><a href="/samples/szippdf/poc5/electronics.pdf">electronics.pdf</a></td>
+      <td><a href="/samples/szippdf/poc5/hello_world.jar">hello_world.jar</a></td>
+      <td>szippdf</td>
+      <td><a href="/samples/szippdf/poc5/polyglot.pdf">polyglot.pdf</a></td>
+      <td>PDF/JAR polyglot - 778 KB</td>
+    </tr>
+    <tr>
+      <td><a href="/samples/pdfzip/poc6/hexinator.pdf">hexinator.pdf</a></td>
+      <td><a href="/samples/pdfzip/poc6/eicar.zip">eicar.zip</a> (<a href="https://www.virustotal.com/#/file/2174e17e6b03bb398666c128e6ab0a27d4ad6f7d7922127fe828e07aa94ab79d/detection">scan virustotal.com</a>)</td>
+      <td>pdfzip</td>
+      <td><a href="/samples/pdfzip/poc6/polyglot.pdf">polyglot.pdf</a> (<a href="https://www.virustotal.com/#/file/883d08efc14e0cacc9a260d84fdef285b383cc9a9125366dfb0bf676ddeb0f98/detection">scan virustotal.com</a>)</td>
+      <td>PDF/ZIP polyglot with the Eicar test file in the ZIP - 2.9 MB</td>
+    </tr>
+  </table>
+  </div>
+
+  <h2>Manual</h2>
+<pre>
+usage: truepolyglot format [options] output-file
+
+Generate a polyglot file.
+
+Available formats:
+* pdfzip: Generate a file valid as PDF and ZIP. The format is closest to PDF.
+* zippdf: Generate a file valid as ZIP and PDF. The format is closest to ZIP.
+* szippdf: Generate a file valid as ZIP and PDF. The format is strictly a ZIP. The archive is modified.
+
+positional arguments:
+  {pdfzip,zippdf,szippdf}
+                        Output polyglot format
+  output_file           Output polyglot file path
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --pdffile PDFFILE     PDF input file
+  --zipfile ZIPFILE     ZIP input file
+  --verbose {none,error,info,debug}
+                        Verbosity level (default: info)
+
+TruePolyglot v1.3
+</pre>
+See the example invocation at the end of this page.
+
+  <h2>Code</h2>
+
+<a href="https://git.hackade.org/truepolyglot.git/">Project Git repository</a>
+
+  <h2>Contact</h2>
+On <a href="https://webchat.freenode.net/">IRC Freenode</a> my nickname is hackade, or you can reach me by mail at <a href="mailto:truepolyglot@hackade.org">truepolyglot@hackade.org</a>.
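+
+  <h2>Example invocation</h2>
+A typical run, mirroring the commands in gen_pocs.sh (the file names below are placeholders):
+<pre>
+truepolyglot pdfzip --pdffile doc.pdf --zipfile archive.zip polyglot.pdf
+</pre>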
+ +</body> +</html> diff --git a/website/robots.txt b/website/robots.txt new file mode 100644 index 0000000..77470cb --- /dev/null +++ b/website/robots.txt @@ -0,0 +1,2 @@ +User-agent: * +Disallow: /
\ No newline at end of file
diff --git a/website/samples/pdfzip/poc1/archive.zip b/website/samples/pdfzip/poc1/archive.zip
Binary files differ
new file mode 100644
index 0000000..c1eaa24
--- /dev/null
+++ b/website/samples/pdfzip/poc1/archive.zip
diff --git a/website/samples/pdfzip/poc1/doc.pdf b/website/samples/pdfzip/poc1/doc.pdf
Binary files differ
new file mode 100644
index 0000000..b31c420
--- /dev/null
+++ b/website/samples/pdfzip/poc1/doc.pdf
diff --git a/website/samples/pdfzip/poc1/polyglot.pdf b/website/samples/pdfzip/poc1/polyglot.pdf
Binary files differ
new file mode 100644
index 0000000..5943e1c
--- /dev/null
+++ b/website/samples/pdfzip/poc1/polyglot.pdf
diff --git a/website/samples/pdfzip/poc2/file-FILE5_32.zip b/website/samples/pdfzip/poc2/file-FILE5_32.zip
Binary files differ
new file mode 100644
index 0000000..8a4b897
--- /dev/null
+++ b/website/samples/pdfzip/poc2/file-FILE5_32.zip
diff --git a/website/samples/pdfzip/poc2/orwell_1984.pdf b/website/samples/pdfzip/poc2/orwell_1984.pdf
Binary files differ
new file mode 100644
index 0000000..687d12a
--- /dev/null
+++ b/website/samples/pdfzip/poc2/orwell_1984.pdf
diff --git a/website/samples/pdfzip/poc2/polyglot.pdf b/website/samples/pdfzip/poc2/polyglot.pdf
Binary files differ
new file mode 100644
index 0000000..497a427
--- /dev/null
+++ b/website/samples/pdfzip/poc2/polyglot.pdf
diff --git a/website/samples/pdfzip/poc3/fasmw17304.zip b/website/samples/pdfzip/poc3/fasmw17304.zip
Binary files differ
new file mode 100644
index 0000000..8e96137
--- /dev/null
+++ b/website/samples/pdfzip/poc3/fasmw17304.zip
diff --git a/website/samples/pdfzip/poc3/polyglot.pdf b/website/samples/pdfzip/poc3/polyglot.pdf
Binary files differ
new file mode 100644
index 0000000..a82491b
--- /dev/null
+++ b/website/samples/pdfzip/poc3/polyglot.pdf
diff --git a/website/samples/pdfzip/poc3/x86asm.pdf b/website/samples/pdfzip/poc3/x86asm.pdf
Binary files differ
new file mode 100644
index 0000000..b7b9f4a
--- /dev/null
+++ b/website/samples/pdfzip/poc3/x86asm.pdf
diff --git a/website/samples/pdfzip/poc6/eicar.zip b/website/samples/pdfzip/poc6/eicar.zip
Binary files differ
new file mode 100644
index 0000000..02850ca
--- /dev/null
+++ b/website/samples/pdfzip/poc6/eicar.zip
diff --git a/website/samples/pdfzip/poc6/hexinator.pdf b/website/samples/pdfzip/poc6/hexinator.pdf
Binary files differ
new file mode 100644
index 0000000..0f87b0f
--- /dev/null
+++ b/website/samples/pdfzip/poc6/hexinator.pdf
diff --git a/website/samples/pdfzip/poc6/polyglot.pdf b/website/samples/pdfzip/poc6/polyglot.pdf
Binary files differ
new file mode 100644
index 0000000..7957207
--- /dev/null
+++ b/website/samples/pdfzip/poc6/polyglot.pdf
diff --git a/website/samples/szippdf/poc5/electronics.pdf b/website/samples/szippdf/poc5/electronics.pdf
Binary files differ
new file mode 100644
index 0000000..6582363
--- /dev/null
+++ b/website/samples/szippdf/poc5/electronics.pdf
diff --git a/website/samples/szippdf/poc5/hello_world.jar b/website/samples/szippdf/poc5/hello_world.jar
Binary files differ
new file mode 100644
index 0000000..b875e1c
--- /dev/null
+++ b/website/samples/szippdf/poc5/hello_world.jar
diff --git a/website/samples/szippdf/poc5/polyglot.pdf b/website/samples/szippdf/poc5/polyglot.pdf
Binary files differ
new file mode 100644
index 0000000..7733caf
--- /dev/null
+++ b/website/samples/szippdf/poc5/polyglot.pdf
diff --git a/website/samples/zippdf/poc4/archive.zip b/website/samples/zippdf/poc4/archive.zip
Binary files differ
new file mode 100644
index 0000000..c1eaa24
--- /dev/null
+++ b/website/samples/zippdf/poc4/archive.zip
diff --git a/website/samples/zippdf/poc4/doc.pdf b/website/samples/zippdf/poc4/doc.pdf
Binary files differ
new file mode 100644
index 0000000..b31c420
--- /dev/null
+++ b/website/samples/zippdf/poc4/doc.pdf
diff --git a/website/samples/zippdf/poc4/polyglot.pdf b/website/samples/zippdf/poc4/polyglot.pdf
Binary files differ
new file mode 100644
index 0000000..0993dd4
--- /dev/null
+++ b/website/samples/zippdf/poc4/polyglot.pdf
diff --git a/website/start_server.sh b/website/start_server.sh
new file mode 100755
index 0000000..0060e39
--- /dev/null
+++ b/website/start_server.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+echo "http://127.0.0.1:8000"
+python3 -m http.server 8000
diff --git a/website/update.sh b/website/update.sh
new file mode 100755
index 0000000..ed7f515
--- /dev/null
+++ b/website/update.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+rsync -av --progress -e ssh ./ dragon:/var/www/html/truepolyglot/
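
For reference, the library-level flow that the truepolyglot script and the tests above wire together, as a minimal sketch (file paths are placeholders; the classes are the ones introduced by this commit):

    #!/usr/bin/env python3
    from PdfFileTransformer import Pdf
    from ZipFileTransformer import Zip
    from PolyglotFile import PolyglotPdfZip  # or PolyglotZipPdf / PolyglotSZipPdf

    p = Pdf("input.pdf")         # parse the input PDF
    z = Zip("input.zip")         # parse the input ZIP
    poly = PolyglotPdfZip(p, z)  # pick the class matching the desired format
    poly.generate()              # build the combined file in memory
    poly.write("polyglot.pdf")   # write the polyglot to disk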