diff options
Diffstat (limited to 'PdfFileTransformer/PyPDF2/generic.py')
-rw-r--r-- | PdfFileTransformer/PyPDF2/generic.py | 1228 |
1 files changed, 1228 insertions, 0 deletions
diff --git a/PdfFileTransformer/PyPDF2/generic.py b/PdfFileTransformer/PyPDF2/generic.py new file mode 100644 index 0000000..959957d --- /dev/null +++ b/PdfFileTransformer/PyPDF2/generic.py @@ -0,0 +1,1228 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +""" +Implementation of generic PDF objects (dictionary, number, string, and so on) +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +import re +from .utils import readNonWhitespace, RC4_encrypt, skipOverComment +from .utils import b_, u_, chr_, ord_ +from .utils import PdfStreamError +import warnings +from . import filters +from . import utils +import decimal +import codecs +import sys +#import debugging + +ObjectPrefix = b_('/<[tf(n%') +NumberSigns = b_('+-') +IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]")) + + +def readObject(stream, pdf): + tok = stream.read(1) + stream.seek(-1, 1) # reset to start + idx = ObjectPrefix.find(tok) + if idx == 0: + # name object + return NameObject.readFromStream(stream, pdf) + elif idx == 1: + # hexadecimal string OR dictionary + peek = stream.read(2) + stream.seek(-2, 1) # reset to start + if peek == b_('<<'): + return DictionaryObject.readFromStream(stream, pdf) + else: + return readHexStringFromStream(stream) + elif idx == 2: + # array object + return ArrayObject.readFromStream(stream, pdf) + elif idx == 3 or idx == 4: + # boolean object + return BooleanObject.readFromStream(stream) + elif idx == 5: + # string object + return readStringFromStream(stream) + elif idx == 6: + # null object + return NullObject.readFromStream(stream) + elif idx == 7: + # comment + while tok not in (b_('\r'), b_('\n')): + tok = stream.read(1) + # Prevents an infinite loop by raising an error if the stream is at + # the EOF + if len(tok) <= 0: + raise PdfStreamError("File ended unexpectedly.") + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + return readObject(stream, pdf) + else: + # number object OR indirect reference + peek = stream.read(20) + stream.seek(-len(peek), 1) # reset to start + if IndirectPattern.match(peek) != None: + return IndirectObject.readFromStream(stream, pdf) + else: + return NumberObject.readFromStream(stream) + + +class PdfObject(object): + def getObject(self): + """Resolves indirect references.""" + return self + + +class NullObject(PdfObject): + def writeToStream(self, stream, encryption_key): + stream.write(b_("null")) + + def readFromStream(stream): + nulltxt = stream.read(4) + if nulltxt != b_("null"): + raise utils.PdfReadError("Could not read Null object") + return NullObject() + readFromStream = staticmethod(readFromStream) + + +class BooleanObject(PdfObject): + def __init__(self, value): + self.value = value + + def writeToStream(self, stream, encryption_key): + if self.value: + stream.write(b_("true")) + else: + stream.write(b_("false")) + + def readFromStream(stream): + word = stream.read(4) + if word == b_("true"): + return BooleanObject(True) + elif word == b_("fals"): + stream.read(1) + return BooleanObject(False) + else: + raise utils.PdfReadError('Could not read Boolean object') + readFromStream = staticmethod(readFromStream) + + +class ArrayObject(list, PdfObject): + def writeToStream(self, stream, encryption_key): + stream.write(b_("[")) + for data in self: + stream.write(b_(" ")) + data.writeToStream(stream, encryption_key) + stream.write(b_(" ]")) + + def readFromStream(stream, pdf): + arr = ArrayObject() + tmp = stream.read(1) + if tmp != b_("["): + raise utils.PdfReadError("Could not read array") + while True: + # skip leading whitespace + tok = stream.read(1) + while tok.isspace(): + tok = stream.read(1) + stream.seek(-1, 1) + # check for array ending + peekahead = stream.read(1) + if peekahead == b_("]"): + break + stream.seek(-1, 1) + # read and append obj + arr.append(readObject(stream, pdf)) + return arr + readFromStream = staticmethod(readFromStream) + + +class IndirectObject(PdfObject): + def __init__(self, idnum, generation, pdf): + self.idnum = idnum + self.generation = generation + self.pdf = pdf + + def getObject(self): + return self.pdf.getObject(self).getObject() + + def __repr__(self): + return "IndirectObject(%r, %r)" % (self.idnum, self.generation) + + def __eq__(self, other): + return ( + other != None and + isinstance(other, IndirectObject) and + self.idnum == other.idnum and + self.generation == other.generation and + self.pdf is other.pdf + ) + + def __ne__(self, other): + return not self.__eq__(other) + + def writeToStream(self, stream, encryption_key): + stream.write(b_("%s %s R" % (self.idnum, self.generation))) + + def readFromStream(stream, pdf): + idnum = b_("") + while True: + tok = stream.read(1) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok.isspace(): + break + idnum += tok + generation = b_("") + while True: + tok = stream.read(1) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok.isspace(): + if not generation: + continue + break + generation += tok + r = readNonWhitespace(stream) + if r != b_("R"): + raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell())) + return IndirectObject(int(idnum), int(generation), pdf) + readFromStream = staticmethod(readFromStream) + + +class FloatObject(decimal.Decimal, PdfObject): + def __new__(cls, value="0", context=None): + try: + return decimal.Decimal.__new__(cls, utils.str_(value), context) + except: + return decimal.Decimal.__new__(cls, str(value)) + + def __repr__(self): + if self == self.to_integral(): + return str(self.quantize(decimal.Decimal(1))) + else: + # Standard formatting adds useless extraneous zeros. + o = "%.5f" % self + # Remove the zeros. + while o and o[-1] == '0': + o = o[:-1] + return o + + def as_numeric(self): + return float(b_(repr(self))) + + def writeToStream(self, stream, encryption_key): + stream.write(b_(repr(self))) + + +class NumberObject(int, PdfObject): + NumberPattern = re.compile(b_('[^+-.0-9]')) + ByteDot = b_(".") + + def __new__(cls, value): + val = int(value) + try: + return int.__new__(cls, val) + except OverflowError: + return int.__new__(cls, 0) + + def as_numeric(self): + return int(b_(repr(self))) + + def writeToStream(self, stream, encryption_key): + stream.write(b_(repr(self))) + + def readFromStream(stream): + num = utils.readUntilRegex(stream, NumberObject.NumberPattern) + if num.find(NumberObject.ByteDot) != -1: + return FloatObject(num) + else: + return NumberObject(num) + readFromStream = staticmethod(readFromStream) + + +## +# Given a string (either a "str" or "unicode"), create a ByteStringObject or a +# TextStringObject to represent the string. +def createStringObject(string): + if isinstance(string, utils.string_type): + return TextStringObject(string) + elif isinstance(string, utils.bytes_type): + try: + if string.startswith(codecs.BOM_UTF16_BE): + retval = TextStringObject(string.decode("utf-16")) + retval.autodetect_utf16 = True + return retval + else: + # This is probably a big performance hit here, but we need to + # convert string objects into the text/unicode-aware version if + # possible... and the only way to check if that's possible is + # to try. Some strings are strings, some are just byte arrays. + retval = TextStringObject(decode_pdfdocencoding(string)) + retval.autodetect_pdfdocencoding = True + return retval + except UnicodeDecodeError: + return ByteStringObject(string) + else: + raise TypeError("createStringObject should have str or unicode arg") + + +def readHexStringFromStream(stream): + stream.read(1) + txt = "" + x = b_("") + while True: + tok = readNonWhitespace(stream) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok == b_(">"): + break + x += tok + if len(x) == 2: + txt += chr(int(x, base=16)) + x = b_("") + if len(x) == 1: + x += b_("0") + if len(x) == 2: + txt += chr(int(x, base=16)) + return createStringObject(b_(txt)) + + +def readStringFromStream(stream): + tok = stream.read(1) + parens = 1 + txt = b_("") + while True: + tok = stream.read(1) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok == b_("("): + parens += 1 + elif tok == b_(")"): + parens -= 1 + if parens == 0: + break + elif tok == b_("\\"): + tok = stream.read(1) + ESCAPE_DICT = {b_("n") : b_("\n"), + b_("r") : b_("\r"), + b_("t") : b_("\t"), + b_("b") : b_("\b"), + b_("f") : b_("\f"), + b_("c") : b_("\c"), + b_("(") : b_("("), + b_(")") : b_(")"), + b_("/") : b_("/"), + b_("\\") : b_("\\"), + b_(" ") : b_(" "), + b_("/") : b_("/"), + b_("%") : b_("%"), + b_("<") : b_("<"), + b_(">") : b_(">"), + b_("[") : b_("["), + b_("]") : b_("]"), + b_("#") : b_("#"), + b_("_") : b_("_"), + b_("&") : b_("&"), + b_('$') : b_('$'), + } + try: + tok = ESCAPE_DICT[tok] + except KeyError: + if tok.isdigit(): + # "The number ddd may consist of one, two, or three + # octal digits; high-order overflow shall be ignored. + # Three octal digits shall be used, with leading zeros + # as needed, if the next character of the string is also + # a digit." (PDF reference 7.3.4.2, p 16) + for i in range(2): + ntok = stream.read(1) + if ntok.isdigit(): + tok += ntok + else: + break + tok = b_(chr(int(tok, base=8))) + elif tok in b_("\n\r"): + # This case is hit when a backslash followed by a line + # break occurs. If it's a multi-char EOL, consume the + # second character: + tok = stream.read(1) + if not tok in b_("\n\r"): + stream.seek(-1, 1) + # Then don't add anything to the actual string, since this + # line break was escaped: + tok = b_('') + else: + raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok) + txt += tok + return createStringObject(txt) + + +## +# Represents a string object where the text encoding could not be determined. +# This occurs quite often, as the PDF spec doesn't provide an alternate way to +# represent strings -- for example, the encryption data stored in files (like +# /O) is clearly not text, but is still stored in a "String" object. +class ByteStringObject(utils.bytes_type, PdfObject): + + ## + # For compatibility with TextStringObject.original_bytes. This method + # returns self. + original_bytes = property(lambda self: self) + + def writeToStream(self, stream, encryption_key): + bytearr = self + if encryption_key: + bytearr = RC4_encrypt(encryption_key, bytearr) + stream.write(b_("<")) + stream.write(utils.hexencode(bytearr)) + stream.write(b_(">")) + + +## +# Represents a string object that has been decoded into a real unicode string. +# If read from a PDF document, this string appeared to match the +# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to +# occur. +class TextStringObject(utils.string_type, PdfObject): + autodetect_pdfdocencoding = False + autodetect_utf16 = False + + ## + # It is occasionally possible that a text string object gets created where + # a byte string object was expected due to the autodetection mechanism -- + # if that occurs, this "original_bytes" property can be used to + # back-calculate what the original encoded bytes were. + original_bytes = property(lambda self: self.get_original_bytes()) + + def get_original_bytes(self): + # We're a text string object, but the library is trying to get our raw + # bytes. This can happen if we auto-detected this string as text, but + # we were wrong. It's pretty common. Return the original bytes that + # would have been used to create this object, based upon the autodetect + # method. + if self.autodetect_utf16: + return codecs.BOM_UTF16_BE + self.encode("utf-16be") + elif self.autodetect_pdfdocencoding: + return encode_pdfdocencoding(self) + else: + raise Exception("no information about original bytes") + + def writeToStream(self, stream, encryption_key): + # Try to write the string out as a PDFDocEncoding encoded string. It's + # nicer to look at in the PDF file. Sadly, we take a performance hit + # here for trying... + try: + bytearr = encode_pdfdocencoding(self) + except UnicodeEncodeError: + bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") + if encryption_key: + bytearr = RC4_encrypt(encryption_key, bytearr) + obj = ByteStringObject(bytearr) + obj.writeToStream(stream, None) + else: + stream.write(b_("(")) + for c in bytearr: + if not chr_(c).isalnum() and c != b_(' '): + stream.write(b_("\\%03o" % ord_(c))) + else: + stream.write(b_(chr_(c))) + stream.write(b_(")")) + + +class NameObject(str, PdfObject): + delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) + surfix = b_("/") + + def writeToStream(self, stream, encryption_key): + stream.write(b_(self)) + + def readFromStream(stream, pdf): + debug = False + if debug: print((stream.tell())) + name = stream.read(1) + if name != NameObject.surfix: + raise utils.PdfReadError("name read error") + name += utils.readUntilRegex(stream, NameObject.delimiterPattern, + ignore_eof=True) + if debug: print(name) + try: + return NameObject(name.decode('utf-8')) + except (UnicodeEncodeError, UnicodeDecodeError) as e: + # Name objects should represent irregular characters + # with a '#' followed by the symbol's hex number + if not pdf.strict: + warnings.warn("Illegal character in Name Object", utils.PdfReadWarning) + return NameObject(name) + else: + raise utils.PdfReadError("Illegal character in Name Object") + + readFromStream = staticmethod(readFromStream) + + +class DictionaryObject(dict, PdfObject): + def raw_get(self, key): + return dict.__getitem__(self, key) + + def __setitem__(self, key, value): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.__setitem__(self, key, value) + + def setdefault(self, key, value=None): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.setdefault(self, key, value) + + def __getitem__(self, key): + return dict.__getitem__(self, key).getObject() + + ## + # Retrieves XMP (Extensible Metadata Platform) data relevant to the + # this object, if available. + # <p> + # Stability: Added in v1.12, will exist for all future v1.x releases. + # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance + # that can be used to access XMP metadata from the document. Can also + # return None if no metadata was found on the document root. + def getXmpMetadata(self): + metadata = self.get("/Metadata", None) + if metadata == None: + return None + metadata = metadata.getObject() + from . import xmp + if not isinstance(metadata, xmp.XmpInformation): + metadata = xmp.XmpInformation(metadata) + self[NameObject("/Metadata")] = metadata + return metadata + + ## + # Read-only property that accesses the {@link + # #DictionaryObject.getXmpData getXmpData} function. + # <p> + # Stability: Added in v1.12, will exist for all future v1.x releases. + xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) + + def writeToStream(self, stream, encryption_key): + stream.write(b_("<<\n")) + for key, value in list(self.items()): + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value.writeToStream(stream, encryption_key) + stream.write(b_("\n")) + stream.write(b_(">>")) + + def readFromStream(stream, pdf): + debug = False + tmp = stream.read(2) + if tmp != b_("<<"): + raise utils.PdfReadError("Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell())) + data = {} + while True: + tok = readNonWhitespace(stream) + if tok == b_('\x00'): + continue + elif tok == b_('%'): + stream.seek(-1, 1) + skipOverComment(stream) + continue + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + + if debug: print(("Tok:", tok)) + if tok == b_(">"): + stream.read(1) + break + stream.seek(-1, 1) + key = readObject(stream, pdf) + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + value = readObject(stream, pdf) + if not data.get(key): + data[key] = value + elif pdf.strict: + # multiple definitions of key not permitted + raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \ + % (utils.hexStr(stream.tell()), key)) + else: + warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \ + % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning) + + pos = stream.tell() + s = readNonWhitespace(stream) + if s == b_('s') and stream.read(5) == b_('tream'): + eol = stream.read(1) + # odd PDF file output has spaces after 'stream' keyword but before EOL. + # patch provided by Danial Sandler + while eol == b_(' '): + eol = stream.read(1) + assert eol in (b_("\n"), b_("\r")) + if eol == b_("\r"): + # read \n after + if stream.read(1) != b_('\n'): + stream.seek(-1, 1) + # this is a stream object, not a dictionary + assert "/Length" in data + length = data["/Length"] + if debug: print(data) + if isinstance(length, IndirectObject): + t = stream.tell() + length = pdf.getObject(length) + stream.seek(t, 0) + data["__streamdata__"] = stream.read(length) + if debug: print("here") + #if debug: print(binascii.hexlify(data["__streamdata__"])) + e = readNonWhitespace(stream) + ndstream = stream.read(8) + if (e + ndstream) != b_("endstream"): + # (sigh) - the odd PDF file has a length that is too long, so + # we need to read backwards to find the "endstream" ending. + # ReportLab (unknown version) generates files with this bug, + # and Python users into PDF files tend to be our audience. + # we need to do this to correct the streamdata and chop off + # an extra character. + pos = stream.tell() + stream.seek(-10, 1) + end = stream.read(9) + if end == b_("endstream"): + # we found it by looking back one character further. + data["__streamdata__"] = data["__streamdata__"][:-1] + else: + if debug: print(("E", e, ndstream, debugging.toHex(end))) + stream.seek(pos, 0) + raise utils.PdfReadError("Unable to find 'endstream' marker after stream at byte %s." % utils.hexStr(stream.tell())) + else: + stream.seek(pos, 0) + if "__streamdata__" in data: + return StreamObject.initializeFromDictionary(data) + else: + retval = DictionaryObject() + retval.update(data) + return retval + readFromStream = staticmethod(readFromStream) + + +class TreeObject(DictionaryObject): + def __init__(self): + DictionaryObject.__init__(self) + + def hasChildren(self): + return '/First' in self + + def __iter__(self): + return self.children() + + def children(self): + if not self.hasChildren(): + raise StopIteration + + child = self['/First'] + while True: + yield child + if child == self['/Last']: + raise StopIteration + child = child['/Next'] + + def addChild(self, child, pdf): + childObj = child.getObject() + child = pdf.getReference(childObj) + assert isinstance(child, IndirectObject) + + if '/First' not in self: + self[NameObject('/First')] = child + self[NameObject('/Count')] = NumberObject(0) + prev = None + else: + prev = self['/Last'] + + self[NameObject('/Last')] = child + self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] + 1) + + if prev: + prevRef = pdf.getReference(prev) + assert isinstance(prevRef, IndirectObject) + childObj[NameObject('/Prev')] = prevRef + prev[NameObject('/Next')] = child + + parentRef = pdf.getReference(self) + assert isinstance(parentRef, IndirectObject) + childObj[NameObject('/Parent')] = parentRef + + def removeChild(self, child): + childObj = child.getObject() + + if NameObject('/Parent') not in childObj: + raise ValueError("Removed child does not appear to be a tree item") + elif childObj[NameObject('/Parent')] != self: + raise ValueError("Removed child is not a member of this tree") + + found = False + prevRef = None + prev = None + curRef = self[NameObject('/First')] + cur = curRef.getObject() + lastRef = self[NameObject('/Last')] + last = lastRef.getObject() + while cur != None: + if cur == childObj: + if prev == None: + if NameObject('/Next') in cur: + # Removing first tree node + nextRef = cur[NameObject('/Next')] + next = nextRef.getObject() + del next[NameObject('/Prev')] + self[NameObject('/First')] = nextRef + self[NameObject('/Count')] = self[NameObject('/Count')] - 1 + + else: + # Removing only tree node + assert self[NameObject('/Count')] == 1 + del self[NameObject('/Count')] + del self[NameObject('/First')] + if NameObject('/Last') in self: + del self[NameObject('/Last')] + else: + if NameObject('/Next') in cur: + # Removing middle tree node + nextRef = cur[NameObject('/Next')] + next = nextRef.getObject() + next[NameObject('/Prev')] = prevRef + prev[NameObject('/Next')] = nextRef + self[NameObject('/Count')] = self[NameObject('/Count')] - 1 + else: + # Removing last tree node + assert cur == last + del prev[NameObject('/Next')] + self[NameObject('/Last')] = prevRef + self[NameObject('/Count')] = self[NameObject('/Count')] - 1 + found = True + break + + prevRef = curRef + prev = cur + if NameObject('/Next') in cur: + curRef = cur[NameObject('/Next')] + cur = curRef.getObject() + else: + curRef = None + cur = None + + if not found: + raise ValueError("Removal couldn't find item in tree") + + del childObj[NameObject('/Parent')] + if NameObject('/Next') in childObj: + del childObj[NameObject('/Next')] + if NameObject('/Prev') in childObj: + del childObj[NameObject('/Prev')] + + def emptyTree(self): + for child in self: + childObj = child.getObject() + del childObj[NameObject('/Parent')] + if NameObject('/Next') in childObj: + del childObj[NameObject('/Next')] + if NameObject('/Prev') in childObj: + del childObj[NameObject('/Prev')] + + if NameObject('/Count') in self: + del self[NameObject('/Count')] + if NameObject('/First') in self: + del self[NameObject('/First')] + if NameObject('/Last') in self: + del self[NameObject('/Last')] + + +class StreamObject(DictionaryObject): + def __init__(self): + self._data = None + self.decodedSelf = None + + def writeToStream(self, stream, encryption_key): + self[NameObject("/Length")] = NumberObject(len(self._data)) + DictionaryObject.writeToStream(self, stream, encryption_key) + del self["/Length"] + stream.write(b_("\nstream\n")) + data = self._data + if encryption_key: + data = RC4_encrypt(encryption_key, data) + stream.write(data) + stream.write(b_("\nendstream")) + + def initializeFromDictionary(data): + if "/Filter" in data: + retval = EncodedStreamObject() + else: + retval = DecodedStreamObject() + retval._data = data["__streamdata__"] + del data["__streamdata__"] + del data["/Length"] + retval.update(data) + return retval + initializeFromDictionary = staticmethod(initializeFromDictionary) + + def flateEncode(self): + if "/Filter" in self: + f = self["/Filter"] + if isinstance(f, ArrayObject): + f.insert(0, NameObject("/FlateDecode")) + else: + newf = ArrayObject() + newf.append(NameObject("/FlateDecode")) + newf.append(f) + f = newf + else: + f = NameObject("/FlateDecode") + retval = EncodedStreamObject() + retval[NameObject("/Filter")] = f + retval._data = filters.FlateDecode.encode(self._data) + return retval + + +class DecodedStreamObject(StreamObject): + def getData(self): + return self._data + + def setData(self, data): + self._data = data + + +class EncodedStreamObject(StreamObject): + def __init__(self): + self.decodedSelf = None + + def getData(self): + if self.decodedSelf: + # cached version of decoded object + return self.decodedSelf.getData() + else: + # create decoded object + decoded = DecodedStreamObject() + + decoded._data = filters.decodeStreamData(self) + for key, value in list(self.items()): + if not key in ("/Length", "/Filter", "/DecodeParms"): + decoded[key] = value + self.decodedSelf = decoded + return decoded._data + + def setData(self, data): + raise utils.PdfReadError("Creating EncodedStreamObject is not currently supported") + + +class RectangleObject(ArrayObject): + """ + This class is used to represent *page boxes* in PyPDF2. These boxes include: + + * :attr:`artBox <PyPDF2.pdf.PageObject.artBox>` + * :attr:`bleedBox <PyPDF2.pdf.PageObject.bleedBox>` + * :attr:`cropBox <PyPDF2.pdf.PageObject.cropBox>` + * :attr:`mediaBox <PyPDF2.pdf.PageObject.mediaBox>` + * :attr:`trimBox <PyPDF2.pdf.PageObject.trimBox>` + """ + def __init__(self, arr): + # must have four points + assert len(arr) == 4 + # automatically convert arr[x] into NumberObject(arr[x]) if necessary + ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr]) + + def ensureIsNumber(self, value): + if not isinstance(value, (NumberObject, FloatObject)): + value = FloatObject(value) + return value + + def __repr__(self): + return "RectangleObject(%s)" % repr(list(self)) + + def getLowerLeft_x(self): + return self[0] + + def getLowerLeft_y(self): + return self[1] + + def getUpperRight_x(self): + return self[2] + + def getUpperRight_y(self): + return self[3] + + def getUpperLeft_x(self): + return self.getLowerLeft_x() + + def getUpperLeft_y(self): + return self.getUpperRight_y() + + def getLowerRight_x(self): + return self.getUpperRight_x() + + def getLowerRight_y(self): + return self.getLowerLeft_y() + + def getLowerLeft(self): + return self.getLowerLeft_x(), self.getLowerLeft_y() + + def getLowerRight(self): + return self.getLowerRight_x(), self.getLowerRight_y() + + def getUpperLeft(self): + return self.getUpperLeft_x(), self.getUpperLeft_y() + + def getUpperRight(self): + return self.getUpperRight_x(), self.getUpperRight_y() + + def setLowerLeft(self, value): + self[0], self[1] = [self.ensureIsNumber(x) for x in value] + + def setLowerRight(self, value): + self[2], self[1] = [self.ensureIsNumber(x) for x in value] + + def setUpperLeft(self, value): + self[0], self[3] = [self.ensureIsNumber(x) for x in value] + + def setUpperRight(self, value): + self[2], self[3] = [self.ensureIsNumber(x) for x in value] + + def getWidth(self): + return self.getUpperRight_x() - self.getLowerLeft_x() + + def getHeight(self): + return self.getUpperRight_y() - self.getLowerLeft_y() + + lowerLeft = property(getLowerLeft, setLowerLeft, None, None) + """ + Property to read and modify the lower left coordinate of this box + in (x,y) form. + """ + lowerRight = property(getLowerRight, setLowerRight, None, None) + """ + Property to read and modify the lower right coordinate of this box + in (x,y) form. + """ + upperLeft = property(getUpperLeft, setUpperLeft, None, None) + """ + Property to read and modify the upper left coordinate of this box + in (x,y) form. + """ + upperRight = property(getUpperRight, setUpperRight, None, None) + """ + Property to read and modify the upper right coordinate of this box + in (x,y) form. + """ + + +class Field(TreeObject): + """ + A class representing a field dictionary. This class is accessed through + :meth:`getFields()<PyPDF2.PdfFileReader.getFields>` + """ + def __init__(self, data): + DictionaryObject.__init__(self) + attributes = ("/FT", "/Parent", "/Kids", "/T", "/TU", "/TM", "/Ff", + "/V", "/DV", "/AA") + for attr in attributes: + try: + self[NameObject(attr)] = data[attr] + except KeyError: + pass + + fieldType = property(lambda self: self.get("/FT")) + """ + Read-only property accessing the type of this field. + """ + + parent = property(lambda self: self.get("/Parent")) + """ + Read-only property accessing the parent of this field. + """ + + kids = property(lambda self: self.get("/Kids")) + """ + Read-only property accessing the kids of this field. + """ + + name = property(lambda self: self.get("/T")) + """ + Read-only property accessing the name of this field. + """ + + altName = property(lambda self: self.get("/TU")) + """ + Read-only property accessing the alternate name of this field. + """ + + mappingName = property(lambda self: self.get("/TM")) + """ + Read-only property accessing the mapping name of this field. This + name is used by PyPDF2 as a key in the dictionary returned by + :meth:`getFields()<PyPDF2.PdfFileReader.getFields>` + """ + + flags = property(lambda self: self.get("/Ff")) + """ + Read-only property accessing the field flags, specifying various + characteristics of the field (see Table 8.70 of the PDF 1.7 reference). + """ + + value = property(lambda self: self.get("/V")) + """ + Read-only property accessing the value of this field. Format + varies based on field type. + """ + + defaultValue = property(lambda self: self.get("/DV")) + """ + Read-only property accessing the default value of this field. + """ + + additionalActions = property(lambda self: self.get("/AA")) + """ + Read-only property accessing the additional actions dictionary. + This dictionary defines the field's behavior in response to trigger events. + See Section 8.5.2 of the PDF 1.7 reference. + """ + + +class Destination(TreeObject): + """ + A class representing a destination within a PDF file. + See section 8.2.1 of the PDF 1.6 reference. + + :param str title: Title of this destination. + :param int page: Page number of this destination. + :param str typ: How the destination is displayed. + :param args: Additional arguments may be necessary depending on the type. + :raises PdfReadError: If destination type is invalid. + + Valid ``typ`` arguments (see PDF spec for details): + /Fit No additional arguments + /XYZ [left] [top] [zoomFactor] + /FitH [top] + /FitV [left] + /FitR [left] [bottom] [right] [top] + /FitB No additional arguments + /FitBH [top] + /FitBV [left] + """ + def __init__(self, title, page, typ, *args): + DictionaryObject.__init__(self) + self[NameObject("/Title")] = title + self[NameObject("/Page")] = page + self[NameObject("/Type")] = typ + + # from table 8.2 of the PDF 1.7 reference. + if typ == "/XYZ": + (self[NameObject("/Left")], self[NameObject("/Top")], + self[NameObject("/Zoom")]) = args + elif typ == "/FitR": + (self[NameObject("/Left")], self[NameObject("/Bottom")], + self[NameObject("/Right")], self[NameObject("/Top")]) = args + elif typ in ["/FitH", "/FitBH"]: + self[NameObject("/Top")], = args + elif typ in ["/FitV", "/FitBV"]: + self[NameObject("/Left")], = args + elif typ in ["/Fit", "/FitB"]: + pass + else: + raise utils.PdfReadError("Unknown Destination Type: %r" % typ) + + def getDestArray(self): + return ArrayObject([self.raw_get('/Page'), self['/Type']] + [self[x] for x in ['/Left', '/Bottom', '/Right', '/Top', '/Zoom'] if x in self]) + + def writeToStream(self, stream, encryption_key): + stream.write(b_("<<\n")) + key = NameObject('/D') + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value = self.getDestArray() + value.writeToStream(stream, encryption_key) + + key = NameObject("/S") + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value = NameObject("/GoTo") + value.writeToStream(stream, encryption_key) + + stream.write(b_("\n")) + stream.write(b_(">>")) + + title = property(lambda self: self.get("/Title")) + """ + Read-only property accessing the destination title. + + :rtype: str + """ + + page = property(lambda self: self.get("/Page")) + """ + Read-only property accessing the destination page number. + + :rtype: int + """ + + typ = property(lambda self: self.get("/Type")) + """ + Read-only property accessing the destination type. + + :rtype: str + """ + + zoom = property(lambda self: self.get("/Zoom", None)) + """ + Read-only property accessing the zoom factor. + + :rtype: int, or ``None`` if not available. + """ + + left = property(lambda self: self.get("/Left", None)) + """ + Read-only property accessing the left horizontal coordinate. + + :rtype: int, or ``None`` if not available. + """ + + right = property(lambda self: self.get("/Right", None)) + """ + Read-only property accessing the right horizontal coordinate. + + :rtype: int, or ``None`` if not available. + """ + + top = property(lambda self: self.get("/Top", None)) + """ + Read-only property accessing the top vertical coordinate. + + :rtype: int, or ``None`` if not available. + """ + + bottom = property(lambda self: self.get("/Bottom", None)) + """ + Read-only property accessing the bottom vertical coordinate. + + :rtype: int, or ``None`` if not available. + """ + + +class Bookmark(Destination): + def writeToStream(self, stream, encryption_key): + stream.write(b_("<<\n")) + for key in [NameObject(x) for x in ['/Title', '/Parent', '/First', '/Last', '/Next', '/Prev'] if x in self]: + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value = self.raw_get(key) + value.writeToStream(stream, encryption_key) + stream.write(b_("\n")) + key = NameObject('/Dest') + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value = self.getDestArray() + value.writeToStream(stream, encryption_key) + stream.write(b_("\n")) + stream.write(b_(">>")) + + +def encode_pdfdocencoding(unicode_string): + retval = b_('') + for c in unicode_string: + try: + retval += b_(chr(_pdfDocEncoding_rev[c])) + except KeyError: + raise UnicodeEncodeError("pdfdocencoding", c, -1, -1, + "does not exist in translation table") + return retval + + +def decode_pdfdocencoding(byte_array): + retval = u_('') + for b in byte_array: + c = _pdfDocEncoding[ord_(b)] + if c == u_('\u0000'): + raise UnicodeDecodeError("pdfdocencoding", utils.barray(b), -1, -1, + "does not exist in translation table") + retval += c + return retval + +_pdfDocEncoding = ( + u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), + u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), + u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), + u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'), + u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'), + u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'), + u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'), + u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'), + u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'), + u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'), + u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'), + u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'), + u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'), + u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'), + u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'), + u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'), + u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'), + u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'), + u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'), + u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'), + u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'), + u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'), + u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'), + u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'), + u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'), + u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'), + u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'), + u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'), + u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'), + u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'), + u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'), + u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff') +) + +assert len(_pdfDocEncoding) == 256 + +_pdfDocEncoding_rev = {} +for i in range(256): + char = _pdfDocEncoding[i] + if char == u_("\u0000"): + continue + assert char not in _pdfDocEncoding_rev + _pdfDocEncoding_rev[char] = i |