aboutsummaryrefslogtreecommitdiffstats
path: root/PdfFileTransformer/PyPDF2/generic.py
diff options
context:
space:
mode:
authorben2018-09-18 10:52:38 +0200
committerben2018-09-18 10:52:38 +0200
commitf57654b84b4cf0ffa1287034fc9f66ba200bb259 (patch)
tree5ffb371ce5b5008052e425955f45c8b808ba7fa0 /PdfFileTransformer/PyPDF2/generic.py
downloadtruepolyglot-f57654b84b4cf0ffa1287034fc9f66ba200bb259.tar.gz
truepolyglot-f57654b84b4cf0ffa1287034fc9f66ba200bb259.tar.bz2
truepolyglot-f57654b84b4cf0ffa1287034fc9f66ba200bb259.tar.xz
First public commit
Diffstat (limited to 'PdfFileTransformer/PyPDF2/generic.py')
-rw-r--r--PdfFileTransformer/PyPDF2/generic.py1228
1 files changed, 1228 insertions, 0 deletions
diff --git a/PdfFileTransformer/PyPDF2/generic.py b/PdfFileTransformer/PyPDF2/generic.py
new file mode 100644
index 0000000..959957d
--- /dev/null
+++ b/PdfFileTransformer/PyPDF2/generic.py
@@ -0,0 +1,1228 @@
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+Implementation of generic PDF objects (dictionary, number, string, and so on)
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "biziqe@mathieu.fenniak.net"
+
+import re
+from .utils import readNonWhitespace, RC4_encrypt, skipOverComment
+from .utils import b_, u_, chr_, ord_
+from .utils import PdfStreamError
+import warnings
+from . import filters
+from . import utils
+import decimal
+import codecs
+import sys
+#import debugging
+
+ObjectPrefix = b_('/<[tf(n%')
+NumberSigns = b_('+-')
+IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]"))
+
+
+def readObject(stream, pdf):
+ tok = stream.read(1)
+ stream.seek(-1, 1) # reset to start
+ idx = ObjectPrefix.find(tok)
+ if idx == 0:
+ # name object
+ return NameObject.readFromStream(stream, pdf)
+ elif idx == 1:
+ # hexadecimal string OR dictionary
+ peek = stream.read(2)
+ stream.seek(-2, 1) # reset to start
+ if peek == b_('<<'):
+ return DictionaryObject.readFromStream(stream, pdf)
+ else:
+ return readHexStringFromStream(stream)
+ elif idx == 2:
+ # array object
+ return ArrayObject.readFromStream(stream, pdf)
+ elif idx == 3 or idx == 4:
+ # boolean object
+ return BooleanObject.readFromStream(stream)
+ elif idx == 5:
+ # string object
+ return readStringFromStream(stream)
+ elif idx == 6:
+ # null object
+ return NullObject.readFromStream(stream)
+ elif idx == 7:
+ # comment
+ while tok not in (b_('\r'), b_('\n')):
+ tok = stream.read(1)
+ # Prevents an infinite loop by raising an error if the stream is at
+ # the EOF
+ if len(tok) <= 0:
+ raise PdfStreamError("File ended unexpectedly.")
+ tok = readNonWhitespace(stream)
+ stream.seek(-1, 1)
+ return readObject(stream, pdf)
+ else:
+ # number object OR indirect reference
+ peek = stream.read(20)
+ stream.seek(-len(peek), 1) # reset to start
+ if IndirectPattern.match(peek) != None:
+ return IndirectObject.readFromStream(stream, pdf)
+ else:
+ return NumberObject.readFromStream(stream)
+
+
+class PdfObject(object):
+ def getObject(self):
+ """Resolves indirect references."""
+ return self
+
+
+class NullObject(PdfObject):
+ def writeToStream(self, stream, encryption_key):
+ stream.write(b_("null"))
+
+ def readFromStream(stream):
+ nulltxt = stream.read(4)
+ if nulltxt != b_("null"):
+ raise utils.PdfReadError("Could not read Null object")
+ return NullObject()
+ readFromStream = staticmethod(readFromStream)
+
+
+class BooleanObject(PdfObject):
+ def __init__(self, value):
+ self.value = value
+
+ def writeToStream(self, stream, encryption_key):
+ if self.value:
+ stream.write(b_("true"))
+ else:
+ stream.write(b_("false"))
+
+ def readFromStream(stream):
+ word = stream.read(4)
+ if word == b_("true"):
+ return BooleanObject(True)
+ elif word == b_("fals"):
+ stream.read(1)
+ return BooleanObject(False)
+ else:
+ raise utils.PdfReadError('Could not read Boolean object')
+ readFromStream = staticmethod(readFromStream)
+
+
+class ArrayObject(list, PdfObject):
+ def writeToStream(self, stream, encryption_key):
+ stream.write(b_("["))
+ for data in self:
+ stream.write(b_(" "))
+ data.writeToStream(stream, encryption_key)
+ stream.write(b_(" ]"))
+
+ def readFromStream(stream, pdf):
+ arr = ArrayObject()
+ tmp = stream.read(1)
+ if tmp != b_("["):
+ raise utils.PdfReadError("Could not read array")
+ while True:
+ # skip leading whitespace
+ tok = stream.read(1)
+ while tok.isspace():
+ tok = stream.read(1)
+ stream.seek(-1, 1)
+ # check for array ending
+ peekahead = stream.read(1)
+ if peekahead == b_("]"):
+ break
+ stream.seek(-1, 1)
+ # read and append obj
+ arr.append(readObject(stream, pdf))
+ return arr
+ readFromStream = staticmethod(readFromStream)
+
+
+class IndirectObject(PdfObject):
+ def __init__(self, idnum, generation, pdf):
+ self.idnum = idnum
+ self.generation = generation
+ self.pdf = pdf
+
+ def getObject(self):
+ return self.pdf.getObject(self).getObject()
+
+ def __repr__(self):
+ return "IndirectObject(%r, %r)" % (self.idnum, self.generation)
+
+ def __eq__(self, other):
+ return (
+ other != None and
+ isinstance(other, IndirectObject) and
+ self.idnum == other.idnum and
+ self.generation == other.generation and
+ self.pdf is other.pdf
+ )
+
+ def __ne__(self, other):
+ return not self.__eq__(other)
+
+ def writeToStream(self, stream, encryption_key):
+ stream.write(b_("%s %s R" % (self.idnum, self.generation)))
+
+ def readFromStream(stream, pdf):
+ idnum = b_("")
+ while True:
+ tok = stream.read(1)
+ if not tok:
+ # stream has truncated prematurely
+ raise PdfStreamError("Stream has ended unexpectedly")
+ if tok.isspace():
+ break
+ idnum += tok
+ generation = b_("")
+ while True:
+ tok = stream.read(1)
+ if not tok:
+ # stream has truncated prematurely
+ raise PdfStreamError("Stream has ended unexpectedly")
+ if tok.isspace():
+ if not generation:
+ continue
+ break
+ generation += tok
+ r = readNonWhitespace(stream)
+ if r != b_("R"):
+ raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
+ return IndirectObject(int(idnum), int(generation), pdf)
+ readFromStream = staticmethod(readFromStream)
+
+
+class FloatObject(decimal.Decimal, PdfObject):
+ def __new__(cls, value="0", context=None):
+ try:
+ return decimal.Decimal.__new__(cls, utils.str_(value), context)
+ except:
+ return decimal.Decimal.__new__(cls, str(value))
+
+ def __repr__(self):
+ if self == self.to_integral():
+ return str(self.quantize(decimal.Decimal(1)))
+ else:
+ # Standard formatting adds useless extraneous zeros.
+ o = "%.5f" % self
+ # Remove the zeros.
+ while o and o[-1] == '0':
+ o = o[:-1]
+ return o
+
+ def as_numeric(self):
+ return float(b_(repr(self)))
+
+ def writeToStream(self, stream, encryption_key):
+ stream.write(b_(repr(self)))
+
+
+class NumberObject(int, PdfObject):
+ NumberPattern = re.compile(b_('[^+-.0-9]'))
+ ByteDot = b_(".")
+
+ def __new__(cls, value):
+ val = int(value)
+ try:
+ return int.__new__(cls, val)
+ except OverflowError:
+ return int.__new__(cls, 0)
+
+ def as_numeric(self):
+ return int(b_(repr(self)))
+
+ def writeToStream(self, stream, encryption_key):
+ stream.write(b_(repr(self)))
+
+ def readFromStream(stream):
+ num = utils.readUntilRegex(stream, NumberObject.NumberPattern)
+ if num.find(NumberObject.ByteDot) != -1:
+ return FloatObject(num)
+ else:
+ return NumberObject(num)
+ readFromStream = staticmethod(readFromStream)
+
+
+##
+# Given a string (either a "str" or "unicode"), create a ByteStringObject or a
+# TextStringObject to represent the string.
+def createStringObject(string):
+ if isinstance(string, utils.string_type):
+ return TextStringObject(string)
+ elif isinstance(string, utils.bytes_type):
+ try:
+ if string.startswith(codecs.BOM_UTF16_BE):
+ retval = TextStringObject(string.decode("utf-16"))
+ retval.autodetect_utf16 = True
+ return retval
+ else:
+ # This is probably a big performance hit here, but we need to
+ # convert string objects into the text/unicode-aware version if
+ # possible... and the only way to check if that's possible is
+ # to try. Some strings are strings, some are just byte arrays.
+ retval = TextStringObject(decode_pdfdocencoding(string))
+ retval.autodetect_pdfdocencoding = True
+ return retval
+ except UnicodeDecodeError:
+ return ByteStringObject(string)
+ else:
+ raise TypeError("createStringObject should have str or unicode arg")
+
+
+def readHexStringFromStream(stream):
+ stream.read(1)
+ txt = ""
+ x = b_("")
+ while True:
+ tok = readNonWhitespace(stream)
+ if not tok:
+ # stream has truncated prematurely
+ raise PdfStreamError("Stream has ended unexpectedly")
+ if tok == b_(">"):
+ break
+ x += tok
+ if len(x) == 2:
+ txt += chr(int(x, base=16))
+ x = b_("")
+ if len(x) == 1:
+ x += b_("0")
+ if len(x) == 2:
+ txt += chr(int(x, base=16))
+ return createStringObject(b_(txt))
+
+
+def readStringFromStream(stream):
+ tok = stream.read(1)
+ parens = 1
+ txt = b_("")
+ while True:
+ tok = stream.read(1)
+ if not tok:
+ # stream has truncated prematurely
+ raise PdfStreamError("Stream has ended unexpectedly")
+ if tok == b_("("):
+ parens += 1
+ elif tok == b_(")"):
+ parens -= 1
+ if parens == 0:
+ break
+ elif tok == b_("\\"):
+ tok = stream.read(1)
+ ESCAPE_DICT = {b_("n") : b_("\n"),
+ b_("r") : b_("\r"),
+ b_("t") : b_("\t"),
+ b_("b") : b_("\b"),
+ b_("f") : b_("\f"),
+ b_("c") : b_("\c"),
+ b_("(") : b_("("),
+ b_(")") : b_(")"),
+ b_("/") : b_("/"),
+ b_("\\") : b_("\\"),
+ b_(" ") : b_(" "),
+ b_("/") : b_("/"),
+ b_("%") : b_("%"),
+ b_("<") : b_("<"),
+ b_(">") : b_(">"),
+ b_("[") : b_("["),
+ b_("]") : b_("]"),
+ b_("#") : b_("#"),
+ b_("_") : b_("_"),
+ b_("&") : b_("&"),
+ b_('$') : b_('$'),
+ }
+ try:
+ tok = ESCAPE_DICT[tok]
+ except KeyError:
+ if tok.isdigit():
+ # "The number ddd may consist of one, two, or three
+ # octal digits; high-order overflow shall be ignored.
+ # Three octal digits shall be used, with leading zeros
+ # as needed, if the next character of the string is also
+ # a digit." (PDF reference 7.3.4.2, p 16)
+ for i in range(2):
+ ntok = stream.read(1)
+ if ntok.isdigit():
+ tok += ntok
+ else:
+ break
+ tok = b_(chr(int(tok, base=8)))
+ elif tok in b_("\n\r"):
+ # This case is hit when a backslash followed by a line
+ # break occurs. If it's a multi-char EOL, consume the
+ # second character:
+ tok = stream.read(1)
+ if not tok in b_("\n\r"):
+ stream.seek(-1, 1)
+ # Then don't add anything to the actual string, since this
+ # line break was escaped:
+ tok = b_('')
+ else:
+ raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok)
+ txt += tok
+ return createStringObject(txt)
+
+
+##
+# Represents a string object where the text encoding could not be determined.
+# This occurs quite often, as the PDF spec doesn't provide an alternate way to
+# represent strings -- for example, the encryption data stored in files (like
+# /O) is clearly not text, but is still stored in a "String" object.
+class ByteStringObject(utils.bytes_type, PdfObject):
+
+ ##
+ # For compatibility with TextStringObject.original_bytes. This method
+ # returns self.
+ original_bytes = property(lambda self: self)
+
+ def writeToStream(self, stream, encryption_key):
+ bytearr = self
+ if encryption_key:
+ bytearr = RC4_encrypt(encryption_key, bytearr)
+ stream.write(b_("<"))
+ stream.write(utils.hexencode(bytearr))
+ stream.write(b_(">"))
+
+
+##
+# Represents a string object that has been decoded into a real unicode string.
+# If read from a PDF document, this string appeared to match the
+# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
+# occur.
+class TextStringObject(utils.string_type, PdfObject):
+ autodetect_pdfdocencoding = False
+ autodetect_utf16 = False
+
+ ##
+ # It is occasionally possible that a text string object gets created where
+ # a byte string object was expected due to the autodetection mechanism --
+ # if that occurs, this "original_bytes" property can be used to
+ # back-calculate what the original encoded bytes were.
+ original_bytes = property(lambda self: self.get_original_bytes())
+
+ def get_original_bytes(self):
+ # We're a text string object, but the library is trying to get our raw
+ # bytes. This can happen if we auto-detected this string as text, but
+ # we were wrong. It's pretty common. Return the original bytes that
+ # would have been used to create this object, based upon the autodetect
+ # method.
+ if self.autodetect_utf16:
+ return codecs.BOM_UTF16_BE + self.encode("utf-16be")
+ elif self.autodetect_pdfdocencoding:
+ return encode_pdfdocencoding(self)
+ else:
+ raise Exception("no information about original bytes")
+
+ def writeToStream(self, stream, encryption_key):
+ # Try to write the string out as a PDFDocEncoding encoded string. It's
+ # nicer to look at in the PDF file. Sadly, we take a performance hit
+ # here for trying...
+ try:
+ bytearr = encode_pdfdocencoding(self)
+ except UnicodeEncodeError:
+ bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
+ if encryption_key:
+ bytearr = RC4_encrypt(encryption_key, bytearr)
+ obj = ByteStringObject(bytearr)
+ obj.writeToStream(stream, None)
+ else:
+ stream.write(b_("("))
+ for c in bytearr:
+ if not chr_(c).isalnum() and c != b_(' '):
+ stream.write(b_("\\%03o" % ord_(c)))
+ else:
+ stream.write(b_(chr_(c)))
+ stream.write(b_(")"))
+
+
+class NameObject(str, PdfObject):
+ delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]"))
+ surfix = b_("/")
+
+ def writeToStream(self, stream, encryption_key):
+ stream.write(b_(self))
+
+ def readFromStream(stream, pdf):
+ debug = False
+ if debug: print((stream.tell()))
+ name = stream.read(1)
+ if name != NameObject.surfix:
+ raise utils.PdfReadError("name read error")
+ name += utils.readUntilRegex(stream, NameObject.delimiterPattern,
+ ignore_eof=True)
+ if debug: print(name)
+ try:
+ return NameObject(name.decode('utf-8'))
+ except (UnicodeEncodeError, UnicodeDecodeError) as e:
+ # Name objects should represent irregular characters
+ # with a '#' followed by the symbol's hex number
+ if not pdf.strict:
+ warnings.warn("Illegal character in Name Object", utils.PdfReadWarning)
+ return NameObject(name)
+ else:
+ raise utils.PdfReadError("Illegal character in Name Object")
+
+ readFromStream = staticmethod(readFromStream)
+
+
+class DictionaryObject(dict, PdfObject):
+ def raw_get(self, key):
+ return dict.__getitem__(self, key)
+
+ def __setitem__(self, key, value):
+ if not isinstance(key, PdfObject):
+ raise ValueError("key must be PdfObject")
+ if not isinstance(value, PdfObject):
+ raise ValueError("value must be PdfObject")
+ return dict.__setitem__(self, key, value)
+
+ def setdefault(self, key, value=None):
+ if not isinstance(key, PdfObject):
+ raise ValueError("key must be PdfObject")
+ if not isinstance(value, PdfObject):
+ raise ValueError("value must be PdfObject")
+ return dict.setdefault(self, key, value)
+
+ def __getitem__(self, key):
+ return dict.__getitem__(self, key).getObject()
+
+ ##
+ # Retrieves XMP (Extensible Metadata Platform) data relevant to the
+ # this object, if available.
+ # <p>
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance
+ # that can be used to access XMP metadata from the document. Can also
+ # return None if no metadata was found on the document root.
+ def getXmpMetadata(self):
+ metadata = self.get("/Metadata", None)
+ if metadata == None:
+ return None
+ metadata = metadata.getObject()
+ from . import xmp
+ if not isinstance(metadata, xmp.XmpInformation):
+ metadata = xmp.XmpInformation(metadata)
+ self[NameObject("/Metadata")] = metadata
+ return metadata
+
+ ##
+ # Read-only property that accesses the {@link
+ # #DictionaryObject.getXmpData getXmpData} function.
+ # <p>
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)
+
+ def writeToStream(self, stream, encryption_key):
+ stream.write(b_("<<\n"))
+ for key, value in list(self.items()):
+ key.writeToStream(stream, encryption_key)
+ stream.write(b_(" "))
+ value.writeToStream(stream, encryption_key)
+ stream.write(b_("\n"))
+ stream.write(b_(">>"))
+
+ def readFromStream(stream, pdf):
+ debug = False
+ tmp = stream.read(2)
+ if tmp != b_("<<"):
+ raise utils.PdfReadError("Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell()))
+ data = {}
+ while True:
+ tok = readNonWhitespace(stream)
+ if tok == b_('\x00'):
+ continue
+ elif tok == b_('%'):
+ stream.seek(-1, 1)
+ skipOverComment(stream)
+ continue
+ if not tok:
+ # stream has truncated prematurely
+ raise PdfStreamError("Stream has ended unexpectedly")
+
+ if debug: print(("Tok:", tok))
+ if tok == b_(">"):
+ stream.read(1)
+ break
+ stream.seek(-1, 1)
+ key = readObject(stream, pdf)
+ tok = readNonWhitespace(stream)
+ stream.seek(-1, 1)
+ value = readObject(stream, pdf)
+ if not data.get(key):
+ data[key] = value
+ elif pdf.strict:
+ # multiple definitions of key not permitted
+ raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \
+ % (utils.hexStr(stream.tell()), key))
+ else:
+ warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \
+ % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning)
+
+ pos = stream.tell()
+ s = readNonWhitespace(stream)
+ if s == b_('s') and stream.read(5) == b_('tream'):
+ eol = stream.read(1)
+ # odd PDF file output has spaces after 'stream' keyword but before EOL.
+ # patch provided by Danial Sandler
+ while eol == b_(' '):
+ eol = stream.read(1)
+ assert eol in (b_("\n"), b_("\r"))
+ if eol == b_("\r"):
+ # read \n after
+ if stream.read(1) != b_('\n'):
+ stream.seek(-1, 1)
+ # this is a stream object, not a dictionary
+ assert "/Length" in data
+ length = data["/Length"]
+ if debug: print(data)
+ if isinstance(length, IndirectObject):
+ t = stream.tell()
+ length = pdf.getObject(length)
+ stream.seek(t, 0)
+ data["__streamdata__"] = stream.read(length)
+ if debug: print("here")
+ #if debug: print(binascii.hexlify(data["__streamdata__"]))
+ e = readNonWhitespace(stream)
+ ndstream = stream.read(8)
+ if (e + ndstream) != b_("endstream"):
+ # (sigh) - the odd PDF file has a length that is too long, so
+ # we need to read backwards to find the "endstream" ending.
+ # ReportLab (unknown version) generates files with this bug,
+ # and Python users into PDF files tend to be our audience.
+ # we need to do this to correct the streamdata and chop off
+ # an extra character.
+ pos = stream.tell()
+ stream.seek(-10, 1)
+ end = stream.read(9)
+ if end == b_("endstream"):
+ # we found it by looking back one character further.
+ data["__streamdata__"] = data["__streamdata__"][:-1]
+ else:
+ if debug: print(("E", e, ndstream, debugging.toHex(end)))
+ stream.seek(pos, 0)
+ raise utils.PdfReadError("Unable to find 'endstream' marker after stream at byte %s." % utils.hexStr(stream.tell()))
+ else:
+ stream.seek(pos, 0)
+ if "__streamdata__" in data:
+ return StreamObject.initializeFromDictionary(data)
+ else:
+ retval = DictionaryObject()
+ retval.update(data)
+ return retval
+ readFromStream = staticmethod(readFromStream)
+
+
+class TreeObject(DictionaryObject):
+ def __init__(self):
+ DictionaryObject.__init__(self)
+
+ def hasChildren(self):
+ return '/First' in self
+
+ def __iter__(self):
+ return self.children()
+
+ def children(self):
+ if not self.hasChildren():
+ raise StopIteration
+
+ child = self['/First']
+ while True:
+ yield child
+ if child == self['/Last']:
+ raise StopIteration
+ child = child['/Next']
+
+ def addChild(self, child, pdf):
+ childObj = child.getObject()
+ child = pdf.getReference(childObj)
+ assert isinstance(child, IndirectObject)
+
+ if '/First' not in self:
+ self[NameObject('/First')] = child
+ self[NameObject('/Count')] = NumberObject(0)
+ prev = None
+ else:
+ prev = self['/Last']
+
+ self[NameObject('/Last')] = child
+ self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] + 1)
+
+ if prev:
+ prevRef = pdf.getReference(prev)
+ assert isinstance(prevRef, IndirectObject)
+ childObj[NameObject('/Prev')] = prevRef
+ prev[NameObject('/Next')] = child
+
+ parentRef = pdf.getReference(self)
+ assert isinstance(parentRef, IndirectObject)
+ childObj[NameObject('/Parent')] = parentRef
+
+ def removeChild(self, child):
+ childObj = child.getObject()
+
+ if NameObject('/Parent') not in childObj:
+ raise ValueError("Removed child does not appear to be a tree item")
+ elif childObj[NameObject('/Parent')] != self:
+ raise ValueError("Removed child is not a member of this tree")
+
+ found = False
+ prevRef = None
+ prev = None
+ curRef = self[NameObject('/First')]
+ cur = curRef.getObject()
+ lastRef = self[NameObject('/Last')]
+ last = lastRef.getObject()
+ while cur != None:
+ if cur == childObj:
+ if prev == None:
+ if NameObject('/Next') in cur:
+ # Removing first tree node
+ nextRef = cur[NameObject('/Next')]
+ next = nextRef.getObject()
+ del next[NameObject('/Prev')]
+ self[NameObject('/First')] = nextRef
+ self[NameObject('/Count')] = self[NameObject('/Count')] - 1
+
+ else:
+ # Removing only tree node
+ assert self[NameObject('/Count')] == 1
+ del self[NameObject('/Count')]
+ del self[NameObject('/First')]
+ if NameObject('/Last') in self:
+ del self[NameObject('/Last')]
+ else:
+ if NameObject('/Next') in cur:
+ # Removing middle tree node
+ nextRef = cur[NameObject('/Next')]
+ next = nextRef.getObject()
+ next[NameObject('/Prev')] = prevRef
+ prev[NameObject('/Next')] = nextRef
+ self[NameObject('/Count')] = self[NameObject('/Count')] - 1
+ else:
+ # Removing last tree node
+ assert cur == last
+ del prev[NameObject('/Next')]
+ self[NameObject('/Last')] = prevRef
+ self[NameObject('/Count')] = self[NameObject('/Count')] - 1
+ found = True
+ break
+
+ prevRef = curRef
+ prev = cur
+ if NameObject('/Next') in cur:
+ curRef = cur[NameObject('/Next')]
+ cur = curRef.getObject()
+ else:
+ curRef = None
+ cur = None
+
+ if not found:
+ raise ValueError("Removal couldn't find item in tree")
+
+ del childObj[NameObject('/Parent')]
+ if NameObject('/Next') in childObj:
+ del childObj[NameObject('/Next')]
+ if NameObject('/Prev') in childObj:
+ del childObj[NameObject('/Prev')]
+
+ def emptyTree(self):
+ for child in self:
+ childObj = child.getObject()
+ del childObj[NameObject('/Parent')]
+ if NameObject('/Next') in childObj:
+ del childObj[NameObject('/Next')]
+ if NameObject('/Prev') in childObj:
+ del childObj[NameObject('/Prev')]
+
+ if NameObject('/Count') in self:
+ del self[NameObject('/Count')]
+ if NameObject('/First') in self:
+ del self[NameObject('/First')]
+ if NameObject('/Last') in self:
+ del self[NameObject('/Last')]
+
+
+class StreamObject(DictionaryObject):
+ def __init__(self):
+ self._data = None
+ self.decodedSelf = None
+
+ def writeToStream(self, stream, encryption_key):
+ self[NameObject("/Length")] = NumberObject(len(self._data))
+ DictionaryObject.writeToStream(self, stream, encryption_key)
+ del self["/Length"]
+ stream.write(b_("\nstream\n"))
+ data = self._data
+ if encryption_key:
+ data = RC4_encrypt(encryption_key, data)
+ stream.write(data)
+ stream.write(b_("\nendstream"))
+
+ def initializeFromDictionary(data):
+ if "/Filter" in data:
+ retval = EncodedStreamObject()
+ else:
+ retval = DecodedStreamObject()
+ retval._data = data["__streamdata__"]
+ del data["__streamdata__"]
+ del data["/Length"]
+ retval.update(data)
+ return retval
+ initializeFromDictionary = staticmethod(initializeFromDictionary)
+
+ def flateEncode(self):
+ if "/Filter" in self:
+ f = self["/Filter"]
+ if isinstance(f, ArrayObject):
+ f.insert(0, NameObject("/FlateDecode"))
+ else:
+ newf = ArrayObject()
+ newf.append(NameObject("/FlateDecode"))
+ newf.append(f)
+ f = newf
+ else:
+ f = NameObject("/FlateDecode")
+ retval = EncodedStreamObject()
+ retval[NameObject("/Filter")] = f
+ retval._data = filters.FlateDecode.encode(self._data)
+ return retval
+
+
+class DecodedStreamObject(StreamObject):
+ def getData(self):
+ return self._data
+
+ def setData(self, data):
+ self._data = data
+
+
+class EncodedStreamObject(StreamObject):
+ def __init__(self):
+ self.decodedSelf = None
+
+ def getData(self):
+ if self.decodedSelf:
+ # cached version of decoded object
+ return self.decodedSelf.getData()
+ else:
+ # create decoded object
+ decoded = DecodedStreamObject()
+
+ decoded._data = filters.decodeStreamData(self)
+ for key, value in list(self.items()):
+ if not key in ("/Length", "/Filter", "/DecodeParms"):
+ decoded[key] = value
+ self.decodedSelf = decoded
+ return decoded._data
+
+ def setData(self, data):
+ raise utils.PdfReadError("Creating EncodedStreamObject is not currently supported")
+
+
+class RectangleObject(ArrayObject):
+ """
+ This class is used to represent *page boxes* in PyPDF2. These boxes include:
+
+ * :attr:`artBox <PyPDF2.pdf.PageObject.artBox>`
+ * :attr:`bleedBox <PyPDF2.pdf.PageObject.bleedBox>`
+ * :attr:`cropBox <PyPDF2.pdf.PageObject.cropBox>`
+ * :attr:`mediaBox <PyPDF2.pdf.PageObject.mediaBox>`
+ * :attr:`trimBox <PyPDF2.pdf.PageObject.trimBox>`
+ """
+ def __init__(self, arr):
+ # must have four points
+ assert len(arr) == 4
+ # automatically convert arr[x] into NumberObject(arr[x]) if necessary
+ ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr])
+
+ def ensureIsNumber(self, value):
+ if not isinstance(value, (NumberObject, FloatObject)):
+ value = FloatObject(value)
+ return value
+
+ def __repr__(self):
+ return "RectangleObject(%s)" % repr(list(self))
+
+ def getLowerLeft_x(self):
+ return self[0]
+
+ def getLowerLeft_y(self):
+ return self[1]
+
+ def getUpperRight_x(self):
+ return self[2]
+
+ def getUpperRight_y(self):
+ return self[3]
+
+ def getUpperLeft_x(self):
+ return self.getLowerLeft_x()
+
+ def getUpperLeft_y(self):
+ return self.getUpperRight_y()
+
+ def getLowerRight_x(self):
+ return self.getUpperRight_x()
+
+ def getLowerRight_y(self):
+ return self.getLowerLeft_y()
+
+ def getLowerLeft(self):
+ return self.getLowerLeft_x(), self.getLowerLeft_y()
+
+ def getLowerRight(self):
+ return self.getLowerRight_x(), self.getLowerRight_y()
+
+ def getUpperLeft(self):
+ return self.getUpperLeft_x(), self.getUpperLeft_y()
+
+ def getUpperRight(self):
+ return self.getUpperRight_x(), self.getUpperRight_y()
+
+ def setLowerLeft(self, value):
+ self[0], self[1] = [self.ensureIsNumber(x) for x in value]
+
+ def setLowerRight(self, value):
+ self[2], self[1] = [self.ensureIsNumber(x) for x in value]
+
+ def setUpperLeft(self, value):
+ self[0], self[3] = [self.ensureIsNumber(x) for x in value]
+
+ def setUpperRight(self, value):
+ self[2], self[3] = [self.ensureIsNumber(x) for x in value]
+
+ def getWidth(self):
+ return self.getUpperRight_x() - self.getLowerLeft_x()
+
+ def getHeight(self):
+ return self.getUpperRight_y() - self.getLowerLeft_y()
+
+ lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
+ """
+ Property to read and modify the lower left coordinate of this box
+ in (x,y) form.
+ """
+ lowerRight = property(getLowerRight, setLowerRight, None, None)
+ """
+ Property to read and modify the lower right coordinate of this box
+ in (x,y) form.
+ """
+ upperLeft = property(getUpperLeft, setUpperLeft, None, None)
+ """
+ Property to read and modify the upper left coordinate of this box
+ in (x,y) form.
+ """
+ upperRight = property(getUpperRight, setUpperRight, None, None)
+ """
+ Property to read and modify the upper right coordinate of this box
+ in (x,y) form.
+ """
+
+
+class Field(TreeObject):
+ """
+ A class representing a field dictionary. This class is accessed through
+ :meth:`getFields()<PyPDF2.PdfFileReader.getFields>`
+ """
+ def __init__(self, data):
+ DictionaryObject.__init__(self)
+ attributes = ("/FT", "/Parent", "/Kids", "/T", "/TU", "/TM", "/Ff",
+ "/V", "/DV", "/AA")
+ for attr in attributes:
+ try:
+ self[NameObject(attr)] = data[attr]
+ except KeyError:
+ pass
+
+ fieldType = property(lambda self: self.get("/FT"))
+ """
+ Read-only property accessing the type of this field.
+ """
+
+ parent = property(lambda self: self.get("/Parent"))
+ """
+ Read-only property accessing the parent of this field.
+ """
+
+ kids = property(lambda self: self.get("/Kids"))
+ """
+ Read-only property accessing the kids of this field.
+ """
+
+ name = property(lambda self: self.get("/T"))
+ """
+ Read-only property accessing the name of this field.
+ """
+
+ altName = property(lambda self: self.get("/TU"))
+ """
+ Read-only property accessing the alternate name of this field.
+ """
+
+ mappingName = property(lambda self: self.get("/TM"))
+ """
+ Read-only property accessing the mapping name of this field. This
+ name is used by PyPDF2 as a key in the dictionary returned by
+ :meth:`getFields()<PyPDF2.PdfFileReader.getFields>`
+ """
+
+ flags = property(lambda self: self.get("/Ff"))
+ """
+ Read-only property accessing the field flags, specifying various
+ characteristics of the field (see Table 8.70 of the PDF 1.7 reference).
+ """
+
+ value = property(lambda self: self.get("/V"))
+ """
+ Read-only property accessing the value of this field. Format
+ varies based on field type.
+ """
+
+ defaultValue = property(lambda self: self.get("/DV"))
+ """
+ Read-only property accessing the default value of this field.
+ """
+
+ additionalActions = property(lambda self: self.get("/AA"))
+ """
+ Read-only property accessing the additional actions dictionary.
+ This dictionary defines the field's behavior in response to trigger events.
+ See Section 8.5.2 of the PDF 1.7 reference.
+ """
+
+
+class Destination(TreeObject):
+ """
+ A class representing a destination within a PDF file.
+ See section 8.2.1 of the PDF 1.6 reference.
+
+ :param str title: Title of this destination.
+ :param int page: Page number of this destination.
+ :param str typ: How the destination is displayed.
+ :param args: Additional arguments may be necessary depending on the type.
+ :raises PdfReadError: If destination type is invalid.
+
+ Valid ``typ`` arguments (see PDF spec for details):
+ /Fit No additional arguments
+ /XYZ [left] [top] [zoomFactor]
+ /FitH [top]
+ /FitV [left]
+ /FitR [left] [bottom] [right] [top]
+ /FitB No additional arguments
+ /FitBH [top]
+ /FitBV [left]
+ """
+ def __init__(self, title, page, typ, *args):
+ DictionaryObject.__init__(self)
+ self[NameObject("/Title")] = title
+ self[NameObject("/Page")] = page
+ self[NameObject("/Type")] = typ
+
+ # from table 8.2 of the PDF 1.7 reference.
+ if typ == "/XYZ":
+ (self[NameObject("/Left")], self[NameObject("/Top")],
+ self[NameObject("/Zoom")]) = args
+ elif typ == "/FitR":
+ (self[NameObject("/Left")], self[NameObject("/Bottom")],
+ self[NameObject("/Right")], self[NameObject("/Top")]) = args
+ elif typ in ["/FitH", "/FitBH"]:
+ self[NameObject("/Top")], = args
+ elif typ in ["/FitV", "/FitBV"]:
+ self[NameObject("/Left")], = args
+ elif typ in ["/Fit", "/FitB"]:
+ pass
+ else:
+ raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
+
+ def getDestArray(self):
+ return ArrayObject([self.raw_get('/Page'), self['/Type']] + [self[x] for x in ['/Left', '/Bottom', '/Right', '/Top', '/Zoom'] if x in self])
+
+ def writeToStream(self, stream, encryption_key):
+ stream.write(b_("<<\n"))
+ key = NameObject('/D')
+ key.writeToStream(stream, encryption_key)
+ stream.write(b_(" "))
+ value = self.getDestArray()
+ value.writeToStream(stream, encryption_key)
+
+ key = NameObject("/S")
+ key.writeToStream(stream, encryption_key)
+ stream.write(b_(" "))
+ value = NameObject("/GoTo")
+ value.writeToStream(stream, encryption_key)
+
+ stream.write(b_("\n"))
+ stream.write(b_(">>"))
+
+ title = property(lambda self: self.get("/Title"))
+ """
+ Read-only property accessing the destination title.
+
+ :rtype: str
+ """
+
+ page = property(lambda self: self.get("/Page"))
+ """
+ Read-only property accessing the destination page number.
+
+ :rtype: int
+ """
+
+ typ = property(lambda self: self.get("/Type"))
+ """
+ Read-only property accessing the destination type.
+
+ :rtype: str
+ """
+
+ zoom = property(lambda self: self.get("/Zoom", None))
+ """
+ Read-only property accessing the zoom factor.
+
+ :rtype: int, or ``None`` if not available.
+ """
+
+ left = property(lambda self: self.get("/Left", None))
+ """
+ Read-only property accessing the left horizontal coordinate.
+
+ :rtype: int, or ``None`` if not available.
+ """
+
+ right = property(lambda self: self.get("/Right", None))
+ """
+ Read-only property accessing the right horizontal coordinate.
+
+ :rtype: int, or ``None`` if not available.
+ """
+
+ top = property(lambda self: self.get("/Top", None))
+ """
+ Read-only property accessing the top vertical coordinate.
+
+ :rtype: int, or ``None`` if not available.
+ """
+
+ bottom = property(lambda self: self.get("/Bottom", None))
+ """
+ Read-only property accessing the bottom vertical coordinate.
+
+ :rtype: int, or ``None`` if not available.
+ """
+
+
+class Bookmark(Destination):
+ def writeToStream(self, stream, encryption_key):
+ stream.write(b_("<<\n"))
+ for key in [NameObject(x) for x in ['/Title', '/Parent', '/First', '/Last', '/Next', '/Prev'] if x in self]:
+ key.writeToStream(stream, encryption_key)
+ stream.write(b_(" "))
+ value = self.raw_get(key)
+ value.writeToStream(stream, encryption_key)
+ stream.write(b_("\n"))
+ key = NameObject('/Dest')
+ key.writeToStream(stream, encryption_key)
+ stream.write(b_(" "))
+ value = self.getDestArray()
+ value.writeToStream(stream, encryption_key)
+ stream.write(b_("\n"))
+ stream.write(b_(">>"))
+
+
+def encode_pdfdocencoding(unicode_string):
+ retval = b_('')
+ for c in unicode_string:
+ try:
+ retval += b_(chr(_pdfDocEncoding_rev[c]))
+ except KeyError:
+ raise UnicodeEncodeError("pdfdocencoding", c, -1, -1,
+ "does not exist in translation table")
+ return retval
+
+
+def decode_pdfdocencoding(byte_array):
+ retval = u_('')
+ for b in byte_array:
+ c = _pdfDocEncoding[ord_(b)]
+ if c == u_('\u0000'):
+ raise UnicodeDecodeError("pdfdocencoding", utils.barray(b), -1, -1,
+ "does not exist in translation table")
+ retval += c
+ return retval
+
+_pdfDocEncoding = (
+ u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
+ u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
+ u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
+ u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'),
+ u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'),
+ u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'),
+ u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'),
+ u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'),
+ u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'),
+ u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'),
+ u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'),
+ u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'),
+ u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'),
+ u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'),
+ u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'),
+ u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'),
+ u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'),
+ u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'),
+ u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'),
+ u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'),
+ u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'),
+ u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'),
+ u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'),
+ u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'),
+ u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'),
+ u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'),
+ u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'),
+ u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'),
+ u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'),
+ u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'),
+ u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'),
+ u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff')
+)
+
+assert len(_pdfDocEncoding) == 256
+
+_pdfDocEncoding_rev = {}
+for i in range(256):
+ char = _pdfDocEncoding[i]
+ if char == u_("\u0000"):
+ continue
+ assert char not in _pdfDocEncoding_rev
+ _pdfDocEncoding_rev[char] = i