From f57654b84b4cf0ffa1287034fc9f66ba200bb259 Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 18 Sep 2018 10:52:38 +0200 Subject: First public commit --- PdfFileTransformer/PyPDF2/generic.py | 1228 ++++++++++++++++++++++++++++++++++ 1 file changed, 1228 insertions(+) create mode 100644 PdfFileTransformer/PyPDF2/generic.py (limited to 'PdfFileTransformer/PyPDF2/generic.py') diff --git a/PdfFileTransformer/PyPDF2/generic.py b/PdfFileTransformer/PyPDF2/generic.py new file mode 100644 index 0000000..959957d --- /dev/null +++ b/PdfFileTransformer/PyPDF2/generic.py @@ -0,0 +1,1228 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +""" +Implementation of generic PDF objects (dictionary, number, string, and so on) +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +import re +from .utils import readNonWhitespace, RC4_encrypt, skipOverComment +from .utils import b_, u_, chr_, ord_ +from .utils import PdfStreamError +import warnings +from . import filters +from . import utils +import decimal +import codecs +import sys +#import debugging + +ObjectPrefix = b_('/<[tf(n%') +NumberSigns = b_('+-') +IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]")) + + +def readObject(stream, pdf): + tok = stream.read(1) + stream.seek(-1, 1) # reset to start + idx = ObjectPrefix.find(tok) + if idx == 0: + # name object + return NameObject.readFromStream(stream, pdf) + elif idx == 1: + # hexadecimal string OR dictionary + peek = stream.read(2) + stream.seek(-2, 1) # reset to start + if peek == b_('<<'): + return DictionaryObject.readFromStream(stream, pdf) + else: + return readHexStringFromStream(stream) + elif idx == 2: + # array object + return ArrayObject.readFromStream(stream, pdf) + elif idx == 3 or idx == 4: + # boolean object + return BooleanObject.readFromStream(stream) + elif idx == 5: + # string object + return readStringFromStream(stream) + elif idx == 6: + # null object + return NullObject.readFromStream(stream) + elif idx == 7: + # comment + while tok not in (b_('\r'), b_('\n')): + tok = stream.read(1) + # Prevents an 
infinite loop by raising an error if the stream is at + # the EOF + if len(tok) <= 0: + raise PdfStreamError("File ended unexpectedly.") + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + return readObject(stream, pdf) + else: + # number object OR indirect reference + peek = stream.read(20) + stream.seek(-len(peek), 1) # reset to start + if IndirectPattern.match(peek) != None: + return IndirectObject.readFromStream(stream, pdf) + else: + return NumberObject.readFromStream(stream) + + +class PdfObject(object): + def getObject(self): + """Resolves indirect references.""" + return self + + +class NullObject(PdfObject): + def writeToStream(self, stream, encryption_key): + stream.write(b_("null")) + + def readFromStream(stream): + nulltxt = stream.read(4) + if nulltxt != b_("null"): + raise utils.PdfReadError("Could not read Null object") + return NullObject() + readFromStream = staticmethod(readFromStream) + + +class BooleanObject(PdfObject): + def __init__(self, value): + self.value = value + + def writeToStream(self, stream, encryption_key): + if self.value: + stream.write(b_("true")) + else: + stream.write(b_("false")) + + def readFromStream(stream): + word = stream.read(4) + if word == b_("true"): + return BooleanObject(True) + elif word == b_("fals"): + stream.read(1) + return BooleanObject(False) + else: + raise utils.PdfReadError('Could not read Boolean object') + readFromStream = staticmethod(readFromStream) + + +class ArrayObject(list, PdfObject): + def writeToStream(self, stream, encryption_key): + stream.write(b_("[")) + for data in self: + stream.write(b_(" ")) + data.writeToStream(stream, encryption_key) + stream.write(b_(" ]")) + + def readFromStream(stream, pdf): + arr = ArrayObject() + tmp = stream.read(1) + if tmp != b_("["): + raise utils.PdfReadError("Could not read array") + while True: + # skip leading whitespace + tok = stream.read(1) + while tok.isspace(): + tok = stream.read(1) + stream.seek(-1, 1) + # check for array ending + peekahead = 
stream.read(1) + if peekahead == b_("]"): + break + stream.seek(-1, 1) + # read and append obj + arr.append(readObject(stream, pdf)) + return arr + readFromStream = staticmethod(readFromStream) + + +class IndirectObject(PdfObject): + def __init__(self, idnum, generation, pdf): + self.idnum = idnum + self.generation = generation + self.pdf = pdf + + def getObject(self): + return self.pdf.getObject(self).getObject() + + def __repr__(self): + return "IndirectObject(%r, %r)" % (self.idnum, self.generation) + + def __eq__(self, other): + return ( + other != None and + isinstance(other, IndirectObject) and + self.idnum == other.idnum and + self.generation == other.generation and + self.pdf is other.pdf + ) + + def __ne__(self, other): + return not self.__eq__(other) + + def writeToStream(self, stream, encryption_key): + stream.write(b_("%s %s R" % (self.idnum, self.generation))) + + def readFromStream(stream, pdf): + idnum = b_("") + while True: + tok = stream.read(1) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok.isspace(): + break + idnum += tok + generation = b_("") + while True: + tok = stream.read(1) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok.isspace(): + if not generation: + continue + break + generation += tok + r = readNonWhitespace(stream) + if r != b_("R"): + raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell())) + return IndirectObject(int(idnum), int(generation), pdf) + readFromStream = staticmethod(readFromStream) + + +class FloatObject(decimal.Decimal, PdfObject): + def __new__(cls, value="0", context=None): + try: + return decimal.Decimal.__new__(cls, utils.str_(value), context) + except: + return decimal.Decimal.__new__(cls, str(value)) + + def __repr__(self): + if self == self.to_integral(): + return str(self.quantize(decimal.Decimal(1))) + else: + # Standard 
formatting adds useless extraneous zeros. + o = "%.5f" % self + # Remove the zeros. + while o and o[-1] == '0': + o = o[:-1] + return o + + def as_numeric(self): + return float(b_(repr(self))) + + def writeToStream(self, stream, encryption_key): + stream.write(b_(repr(self))) + + +class NumberObject(int, PdfObject): + NumberPattern = re.compile(b_('[^+-.0-9]')) + ByteDot = b_(".") + + def __new__(cls, value): + val = int(value) + try: + return int.__new__(cls, val) + except OverflowError: + return int.__new__(cls, 0) + + def as_numeric(self): + return int(b_(repr(self))) + + def writeToStream(self, stream, encryption_key): + stream.write(b_(repr(self))) + + def readFromStream(stream): + num = utils.readUntilRegex(stream, NumberObject.NumberPattern) + if num.find(NumberObject.ByteDot) != -1: + return FloatObject(num) + else: + return NumberObject(num) + readFromStream = staticmethod(readFromStream) + + +## +# Given a string (either a "str" or "unicode"), create a ByteStringObject or a +# TextStringObject to represent the string. +def createStringObject(string): + if isinstance(string, utils.string_type): + return TextStringObject(string) + elif isinstance(string, utils.bytes_type): + try: + if string.startswith(codecs.BOM_UTF16_BE): + retval = TextStringObject(string.decode("utf-16")) + retval.autodetect_utf16 = True + return retval + else: + # This is probably a big performance hit here, but we need to + # convert string objects into the text/unicode-aware version if + # possible... and the only way to check if that's possible is + # to try. Some strings are strings, some are just byte arrays. 
+ retval = TextStringObject(decode_pdfdocencoding(string)) + retval.autodetect_pdfdocencoding = True + return retval + except UnicodeDecodeError: + return ByteStringObject(string) + else: + raise TypeError("createStringObject should have str or unicode arg") + + +def readHexStringFromStream(stream): + stream.read(1) + txt = "" + x = b_("") + while True: + tok = readNonWhitespace(stream) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok == b_(">"): + break + x += tok + if len(x) == 2: + txt += chr(int(x, base=16)) + x = b_("") + if len(x) == 1: + x += b_("0") + if len(x) == 2: + txt += chr(int(x, base=16)) + return createStringObject(b_(txt)) + + +def readStringFromStream(stream): + tok = stream.read(1) + parens = 1 + txt = b_("") + while True: + tok = stream.read(1) + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + if tok == b_("("): + parens += 1 + elif tok == b_(")"): + parens -= 1 + if parens == 0: + break + elif tok == b_("\\"): + tok = stream.read(1) + ESCAPE_DICT = {b_("n") : b_("\n"), + b_("r") : b_("\r"), + b_("t") : b_("\t"), + b_("b") : b_("\b"), + b_("f") : b_("\f"), + b_("c") : b_("\c"), + b_("(") : b_("("), + b_(")") : b_(")"), + b_("/") : b_("/"), + b_("\\") : b_("\\"), + b_(" ") : b_(" "), + b_("/") : b_("/"), + b_("%") : b_("%"), + b_("<") : b_("<"), + b_(">") : b_(">"), + b_("[") : b_("["), + b_("]") : b_("]"), + b_("#") : b_("#"), + b_("_") : b_("_"), + b_("&") : b_("&"), + b_('$') : b_('$'), + } + try: + tok = ESCAPE_DICT[tok] + except KeyError: + if tok.isdigit(): + # "The number ddd may consist of one, two, or three + # octal digits; high-order overflow shall be ignored. + # Three octal digits shall be used, with leading zeros + # as needed, if the next character of the string is also + # a digit." 
(PDF reference 7.3.4.2, p 16) + for i in range(2): + ntok = stream.read(1) + if ntok.isdigit(): + tok += ntok + else: + break + tok = b_(chr(int(tok, base=8))) + elif tok in b_("\n\r"): + # This case is hit when a backslash followed by a line + # break occurs. If it's a multi-char EOL, consume the + # second character: + tok = stream.read(1) + if not tok in b_("\n\r"): + stream.seek(-1, 1) + # Then don't add anything to the actual string, since this + # line break was escaped: + tok = b_('') + else: + raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok) + txt += tok + return createStringObject(txt) + + +## +# Represents a string object where the text encoding could not be determined. +# This occurs quite often, as the PDF spec doesn't provide an alternate way to +# represent strings -- for example, the encryption data stored in files (like +# /O) is clearly not text, but is still stored in a "String" object. +class ByteStringObject(utils.bytes_type, PdfObject): + + ## + # For compatibility with TextStringObject.original_bytes. This method + # returns self. + original_bytes = property(lambda self: self) + + def writeToStream(self, stream, encryption_key): + bytearr = self + if encryption_key: + bytearr = RC4_encrypt(encryption_key, bytearr) + stream.write(b_("<")) + stream.write(utils.hexencode(bytearr)) + stream.write(b_(">")) + + +## +# Represents a string object that has been decoded into a real unicode string. +# If read from a PDF document, this string appeared to match the +# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to +# occur. 
+class TextStringObject(utils.string_type, PdfObject): + autodetect_pdfdocencoding = False + autodetect_utf16 = False + + ## + # It is occasionally possible that a text string object gets created where + # a byte string object was expected due to the autodetection mechanism -- + # if that occurs, this "original_bytes" property can be used to + # back-calculate what the original encoded bytes were. + original_bytes = property(lambda self: self.get_original_bytes()) + + def get_original_bytes(self): + # We're a text string object, but the library is trying to get our raw + # bytes. This can happen if we auto-detected this string as text, but + # we were wrong. It's pretty common. Return the original bytes that + # would have been used to create this object, based upon the autodetect + # method. + if self.autodetect_utf16: + return codecs.BOM_UTF16_BE + self.encode("utf-16be") + elif self.autodetect_pdfdocencoding: + return encode_pdfdocencoding(self) + else: + raise Exception("no information about original bytes") + + def writeToStream(self, stream, encryption_key): + # Try to write the string out as a PDFDocEncoding encoded string. It's + # nicer to look at in the PDF file. Sadly, we take a performance hit + # here for trying... 
+ try: + bytearr = encode_pdfdocencoding(self) + except UnicodeEncodeError: + bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") + if encryption_key: + bytearr = RC4_encrypt(encryption_key, bytearr) + obj = ByteStringObject(bytearr) + obj.writeToStream(stream, None) + else: + stream.write(b_("(")) + for c in bytearr: + if not chr_(c).isalnum() and c != b_(' '): + stream.write(b_("\\%03o" % ord_(c))) + else: + stream.write(b_(chr_(c))) + stream.write(b_(")")) + + +class NameObject(str, PdfObject): + delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) + surfix = b_("/") + + def writeToStream(self, stream, encryption_key): + stream.write(b_(self)) + + def readFromStream(stream, pdf): + debug = False + if debug: print((stream.tell())) + name = stream.read(1) + if name != NameObject.surfix: + raise utils.PdfReadError("name read error") + name += utils.readUntilRegex(stream, NameObject.delimiterPattern, + ignore_eof=True) + if debug: print(name) + try: + return NameObject(name.decode('utf-8')) + except (UnicodeEncodeError, UnicodeDecodeError) as e: + # Name objects should represent irregular characters + # with a '#' followed by the symbol's hex number + if not pdf.strict: + warnings.warn("Illegal character in Name Object", utils.PdfReadWarning) + return NameObject(name) + else: + raise utils.PdfReadError("Illegal character in Name Object") + + readFromStream = staticmethod(readFromStream) + + +class DictionaryObject(dict, PdfObject): + def raw_get(self, key): + return dict.__getitem__(self, key) + + def __setitem__(self, key, value): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.__setitem__(self, key, value) + + def setdefault(self, key, value=None): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return 
dict.setdefault(self, key, value) + + def __getitem__(self, key): + return dict.__getitem__(self, key).getObject() + + ## + # Retrieves XMP (Extensible Metadata Platform) data relevant to the + # this object, if available. + #
+ # Stability: Added in v1.12, will exist for all future v1.x releases. + # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance + # that can be used to access XMP metadata from the document. Can also + # return None if no metadata was found on the document root. + def getXmpMetadata(self): + metadata = self.get("/Metadata", None) + if metadata == None: + return None + metadata = metadata.getObject() + from . import xmp + if not isinstance(metadata, xmp.XmpInformation): + metadata = xmp.XmpInformation(metadata) + self[NameObject("/Metadata")] = metadata + return metadata + + ## + # Read-only property that accesses the {@link + # #DictionaryObject.getXmpData getXmpData} function. + #
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)
+
def writeToStream(self, stream, encryption_key):
    """
    Serialise this dictionary to ``stream`` in PDF syntax.

    Writes ``<<`` and a newline, then one ``key value`` line per entry
    (each side emitted by its own ``writeToStream``), then ``>>``.

    :param stream: writable binary file-like object.
    :param encryption_key: passed unchanged to every key and value.
    """
    # Work on a snapshot of the entries, exactly as the entries were at
    # the moment serialisation started.
    entries = list(self.items())
    stream.write(b_("<<\n"))
    for name, obj in entries:
        name.writeToStream(stream, encryption_key)
        stream.write(b_(" "))
        obj.writeToStream(stream, encryption_key)
        stream.write(b_("\n"))
    stream.write(b_(">>"))
+
def readFromStream(stream, pdf):
    """
    Parse a dictionary, or a stream object, from ``stream``.

    ``stream`` must be positioned on the opening ``<<``. Key/value pairs
    are read with ``readObject`` until the closing ``>>``. When the
    ``stream`` keyword follows the dictionary, ``/Length`` bytes of payload
    are read too and the result is built by
    ``StreamObject.initializeFromDictionary``.

    :param stream: seekable binary file-like object.
    :param pdf: owning reader. Its ``strict`` attribute decides whether a
        duplicate key raises or only warns, and its ``getObject`` resolves
        an indirect ``/Length``.
    :return: a ``DictionaryObject``, or the stream object created by
        ``StreamObject.initializeFromDictionary``.
    :raises utils.PdfReadError: if the input does not start with ``<<``,
        if a key is defined twice while ``pdf.strict`` is set, or if no
        ``endstream`` marker can be found after the payload.
    :raises PdfStreamError: if the input ends inside the dictionary, if
        the ``stream`` keyword is not followed by an end-of-line, or if a
        stream has no ``/Length`` entry (the last two used to be bare
        ``assert`` statements, which disappear under ``python -O``).
    """
    tmp = stream.read(2)
    if tmp != b_("<<"):
        raise utils.PdfReadError("Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell()))
    data = {}
    while True:
        tok = readNonWhitespace(stream)
        if tok == b_('\x00'):
            # Stray NUL bytes between entries are ignored.
            continue
        elif tok == b_('%'):
            stream.seek(-1, 1)
            skipOverComment(stream)
            continue
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")

        if tok == b_(">"):
            # Consume the second '>' of the closing '>>'.
            stream.read(1)
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        # Skip the whitespace between key and value, then step back onto
        # the first byte of the value.
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        # Test for presence, not truthiness: a first value such as 0 or an
        # empty array must still count as "already defined".
        if key not in data:
            data[key] = value
        elif pdf.strict:
            # multiple definitions of key not permitted
            raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \
                                     % (utils.hexStr(stream.tell()), key))
        else:
            warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \
                          % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning)

    pos = stream.tell()
    s = readNonWhitespace(stream)
    if s == b_('s') and stream.read(5) == b_('tream'):
        eol = stream.read(1)
        # Some producers emit spaces after the 'stream' keyword but before
        # the end-of-line (patch provided by Danial Sandler).
        while eol == b_(' '):
            eol = stream.read(1)
        if eol not in (b_("\n"), b_("\r")):
            raise PdfStreamError("Stream data must be followed by a newline")
        if eol == b_("\r"):
            # A CR may be followed by an LF; if it is not, un-read the byte.
            if stream.read(1) != b_('\n'):
                stream.seek(-1, 1)
        # this is a stream object, not a dictionary
        if "/Length" not in data:
            raise PdfStreamError("Stream length not defined")
        length = data["/Length"]
        if isinstance(length, IndirectObject):
            # Resolving the reference may move the file position, so
            # remember it and restore it afterwards.
            t = stream.tell()
            length = pdf.getObject(length)
            stream.seek(t, 0)
        data["__streamdata__"] = stream.read(length)
        e = readNonWhitespace(stream)
        ndstream = stream.read(8)
        if (e + ndstream) != b_("endstream"):
            # Some files (ReportLab, unknown version, generates them)
            # declare a /Length that is one byte too long. Look one byte
            # further back for "endstream" and, if it is there, drop the
            # extra byte from the payload.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == b_("endstream"):
                # we found it by looking back one character further.
                data["__streamdata__"] = data["__streamdata__"][:-1]
            else:
                stream.seek(pos, 0)
                raise utils.PdfReadError("Unable to find 'endstream' marker after stream at byte %s." % utils.hexStr(stream.tell()))
    else:
        # Not a stream: rewind to just after the closing '>>'.
        stream.seek(pos, 0)
    if "__streamdata__" in data:
        return StreamObject.initializeFromDictionary(data)
    else:
        retval = DictionaryObject()
        retval.update(data)
        return retval
readFromStream = staticmethod(readFromStream)
+
+
class TreeObject(DictionaryObject):
    """
    Dictionary whose children form a doubly linked list.

    The parent holds ``/First``, ``/Last`` and ``/Count``; each child holds
    ``/Parent`` plus ``/Prev`` and ``/Next`` links to its siblings.
    """

    def __init__(self):
        DictionaryObject.__init__(self)

    def hasChildren(self):
        """Return True if this node has a ``/First`` entry."""
        return '/First' in self

    def __iter__(self):
        """Iterate over the child objects (not over the dictionary keys)."""
        return self.children()

    def children(self):
        """
        Generate the children from ``/First`` to ``/Last`` by following each
        child's ``/Next`` entry.
        """
        if not self.hasChildren():
            # End the generator with a plain return: raising StopIteration
            # inside a generator becomes RuntimeError under PEP 479.
            return

        child = self['/First']
        while True:
            yield child
            if child == self['/Last']:
                return
            child = child['/Next']

    def addChild(self, child, pdf):
        """
        Append ``child`` as the new last child of this node.

        :param child: object to append; resolved with ``getObject()``.
        :param pdf: object providing ``getReference(obj)``, used to obtain
            the indirect references stored in the link entries.
        """
        childObj = child.getObject()
        child = pdf.getReference(childObj)
        assert isinstance(child, IndirectObject)

        if '/First' not in self:
            self[NameObject('/First')] = child
            self[NameObject('/Count')] = NumberObject(0)
            prev = None
        else:
            prev = self['/Last']

        self[NameObject('/Last')] = child
        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] + 1)

        if prev:
            prevRef = pdf.getReference(prev)
            assert isinstance(prevRef, IndirectObject)
            childObj[NameObject('/Prev')] = prevRef
            prev[NameObject('/Next')] = child

        parentRef = pdf.getReference(self)
        assert isinstance(parentRef, IndirectObject)
        childObj[NameObject('/Parent')] = parentRef

    def _decrementCount(self):
        # Subtracting from a NumberObject yields a plain int, which
        # DictionaryObject.__setitem__ rejects; wrap the result again.
        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] - 1)

    def _unlink(childObj):
        # Strip the tree bookkeeping entries from a detached child.
        del childObj[NameObject('/Parent')]
        if NameObject('/Next') in childObj:
            del childObj[NameObject('/Next')]
        if NameObject('/Prev') in childObj:
            del childObj[NameObject('/Prev')]
    _unlink = staticmethod(_unlink)

    def removeChild(self, child):
        """
        Detach ``child`` from this node and repair the sibling links.

        :param child: the child to remove; resolved with ``getObject()``.
        :raises ValueError: if ``child`` has no ``/Parent``, if its
            ``/Parent`` is not this node, or if it is not found while
            walking the list.
        """
        childObj = child.getObject()

        if NameObject('/Parent') not in childObj:
            raise ValueError("Removed child does not appear to be a tree item")
        elif childObj[NameObject('/Parent')] != self:
            raise ValueError("Removed child is not a member of this tree")

        found = False
        prevRef = None
        prev = None
        # raw_get returns the stored value without resolving it, so the
        # links written back below stay references instead of becoming
        # inline copies of the sibling dictionaries.
        curRef = self.raw_get(NameObject('/First'))
        cur = curRef.getObject()
        last = self[NameObject('/Last')]
        while cur is not None:
            if cur == childObj:
                if prev is None:
                    if NameObject('/Next') in cur:
                        # Removing first tree node
                        nextRef = cur.raw_get(NameObject('/Next'))
                        nextObj = nextRef.getObject()
                        del nextObj[NameObject('/Prev')]
                        self[NameObject('/First')] = nextRef
                        self._decrementCount()
                    else:
                        # Removing only tree node
                        assert self[NameObject('/Count')] == 1
                        del self[NameObject('/Count')]
                        del self[NameObject('/First')]
                        if NameObject('/Last') in self:
                            del self[NameObject('/Last')]
                else:
                    if NameObject('/Next') in cur:
                        # Removing middle tree node
                        nextRef = cur.raw_get(NameObject('/Next'))
                        nextObj = nextRef.getObject()
                        nextObj[NameObject('/Prev')] = prevRef
                        prev[NameObject('/Next')] = nextRef
                        self._decrementCount()
                    else:
                        # Removing last tree node
                        assert cur == last
                        del prev[NameObject('/Next')]
                        self[NameObject('/Last')] = prevRef
                        self._decrementCount()
                found = True
                break

            prevRef = curRef
            prev = cur
            if NameObject('/Next') in cur:
                curRef = cur.raw_get(NameObject('/Next'))
                cur = curRef.getObject()
            else:
                curRef = None
                cur = None

        if not found:
            raise ValueError("Removal couldn't find item in tree")

        TreeObject._unlink(childObj)

    def emptyTree(self):
        """Detach every child and remove this node's tree entries."""
        # Collect the children before touching them: children() follows
        # the '/Next' links that _unlink deletes.
        for child in list(self.children()):
            TreeObject._unlink(child.getObject())

        if NameObject('/Count') in self:
            del self[NameObject('/Count')]
        if NameObject('/First') in self:
            del self[NameObject('/First')]
        if NameObject('/Last') in self:
            del self[NameObject('/Last')]
+
+
class StreamObject(DictionaryObject):
    """
    Dictionary plus a byte payload, written as a PDF stream object.

    ``_data`` holds the payload and ``decodedSelf`` is a cache slot used by
    ``EncodedStreamObject``.
    """

    def __init__(self):
        self._data = None
        self.decodedSelf = None

    def writeToStream(self, stream, encryption_key):
        """
        Write the dictionary followed by ``stream`` / payload / ``endstream``.

        ``/Length`` is computed from ``_data`` and only present in the
        dictionary while it is being written.

        :param stream: writable binary file-like object.
        :param encryption_key: if truthy, the payload is passed through
            ``RC4_encrypt`` with this key before being written.
        """
        self[NameObject("/Length")] = NumberObject(len(self._data))
        try:
            DictionaryObject.writeToStream(self, stream, encryption_key)
        finally:
            # Remove the temporary key even when writing fails, so the
            # object is left as it was.
            del self["/Length"]
        stream.write(b_("\nstream\n"))
        data = self._data
        if encryption_key:
            data = RC4_encrypt(encryption_key, data)
        stream.write(data)
        stream.write(b_("\nendstream"))

    def initializeFromDictionary(data):
        """
        Build a stream object from a parsed dictionary.

        :param data: mapping that contains ``"__streamdata__"`` and
            ``"/Length"``; both entries are removed from ``data`` in place
            and the remaining entries are copied into the result.
        :return: an ``EncodedStreamObject`` if ``data`` has a ``/Filter``
            entry, otherwise a ``DecodedStreamObject``.
        """
        if "/Filter" in data:
            retval = EncodedStreamObject()
        else:
            retval = DecodedStreamObject()
        retval._data = data["__streamdata__"]
        del data["__streamdata__"]
        del data["/Length"]
        retval.update(data)
        return retval
    initializeFromDictionary = staticmethod(initializeFromDictionary)

    def flateEncode(self):
        """
        Return a new ``EncodedStreamObject`` holding this stream's payload
        compressed with ``filters.FlateDecode.encode``.

        The new object's ``/Filter`` is ``/FlateDecode`` followed by any
        filters this stream already declares. This stream is not modified.
        """
        if "/Filter" in self:
            f = self["/Filter"]
            # Always build a fresh array: inserting into the existing one
            # would make this (uncompressed) stream claim /FlateDecode too.
            newf = ArrayObject()
            newf.append(NameObject("/FlateDecode"))
            if isinstance(f, ArrayObject):
                newf.extend(f)
            else:
                newf.append(f)
            f = newf
        else:
            f = NameObject("/FlateDecode")
        retval = EncodedStreamObject()
        retval[NameObject("/Filter")] = f
        retval._data = filters.FlateDecode.encode(self._data)
        return retval
+
+
class DecodedStreamObject(StreamObject):
    """Stream object whose payload is stored in ``_data`` as-is."""

    def getData(self):
        """Return the stored payload."""
        payload = self._data
        return payload

    def setData(self, data):
        """Replace the stored payload with ``data``."""
        self._data = data
+
+
class EncodedStreamObject(StreamObject):
    """
    Stream object whose payload is still filter-encoded.

    ``getData`` decodes on first use and caches the decoded stream in
    ``decodedSelf``.
    """

    def __init__(self):
        # Use the base initialiser so that ``_data`` always exists, as well
        # as ``decodedSelf``.
        StreamObject.__init__(self)

    def getData(self):
        """
        Return the decoded payload.

        The first call runs ``filters.decodeStreamData`` and stores a
        ``DecodedStreamObject`` (carrying every entry except ``/Length``,
        ``/Filter`` and ``/DecodeParms``) in ``decodedSelf``; later calls
        return that object's data.
        """
        # Compare against None: the cached object is a dict subclass and is
        # falsy when the stream has no extra entries, which would defeat
        # the cache.
        if self.decodedSelf is not None:
            # cached version of decoded object
            return self.decodedSelf.getData()

        # create decoded object
        decoded = DecodedStreamObject()
        decoded._data = filters.decodeStreamData(self)
        for key, value in list(self.items()):
            if key not in ("/Length", "/Filter", "/DecodeParms"):
                decoded[key] = value
        self.decodedSelf = decoded
        return decoded._data

    def setData(self, data):
        """Always raises: encoded streams cannot be assigned new data.

        :raises utils.PdfReadError: unconditionally.
        """
        raise utils.PdfReadError("Creating EncodedStreamObject is not currently supported")
+
+
+class RectangleObject(ArrayObject):
+ """
+ This class is used to represent *page boxes* in PyPDF2. These boxes include:
+
+ * :attr:`artBox