diff options
Diffstat (limited to 'PdfFileTransformer/PyPDF2/pdf.py')
-rw-r--r-- | PdfFileTransformer/PyPDF2/pdf.py | 3074 |
1 files changed, 3074 insertions, 0 deletions
diff --git a/PdfFileTransformer/PyPDF2/pdf.py b/PdfFileTransformer/PyPDF2/pdf.py new file mode 100644 index 0000000..3bd0066 --- /dev/null +++ b/PdfFileTransformer/PyPDF2/pdf.py @@ -0,0 +1,3074 @@ +# -*- coding: utf-8 -*- +# +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +""" +A pure-Python PDF library with an increasing number of capabilities. +See README for links to FAQ, documentation, homepage, etc. 
+""" + +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +__maintainer__ = "Phaseit, Inc." +__maintainer_email = "PyPDF2@phaseit.net" + +import string +import math +import struct +import sys +import uuid +from sys import version_info +if version_info < ( 3, 0 ): + from cStringIO import StringIO +else: + from io import StringIO + +if version_info < ( 3, 0 ): + BytesIO = StringIO +else: + from io import BytesIO + +from . import filters +from . import utils +import warnings +import codecs +from .generic import * +from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList +from .utils import isString, b_, u_, ord_, chr_, str_, formatWarning + +if version_info < ( 2, 4 ): + from sets import ImmutableSet as frozenset + +if version_info < ( 2, 5 ): + from md5 import md5 +else: + from hashlib import md5 +import uuid + + +class PdfFileWriter(object): + """ + This class supports writing PDF files out, given pages produced by another + class (typically :class:`PdfFileReader<PdfFileReader>`). + """ + def __init__(self): + self._header = b_("%PDF-1.3") + self._objects = [] # array of indirect objects + + # The root of our page tree node. 
+ pages = DictionaryObject() + pages.update({ + NameObject("/Type"): NameObject("/Pages"), + NameObject("/Count"): NumberObject(0), + NameObject("/Kids"): ArrayObject(), + }) + self._pages = self._addObject(pages) + + # info object + info = DictionaryObject() + info.update({ + NameObject("/Producer"): createStringObject(codecs.BOM_UTF16_BE + u_("PyPDF2").encode('utf-16be')) + }) + self._info = self._addObject(info) + + # root object + root = DictionaryObject() + root.update({ + NameObject("/Type"): NameObject("/Catalog"), + NameObject("/Pages"): self._pages, + }) + self._root = None + self._root_object = root + + def setHeader(self, header): + self._header = header + + def _addObject(self, obj): + self._objects.append(obj) + return IndirectObject(len(self._objects), 0, self) + + def getObject(self, ido): + if ido.pdf != self: + raise ValueError("pdf must be self") + return self._objects[ido.idnum - 1] + + def _addPage(self, page, action): + assert page["/Type"] == "/Page" + page[NameObject("/Parent")] = self._pages + page = self._addObject(page) + pages = self.getObject(self._pages) + action(pages["/Kids"], page) + pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) + + def addPage(self, page): + """ + Adds a page to this PDF file. The page is usually acquired from a + :class:`PdfFileReader<PdfFileReader>` instance. + + :param PageObject page: The page to add to the document. Should be + an instance of :class:`PageObject<PyPDF2.pdf.PageObject>` + """ + self._addPage(page, list.append) + + def insertPage(self, page, index=0): + """ + Insert a page in this PDF file. The page is usually acquired from a + :class:`PdfFileReader<PdfFileReader>` instance. + + :param PageObject page: The page to add to the document. This + argument should be an instance of :class:`PageObject<pdf.PageObject>`. + :param int index: Position at which the page will be inserted. 
+ """ + self._addPage(page, lambda l, p: l.insert(index, p)) + + def getPage(self, pageNumber): + """ + Retrieves a page by number from this PDF file. + + :param int pageNumber: The page number to retrieve + (pages begin at zero) + :return: the page at the index given by *pageNumber* + :rtype: :class:`PageObject<pdf.PageObject>` + """ + pages = self.getObject(self._pages) + # XXX: crude hack + return pages["/Kids"][pageNumber].getObject() + + def getNumPages(self): + """ + :return: the number of pages. + :rtype: int + """ + pages = self.getObject(self._pages) + return int(pages[NameObject("/Count")]) + + def addBlankPage(self, width=None, height=None): + """ + Appends a blank page to this PDF file and returns it. If no page size + is specified, use the size of the last page. + + :param float width: The width of the new page expressed in default user + space units. + :param float height: The height of the new page expressed in default + user space units. + :return: the newly appended page + :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>` + :raises PageSizeNotDefinedError: if width and height are not defined + and previous page does not exist. + """ + page = PageObject.createBlankPage(self, width, height) + self.addPage(page) + return page + + def insertBlankPage(self, width=None, height=None, index=0): + """ + Inserts a blank page to this PDF file and returns it. If no page size + is specified, use the size of the last page. + + :param float width: The width of the new page expressed in default user + space units. + :param float height: The height of the new page expressed in default + user space units. + :param int index: Position to add the page. + :return: the newly appended page + :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>` + :raises PageSizeNotDefinedError: if width and height are not defined + and previous page does not exist. 
+ """ + if width is None or height is None and \ + (self.getNumPages() - 1) >= index: + oldpage = self.getPage(index) + width = oldpage.mediaBox.getWidth() + height = oldpage.mediaBox.getHeight() + page = PageObject.createBlankPage(self, width, height) + self.insertPage(page, index) + return page + + def addJS(self, javascript): + """ + Add Javascript which will launch upon opening this PDF. + + :param str javascript: Your Javascript. + + >>> output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") + # Example: This will launch the print window when the PDF is opened. + """ + js = DictionaryObject() + js.update({ + NameObject("/Type"): NameObject("/Action"), + NameObject("/S"): NameObject("/JavaScript"), + NameObject("/JS"): NameObject("(%s)" % javascript) + }) + js_indirect_object = self._addObject(js) + + # We need a name for parameterized javascript in the pdf file, but it can be anything. + js_string_name = str(uuid.uuid4()) + + js_name_tree = DictionaryObject() + js_name_tree.update({ + NameObject("/JavaScript"): DictionaryObject({ + NameObject("/Names"): ArrayObject([createStringObject(js_string_name), js_indirect_object]) + }) + }) + self._addObject(js_name_tree) + + self._root_object.update({ + NameObject("/OpenAction"): js_indirect_object, + NameObject("/Names"): js_name_tree + }) + + def addAttachment(self, fname, fdata): + """ + Embed a file inside the PDF. + + :param str fname: The filename to display. + :param str fdata: The data in the file. + + Reference: + https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf + Section 7.11.3 + """ + + # We need 3 entries: + # * The file's data + # * The /Filespec entry + # * The file's name, which goes in the Catalog + + + # The entry for the file + """ Sample: + 8 0 obj + << + /Length 12 + /Type /EmbeddedFile + >> + stream + Hello world! 
+ endstream + endobj + """ + file_entry = DecodedStreamObject() + file_entry.setData(fdata) + file_entry.update({ + NameObject("/Type"): NameObject("/EmbeddedFile") + }) + + # The Filespec entry + """ Sample: + 7 0 obj + << + /Type /Filespec + /F (hello.txt) + /EF << /F 8 0 R >> + >> + """ + efEntry = DictionaryObject() + efEntry.update({ NameObject("/F"):file_entry }) + + filespec = DictionaryObject() + filespec.update({ + NameObject("/Type"): NameObject("/Filespec"), + NameObject("/F"): createStringObject(fname), # Perhaps also try TextStringObject + NameObject("/EF"): efEntry + }) + + # Then create the entry for the root, as it needs a reference to the Filespec + """ Sample: + 1 0 obj + << + /Type /Catalog + /Outlines 2 0 R + /Pages 3 0 R + /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >> + >> + endobj + + """ + embeddedFilesNamesDictionary = DictionaryObject() + embeddedFilesNamesDictionary.update({ + NameObject("/Names"): ArrayObject([createStringObject(fname), filespec]) + }) + + embeddedFilesDictionary = DictionaryObject() + embeddedFilesDictionary.update({ + NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary + }) + # Update the root + self._root_object.update({ + NameObject("/Names"): embeddedFilesDictionary + }) + + def appendPagesFromReader(self, reader, after_page_append=None): + """ + Copy pages from reader to writer. Includes an optional callback parameter + which is invoked after pages are appended to the writer. + + :param reader: a PdfFileReader object from which to copy page + annotations to this writer object. The writer's annots + will then be updated + :callback after_page_append (function): Callback function that is invoked after + each page is appended to the writer. Callback signature: + + :param writer_pageref (PDF page reference): Reference to the page + appended to the writer. 
+ """ + # Get page count from writer and reader + reader_num_pages = reader.getNumPages() + writer_num_pages = self.getNumPages() + + # Copy pages from reader to writer + for rpagenum in range(0, reader_num_pages): + reader_page = reader.getPage(rpagenum) + self.addPage(reader_page) + writer_page = self.getPage(writer_num_pages+rpagenum) + # Trigger callback, pass writer page as parameter + if callable(after_page_append): after_page_append(writer_page) + + def updatePageFormFieldValues(self, page, fields): + ''' + Update the form field values for a given page from a fields dictionary. + Copy field texts and values from fields to page. + + :param page: Page reference from PDF writer where the annotations + and field data will be updated. + :param fields: a Python dictionary of field names (/T) and text + values (/V) + ''' + # Iterate through pages, update field values + for j in range(0, len(page['/Annots'])): + writer_annot = page['/Annots'][j].getObject() + for field in fields: + if writer_annot.get('/T') == field: + writer_annot.update({ + NameObject("/V"): TextStringObject(fields[field]) + }) + + def cloneReaderDocumentRoot(self, reader): + ''' + Copy the reader document root to the writer. + + :param reader: PdfFileReader from the document root should be copied. + :callback after_page_append + ''' + self._root_object = reader.trailer['/Root'] + + def cloneDocumentFromReader(self, reader, after_page_append=None): + ''' + Create a copy (clone) of a document from a PDF file reader + + :param reader: PDF file reader instance from which the clone + should be created. + :callback after_page_append (function): Callback function that is invoked after + each page is appended to the writer. Signature includes a reference to the + appended page (delegates to appendPagesFromReader). Callback signature: + + :param writer_pageref (PDF page reference): Reference to the page just + appended to the document. 
+ ''' + self.cloneReaderDocumentRoot(reader) + self.appendPagesFromReader(reader, after_page_append) + + def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): + """ + Encrypt this PDF file with the PDF Standard encryption handler. + + :param str user_pwd: The "user password", which allows for opening + and reading the PDF file with the restrictions provided. + :param str owner_pwd: The "owner password", which allows for + opening the PDF files without any restrictions. By default, + the owner password is the same as the user password. + :param bool use_128bit: flag as to whether to use 128bit + encryption. When false, 40bit encryption will be used. By default, + this flag is on. + """ + import time, random + if owner_pwd == None: + owner_pwd = user_pwd + if use_128bit: + V = 2 + rev = 3 + keylen = int(128 / 8) + else: + V = 1 + rev = 2 + keylen = int(40 / 8) + # permit everything: + P = -1 + O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) + ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest()) + ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest()) + self._ID = ArrayObject((ID_1, ID_2)) + if rev == 2: + U, key = _alg34(user_pwd, O, P, ID_1) + else: + assert rev == 3 + U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False) + encrypt = DictionaryObject() + encrypt[NameObject("/Filter")] = NameObject("/Standard") + encrypt[NameObject("/V")] = NumberObject(V) + if V == 2: + encrypt[NameObject("/Length")] = NumberObject(keylen * 8) + encrypt[NameObject("/R")] = NumberObject(rev) + encrypt[NameObject("/O")] = ByteStringObject(O) + encrypt[NameObject("/U")] = ByteStringObject(U) + encrypt[NameObject("/P")] = NumberObject(P) + self._encrypt = self._addObject(encrypt) + self._encrypt_key = key + + def write(self, stream): + """ + Writes the collection of pages added to this object out as a PDF file. + + :param stream: An object to write the file to. 
The object must support + the write method and the tell method, similar to a file object. + """ + if hasattr(stream, 'mode') and 'b' not in stream.mode: + warnings.warn("File <%s> to write to is not in binary mode. It may not be written to correctly." % stream.name) + debug = False + import struct + + if not self._root: + self._root = self._addObject(self._root_object) + + externalReferenceMap = {} + + # PDF objects sometimes have circular references to their /Page objects + # inside their object tree (for example, annotations). Those will be + # indirect references to objects that we've recreated in this PDF. To + # address this problem, PageObject's store their original object + # reference number, and we add it to the external reference map before + # we sweep for indirect references. This forces self-page-referencing + # trees to reference the correct new object location, rather than + # copying in a new copy of the page object. + for objIndex in range(len(self._objects)): + obj = self._objects[objIndex] + if isinstance(obj, PageObject) and obj.indirectRef != None: + data = obj.indirectRef + if data.pdf not in externalReferenceMap: + externalReferenceMap[data.pdf] = {} + if data.generation not in externalReferenceMap[data.pdf]: + externalReferenceMap[data.pdf][data.generation] = {} + externalReferenceMap[data.pdf][data.generation][data.idnum] = IndirectObject(objIndex + 1, 0, self) + + self.stack = [] + if debug: print(("ERM:", externalReferenceMap, "root:", self._root)) + self._sweepIndirectReferences(externalReferenceMap, self._root) + del self.stack + + # Begin writing: + object_positions = [] + stream.write(self._header + b_("\n")) + stream.write(b_("%\xE2\xE3\xCF\xD3\n")) + for i in range(len(self._objects)): + idnum = (i + 1) + obj = self._objects[i] + object_positions.append(stream.tell()) + stream.write(b_(str(idnum) + " 0 obj\n")) + key = None + if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum: + pack1 = struct.pack("<i", i + 1)[:3] + 
pack2 = struct.pack("<i", 0)[:2] + key = self._encrypt_key + pack1 + pack2 + assert len(key) == (len(self._encrypt_key) + 5) + md5_hash = md5(key).digest() + key = md5_hash[:min(16, len(self._encrypt_key) + 5)] + obj.writeToStream(stream, key) + stream.write(b_("\nendobj\n")) + + # xref table + xref_location = stream.tell() + stream.write(b_("xref\n")) + stream.write(b_("0 %s\n" % (len(self._objects) + 1))) + stream.write(b_("%010d %05d f \n" % (0, 65535))) + for offset in object_positions: + stream.write(b_("%010d %05d n \n" % (offset, 0))) + + # trailer + stream.write(b_("trailer\n")) + trailer = DictionaryObject() + trailer.update({ + NameObject("/Size"): NumberObject(len(self._objects) + 1), + NameObject("/Root"): self._root, + NameObject("/Info"): self._info, + }) + if hasattr(self, "_ID"): + trailer[NameObject("/ID")] = self._ID + if hasattr(self, "_encrypt"): + trailer[NameObject("/Encrypt")] = self._encrypt + trailer.writeToStream(stream, None) + + # eof + stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))) + + def addMetadata(self, infos): + """ + Add custom metadata to the output. + + :param dict infos: a Python dictionary where each key is a field + and each value is your new metadata. + """ + args = {} + for key, value in list(infos.items()): + args[NameObject(key)] = createStringObject(value) + self.getObject(self._info).update(args) + + def _sweepIndirectReferences(self, externMap, data): + debug = False + if debug: print((data, "TYPE", data.__class__.__name__)) + if isinstance(data, DictionaryObject): + for key, value in list(data.items()): + origvalue = value + value = self._sweepIndirectReferences(externMap, value) + if isinstance(value, StreamObject): + # a dictionary value is a stream. streams must be indirect + # objects, so we need to change this value. 
+ value = self._addObject(value) + data[key] = value + return data + elif isinstance(data, ArrayObject): + for i in range(len(data)): + value = self._sweepIndirectReferences(externMap, data[i]) + if isinstance(value, StreamObject): + # an array value is a stream. streams must be indirect + # objects, so we need to change this value + value = self._addObject(value) + data[i] = value + return data + elif isinstance(data, IndirectObject): + # internal indirect references are fine + if data.pdf == self: + if data.idnum in self.stack: + return data + else: + self.stack.append(data.idnum) + realdata = self.getObject(data) + self._sweepIndirectReferences(externMap, realdata) + return data + else: + if data.pdf.stream.closed: + raise ValueError("I/O operation on closed file: {}".format(data.pdf.stream.name)) + newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None) + if newobj == None: + try: + newobj = data.pdf.getObject(data) + self._objects.append(None) # placeholder + idnum = len(self._objects) + newobj_ido = IndirectObject(idnum, 0, self) + if data.pdf not in externMap: + externMap[data.pdf] = {} + if data.generation not in externMap[data.pdf]: + externMap[data.pdf][data.generation] = {} + externMap[data.pdf][data.generation][data.idnum] = newobj_ido + newobj = self._sweepIndirectReferences(externMap, newobj) + self._objects[idnum-1] = newobj + return newobj_ido + except ValueError: + # Unable to resolve the Object, returning NullObject instead. 
+ warnings.warn("Unable to resolve [{}: {}], returning NullObject instead".format( + data.__class__.__name__, data + )) + return NullObject() + return newobj + else: + return data + + def getReference(self, obj): + idnum = self._objects.index(obj) + 1 + ref = IndirectObject(idnum, 0, self) + assert ref.getObject() == obj + return ref + + def getOutlineRoot(self): + if '/Outlines' in self._root_object: + outline = self._root_object['/Outlines'] + idnum = self._objects.index(outline) + 1 + outlineRef = IndirectObject(idnum, 0, self) + assert outlineRef.getObject() == outline + else: + outline = TreeObject() + outline.update({ }) + outlineRef = self._addObject(outline) + self._root_object[NameObject('/Outlines')] = outlineRef + + return outline + + def getNamedDestRoot(self): + if '/Names' in self._root_object and isinstance(self._root_object['/Names'], DictionaryObject): + names = self._root_object['/Names'] + idnum = self._objects.index(names) + 1 + namesRef = IndirectObject(idnum, 0, self) + assert namesRef.getObject() == names + if '/Dests' in names and isinstance(names['/Dests'], DictionaryObject): + dests = names['/Dests'] + idnum = self._objects.index(dests) + 1 + destsRef = IndirectObject(idnum, 0, self) + assert destsRef.getObject() == dests + if '/Names' in dests: + nd = dests['/Names'] + else: + nd = ArrayObject() + dests[NameObject('/Names')] = nd + else: + dests = DictionaryObject() + destsRef = self._addObject(dests) + names[NameObject('/Dests')] = destsRef + nd = ArrayObject() + dests[NameObject('/Names')] = nd + + else: + names = DictionaryObject() + namesRef = self._addObject(names) + self._root_object[NameObject('/Names')] = namesRef + dests = DictionaryObject() + destsRef = self._addObject(dests) + names[NameObject('/Dests')] = destsRef + nd = ArrayObject() + dests[NameObject('/Names')] = nd + + return nd + + def addBookmarkDestination(self, dest, parent=None): + destRef = self._addObject(dest) + + outlineRef = self.getOutlineRoot() + + if parent == 
None: + parent = outlineRef + + parent = parent.getObject() + #print parent.__class__.__name__ + parent.addChild(destRef, self) + + return destRef + + def addBookmarkDict(self, bookmark, parent=None): + bookmarkObj = TreeObject() + for k, v in list(bookmark.items()): + bookmarkObj[NameObject(str(k))] = v + bookmarkObj.update(bookmark) + + if '/A' in bookmark: + action = DictionaryObject() + for k, v in list(bookmark['/A'].items()): + action[NameObject(str(k))] = v + actionRef = self._addObject(action) + bookmarkObj[NameObject('/A')] = actionRef + + bookmarkRef = self._addObject(bookmarkObj) + + outlineRef = self.getOutlineRoot() + + if parent == None: + parent = outlineRef + + parent = parent.getObject() + parent.addChild(bookmarkRef, self) + + return bookmarkRef + + def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args): + """ + Add a bookmark to this PDF file. + + :param str title: Title to use for this bookmark. + :param int pagenum: Page number this bookmark will point to. + :param parent: A reference to a parent bookmark to create nested + bookmarks. + :param tuple color: Color of the bookmark as a red, green, blue tuple + from 0.0 to 1.0 + :param bool bold: Bookmark is bold + :param bool italic: Bookmark is italic + :param str fit: The fit of the destination page. See + :meth:`addLink()<addLink>` for details. 
+ """ + pageRef = self.getObject(self._pages)['/Kids'][pagenum] + action = DictionaryObject() + zoomArgs = [] + for a in args: + if a is not None: + zoomArgs.append(NumberObject(a)) + else: + zoomArgs.append(NullObject()) + dest = Destination(NameObject("/"+title + " bookmark"), pageRef, NameObject(fit), *zoomArgs) + destArray = dest.getDestArray() + action.update({ + NameObject('/D') : destArray, + NameObject('/S') : NameObject('/GoTo') + }) + actionRef = self._addObject(action) + + outlineRef = self.getOutlineRoot() + + if parent == None: + parent = outlineRef + + bookmark = TreeObject() + + bookmark.update({ + NameObject('/A'): actionRef, + NameObject('/Title'): createStringObject(title), + }) + + if color is not None: + bookmark.update({NameObject('/C'): ArrayObject([FloatObject(c) for c in color])}) + + format = 0 + if italic: + format += 1 + if bold: + format += 2 + if format: + bookmark.update({NameObject('/F'): NumberObject(format)}) + + bookmarkRef = self._addObject(bookmark) + + parent = parent.getObject() + parent.addChild(bookmarkRef, self) + + return bookmarkRef + + def addNamedDestinationObject(self, dest): + destRef = self._addObject(dest) + + nd = self.getNamedDestRoot() + nd.extend([dest['/Title'], destRef]) + + return destRef + + def addNamedDestination(self, title, pagenum): + pageRef = self.getObject(self._pages)['/Kids'][pagenum] + dest = DictionaryObject() + dest.update({ + NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]), + NameObject('/S') : NameObject('/GoTo') + }) + + destRef = self._addObject(dest) + nd = self.getNamedDestRoot() + + nd.extend([title, destRef]) + + return destRef + + def removeLinks(self): + """ + Removes links and annotations from this output. 
+ """ + pages = self.getObject(self._pages)['/Kids'] + for page in pages: + pageRef = self.getObject(page) + if "/Annots" in pageRef: + del pageRef['/Annots'] + + def removeImages(self, ignoreByteStringObject=False): + """ + Removes images from this output. + + :param bool ignoreByteStringObject: optional parameter + to ignore ByteString Objects. + """ + pages = self.getObject(self._pages)['/Kids'] + for j in range(len(pages)): + page = pages[j] + pageRef = self.getObject(page) + content = pageRef['/Contents'].getObject() + if not isinstance(content, ContentStream): + content = ContentStream(content, pageRef) + + _operations = [] + seq_graphics = False + for operands, operator in content.operations: + if operator == b_('Tj'): + text = operands[0] + if ignoreByteStringObject: + if not isinstance(text, TextStringObject): + operands[0] = TextStringObject() + elif operator == b_("'"): + text = operands[0] + if ignoreByteStringObject: + if not isinstance(text, TextStringObject): + operands[0] = TextStringObject() + elif operator == b_('"'): + text = operands[2] + if ignoreByteStringObject: + if not isinstance(text, TextStringObject): + operands[2] = TextStringObject() + elif operator == b_("TJ"): + for i in range(len(operands[0])): + if ignoreByteStringObject: + if not isinstance(operands[0][i], TextStringObject): + operands[0][i] = TextStringObject() + + if operator == b_('q'): + seq_graphics = True + if operator == b_('Q'): + seq_graphics = False + if seq_graphics: + if operator in [b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'), + b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'), + b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh')]: + continue + if operator == b_('re'): + continue + _operations.append((operands, operator)) + + content.operations = _operations + pageRef.__setitem__(NameObject('/Contents'), content) + + def removeText(self, ignoreByteStringObject=False): + """ + Removes 
images from this output. + + :param bool ignoreByteStringObject: optional parameter + to ignore ByteString Objects. + """ + pages = self.getObject(self._pages)['/Kids'] + for j in range(len(pages)): + page = pages[j] + pageRef = self.getObject(page) + content = pageRef['/Contents'].getObject() + if not isinstance(content, ContentStream): + content = ContentStream(content, pageRef) + for operands,operator in content.operations: + if operator == b_('Tj'): + text = operands[0] + if not ignoreByteStringObject: + if isinstance(text, TextStringObject): + operands[0] = TextStringObject() + else: + if isinstance(text, TextStringObject) or \ + isinstance(text, ByteStringObject): + operands[0] = TextStringObject() + elif operator == b_("'"): + text = operands[0] + if not ignoreByteStringObject: + if isinstance(text, TextStringObject): + operands[0] = TextStringObject() + else: + if isinstance(text, TextStringObject) or \ + isinstance(text, ByteStringObject): + operands[0] = TextStringObject() + elif operator == b_('"'): + text = operands[2] + if not ignoreByteStringObject: + if isinstance(text, TextStringObject): + operands[2] = TextStringObject() + else: + if isinstance(text, TextStringObject) or \ + isinstance(text, ByteStringObject): + operands[2] = TextStringObject() + elif operator == b_("TJ"): + for i in range(len(operands[0])): + if not ignoreByteStringObject: + if isinstance(operands[0][i], TextStringObject): + operands[0][i] = TextStringObject() + else: + if isinstance(operands[0][i], TextStringObject) or \ + isinstance(operands[0][i], ByteStringObject): + operands[0][i] = TextStringObject() + + pageRef.__setitem__(NameObject('/Contents'), content) + + def addURI(self, pagenum, uri, rect, border=None): + """ + Add an URI from a rectangular area to the specified page. + This uses the basic structure of AddLink + + :param int pagenum: index of the page on which to place the URI action. + :param int uri: string -- uri of resource to link to. 
def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args):
    """
    Add an internal link from a rectangular area to the specified page.

    :param int pagenum: index of the page on which to place the link.
    :param int pagedest: index of the page to which the link should go.
    :param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four
        integers specifying the clickable rectangular area
        ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
    :param border: if provided, an array describing border-drawing
        properties (up to three entries plus an optional dash pattern
        as fourth entry). See the PDF spec for details. When omitted,
        a border array of three zeros is written.
    :param str fit: Page fit or 'zoom' option (see below). Additional arguments may need
        to be supplied. Passing ``None`` will be read as a null value for that coordinate.

    Valid zoom arguments (see Table 8.2 of the PDF 1.7 reference for details):
         /Fit       No additional arguments
         /XYZ       [left] [top] [zoomFactor]
         /FitH      [top]
         /FitV      [left]
         /FitR      [left] [bottom] [right] [top]
         /FitB      No additional arguments
         /FitBH     [top]
         /FitBV     [left]
    """

    sourcePage = self.getObject(self._pages)['/Kids'][pagenum]
    targetPage = self.getObject(self._pages)['/Kids'][pagedest] #TODO: switch for external link
    sourcePageObj = self.getObject(sourcePage)

    # Border: caller-supplied entries, or the default of three zeros.
    if border is None:
        borderEntries = [NumberObject(0)] * 3
    else:
        borderEntries = [NameObject(n) for n in border[:3]]
        if len(border) == 4:
            # The optional fourth entry is the dash pattern array.
            borderEntries.append(ArrayObject([NameObject(n) for n in border[3]]))

    # Rectangle: strings are written verbatim, RectangleObject instances are
    # used as given, anything else is converted to a RectangleObject.
    if isString(rect):
        rect = NameObject(rect)
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    # Extra positional arguments parameterise the fit mode; None becomes null.
    fitArgs = [NullObject() if a is None else NumberObject(a) for a in args]
    destination = Destination(NameObject("/LinkName"), targetPage, NameObject(fit), *fitArgs) #TODO: create a better name for the link

    annotation = DictionaryObject()
    annotation.update({
        NameObject('/Type'): NameObject('/Annot'),
        NameObject('/Subtype'): NameObject('/Link'),
        NameObject('/P'): sourcePage,
        NameObject('/Rect'): rect,
        NameObject('/Border'): ArrayObject(borderEntries),
        NameObject('/Dest'): destination.getDestArray()
    })
    annotationRef = self._addObject(annotation)

    # Attach the annotation to the page, creating /Annots when it is missing.
    if "/Annots" not in sourcePageObj:
        sourcePageObj[NameObject('/Annots')] = ArrayObject([annotationRef])
    else:
        sourcePageObj['/Annots'].append(annotationRef)
def setPageMode(self, mode):
    """
    Set the page mode stored under ``/PageMode`` in the document root.

    :param str mode: The page mode to use. A value that is not already a
        ``NameObject`` is wrapped in one; if it is also not one of the
        modes listed below, a warning is issued but the value is still
        written.

    Valid modes are:
        /UseNone         Do not show outlines or thumbnails panels
        /UseOutlines     Show outlines (aka bookmarks) panel
        /UseThumbs       Show page thumbnails panel
        /FullScreen      Fullscreen view
        /UseOC           Show Optional Content Group (OCG) panel
        /UseAttachments  Show attachments panel
    """
    if isinstance(mode, NameObject):
        # Ready-made name objects are trusted and written unchecked.
        modeName = mode
    else:
        if mode not in self._valid_modes:
            allowed = ', '.join(self._valid_modes)
            warnings.warn("Mode should be one of: {}".format(allowed))
        modeName = NameObject(mode)
    self._root_object.update({NameObject('/PageMode'): modeName})
def __init__(self, stream, strict=True, warndest = None, overwriteWarnings = True):
    """
    Set up the reader state and parse the cross-reference data of *stream*.

    :param stream: a file-like object supporting ``read`` and ``seek``, or
        a string holding the path of a PDF file. A path is read completely
        into an in-memory ``BytesIO`` buffer.
    :param bool strict: stored as ``self.strict``; consulted by the parsing
        methods to decide between raising and warning.
    :param warndest: object whose ``write`` method receives formatted
        warnings; ``sys.stderr`` is used when it is ``None``.
    :param bool overwriteWarnings: when true, ``warnings.showwarning`` is
        replaced process-wide by a writer targeting *warndest*.
    """
    if overwriteWarnings:
        # have to dynamically override the default showwarning since there are no
        # public methods that specify the 'file' parameter
        def _showwarning(message, category, filename, lineno, file=warndest, line=None):
            if file is None:
                file = sys.stderr
            try:
                file.write(formatWarning(message, category, filename, lineno, line))
            except IOError:
                # Best effort: a broken warning sink must not abort parsing.
                pass
        warnings.showwarning = _showwarning
    self.strict = strict
    self.flattenedPages = None
    self.resolvedObjects = {}
    self.xrefIndex = 0
    self._pageId2Num = None # map page IndirectRef number to Page Number
    # Must exist before read(): read() hands ``self`` to readObject(), and
    # getObject() consults both ``_override_encryption`` and ``stream``.
    self._override_encryption = False
    if hasattr(stream, 'mode') and 'b' not in stream.mode:
        warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning)
    if isString(stream):
        # The context manager closes the file even if reading it fails.
        with open(stream, 'rb') as fileobj:
            stream = BytesIO(b_(fileobj.read()))
    self.stream = stream
    self.read(stream)
def getNumPages(self):
    """
    Calculates the number of pages in this PDF file.

    For an encrypted file the empty password is tried and the ``/Count``
    entry of the page tree root is returned; otherwise the page tree is
    flattened (once) and the number of flattened pages is returned.

    :return: number of pages
    :rtype: int
    :raises PdfReadError: if file is encrypted and restrictions prevent
        this action.
    """

    # Flattened pages will not work on an Encrypted PDF;
    # the PDF file's page count is used in this case. Otherwise,
    # the original method (flattened page count) is used.
    if self.isEncrypted:
        try:
            self._override_encryption = True
            # NOTE(review): decrypt() resets _override_encryption in its own
            # ``finally``, so the lookup below runs with the flag cleared --
            # confirm whether that is intended.
            self.decrypt('')
            return self.trailer["/Root"]["/Pages"]["/Count"]
        except Exception:
            # Only ordinary errors are translated; KeyboardInterrupt and
            # SystemExit are allowed to propagate unchanged.
            raise utils.PdfReadError("File has not been decrypted")
        finally:
            self._override_encryption = False

    if self.flattenedPages is None:
        self._flatten()
    return len(self.flattenedPages)
def getFields(self, tree = None, retval = None, fileobj = None):
    """
    Extracts field data if this PDF contains interactive form fields.
    The *tree* and *retval* parameters are for recursive use: when
    *retval* is omitted, *tree* is replaced by the catalog's ``/AcroForm``
    entry.

    :param fileobj: A file object (usually a text file) to write
        a report to on all interactive form fields found.
    :return: A dictionary where each key is a field name, and each
        value is a :class:`Field<PyPDF2.generic.Field>` object. By
        default, the mapping name is used for keys.
    :rtype: dict, or ``None`` if form data could not be located.
    """
    # A select group of relevant field attributes. For the complete list,
    # see section 8.6.2 of the PDF 1.7 reference.
    fieldAttributes = {"/FT" : "Field Type", "/Parent" : "Parent",
                       "/T" : "Field Name", "/TU" : "Alternate Field Name",
                       "/TM" : "Mapping Name", "/Ff" : "Field Flags",
                       "/V" : "Value", "/DV" : "Default Value"}
    if retval is None:
        # Top-level call: start from the document's AcroForm dictionary.
        retval = {}
        catalog = self.trailer["/Root"]
        if "/AcroForm" not in catalog:
            return None
        tree = catalog["/AcroForm"]
    if tree is None:
        return retval

    self._checkKids(tree, retval, fileobj)
    # A node carrying any known field attribute is itself a field.
    if any(attr in tree for attr in fieldAttributes):
        self._buildField(tree, retval, fileobj, fieldAttributes)

    if "/Fields" in tree:
        for f in tree["/Fields"]:
            self._buildField(f.getObject(), retval, fileobj, fieldAttributes)

    return retval
def getFormTextFields(self):
    """
    Retrieves the form fields whose field type (``/FT``) is ``/Tx``.

    :return: a dictionary mapping each such field's ``/T`` entry to its
        ``/V`` entry (``None`` when the field has no value). An empty
        dictionary is returned when :meth:`getFields` finds no form data.
    :rtype: dict
    """
    # Retrieve document form fields
    formfields = self.getFields()
    if formfields is None:
        # getFields() returns None for documents without an /AcroForm.
        return {}
    return dict(
        (field['/T'], field.get('/V'))
        for field in formfields.values()
        if field.get('/FT') == '/Tx'
    )
+ """ + if outlines == None: + outlines = [] + catalog = self.trailer["/Root"] + + # get the outline dictionary and named destinations + if "/Outlines" in catalog: + try: + lines = catalog["/Outlines"] + except utils.PdfReadError: + # this occurs if the /Outlines object reference is incorrect + # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf + # so continue to load the file without the Bookmarks + return outlines + + if "/First" in lines: + node = lines["/First"] + self._namedDests = self.getNamedDestinations() + + if node == None: + return outlines + + # see if there are any more outlines + while True: + outline = self._buildOutline(node) + if outline: + outlines.append(outline) + + # check for sub-outlines + if "/First" in node: + subOutlines = [] + self.getOutlines(node["/First"], subOutlines) + if subOutlines: + outlines.append(subOutlines) + + if "/Next" not in node: + break + node = node["/Next"] + + return outlines + + def _getPageNumberByIndirect(self, indirectRef): + """Generate _pageId2Num""" + if self._pageId2Num is None: + id2num = {} + for i, x in enumerate(self.pages): + id2num[x.indirectRef.idnum] = i + self._pageId2Num = id2num + + if isinstance(indirectRef, int): + idnum = indirectRef + else: + idnum = indirectRef.idnum + + ret = self._pageId2Num.get(idnum, -1) + return ret + + def getPageNumber(self, page): + """ + Retrieve page number of a given PageObject + + :param PageObject page: The page to get page number. Should be + an instance of :class:`PageObject<PyPDF2.pdf.PageObject>` + :return: the page number or -1 if page not found + :rtype: int + """ + indirectRef = page.indirectRef + ret = self._getPageNumberByIndirect(indirectRef) + return ret + + def getDestinationPageNumber(self, destination): + """ + Retrieve page number of a given Destination object + + :param Destination destination: The destination to get page number. 
def getPageLayout(self):
    """
    Get the page layout recorded in the document root.
    See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>`
    for a description of valid layouts.

    :return: the ``/PageLayout`` entry of the trailer's ``/Root``.
    :rtype: ``str``, ``None`` if not specified
    """
    try:
        layout = self.trailer['/Root']['/PageLayout']
    except KeyError:
        # A missing key anywhere along the lookup means "no layout set".
        layout = None
    return layout
def _flatten(self, pages=None, inherit=None, indirectRef=None):
    """
    Walk the page tree depth-first and collect its leaves in
    ``self.flattenedPages``.

    :param pages: the tree node to process; when omitted, the list is
        reset and the walk starts at the catalog's ``/Pages`` entry.
    :param dict inherit: inheritable attributes gathered from the
        ancestors of *pages*.
    :param indirectRef: the indirect reference through which *pages* was
        reached, handed to the ``PageObject`` built for a leaf.
    """
    inheritablePageAttributes = (
        NameObject("/Resources"), NameObject("/MediaBox"),
        NameObject("/CropBox"), NameObject("/Rotate")
        )
    if inherit is None:
        inherit = dict()
    if pages is None:
        self.flattenedPages = []
        catalog = self.trailer["/Root"].getObject()
        pages = catalog["/Pages"].getObject()

    # A node without /Type is treated as an intermediate /Pages node.
    t = "/Pages"
    if "/Type" in pages:
        t = pages["/Type"]

    if t == "/Pages":
        # Work on a copy: attributes set on this node apply to its own
        # descendants only and must not leak into sibling subtrees that
        # are visited later with the parent's dictionary.
        inherit = dict(inherit)
        for attr in inheritablePageAttributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in pages["/Kids"]:
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirectRef"] = page
            self._flatten(page.getObject(), inherit, **addt)
    elif t == "/Page":
        for attr, value in list(inherit.items()):
            # if the page has its own value, it does not inherit the
            # parent's value:
            if attr not in pages:
                pages[attr] = value
        pageObj = PageObject(self, indirectRef)
        pageObj.update(pages)
        self.flattenedPages.append(pageObj)
stmnum=%s data=%s"%(objStm, stmnum, objStm.getData()))) + # This is an xref to a stream, so its type better be a stream + assert objStm['/Type'] == '/ObjStm' + # /N is the number of indirect objects in the stream + assert idx < objStm['/N'] + streamData = BytesIO(b_(objStm.getData())) + for i in range(objStm['/N']): + readNonWhitespace(streamData) + streamData.seek(-1, 1) + objnum = NumberObject.readFromStream(streamData) + readNonWhitespace(streamData) + streamData.seek(-1, 1) + offset = NumberObject.readFromStream(streamData) + readNonWhitespace(streamData) + streamData.seek(-1, 1) + if objnum != indirectReference.idnum: + # We're only interested in one object + continue + if self.strict and idx != i: + raise utils.PdfReadError("Object is in wrong index.") + streamData.seek(objStm['/First']+offset, 0) + if debug: + pos = streamData.tell() + streamData.seek(0, 0) + lines = streamData.readlines() + for i in range(0, len(lines)): + print((lines[i])) + streamData.seek(pos, 0) + try: + obj = readObject(streamData, self) + except utils.PdfStreamError as e: + # Stream object cannot be read. Normally, a critical error, but + # Adobe Reader doesn't complain, so continue (in strict mode?) + e = sys.exc_info()[1] + warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \ + (i, indirectReference.idnum, indirectReference.generation, e), utils.PdfReadWarning) + + if self.strict: + raise utils.PdfReadError("Can't read object stream: %s"%e) + # Replace with null. Hopefully it's nothing important. 
+ obj = NullObject() + return obj + + if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.") + return NullObject() + + def getObject(self, indirectReference): + debug = False + if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation)) + retval = self.cacheGetIndirectObject(indirectReference.generation, + indirectReference.idnum) + if retval != None: + return retval + if indirectReference.generation == 0 and \ + indirectReference.idnum in self.xref_objStm: + retval = self._getObjectFromStream(indirectReference) + elif indirectReference.generation in self.xref and \ + indirectReference.idnum in self.xref[indirectReference.generation]: + start = self.xref[indirectReference.generation][indirectReference.idnum] + if debug: print((" Uncompressed Object", indirectReference.idnum, indirectReference.generation, ":", start)) + self.stream.seek(start, 0) + idnum, generation = self.readObjectHeader(self.stream) + if idnum != indirectReference.idnum and self.xrefIndex: + # Xref table probably had bad indexes due to not being zero-indexed + if self.strict: + raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \ + % (indirectReference.idnum, indirectReference.generation, idnum, generation)) + else: pass # xref table is corrected in non-strict mode + elif idnum != indirectReference.idnum and self.strict: + # some other problem + raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." 
def _decryptObject(self, obj, key):
    """
    Run *obj* through ``utils.RC4_encrypt`` with *key*, descending into
    containers.

    String objects are replaced by a new string object built from the
    processed bytes. Stream objects have their ``_data`` processed in
    place (their dictionary entries are left untouched). Dictionaries and
    arrays are updated member by member. Any other object is returned
    unchanged.

    :return: the processed object (a new object for strings, otherwise
        *obj* itself).
    """
    if isinstance(obj, (ByteStringObject, TextStringObject)):
        return createStringObject(utils.RC4_encrypt(key, obj.original_bytes))
    # Checked before DictionaryObject so that streams take this branch.
    if isinstance(obj, StreamObject):
        obj._data = utils.RC4_encrypt(key, obj._data)
        return obj
    if isinstance(obj, DictionaryObject):
        for name, member in list(obj.items()):
            obj[name] = self._decryptObject(member, key)
        return obj
    if isinstance(obj, ArrayObject):
        for position, member in enumerate(obj):
            obj[position] = self._decryptObject(member, key)
    return obj
def cacheIndirectObject(self, generation, idnum, obj):
    """
    Store *obj* in ``self.resolvedObjects`` under ``(generation, idnum)``.

    If an entry already exists for that key, strict mode raises
    ``utils.PdfReadError``; otherwise a warning is issued and the entry is
    overwritten.

    :return: *obj*, unchanged.
    """
    # return None # Sometimes we want to turn off cache for debugging.
    cacheKey = (generation, idnum)
    if cacheKey in self.resolvedObjects:
        message = "Overwriting cache for %s %s" % cacheKey
        if self.strict:
            raise utils.PdfReadError(message)
        warnings.warn(message)
    self.resolvedObjects[cacheKey] = obj
    return obj
warnings.warn("startxref on same line as offset") + else: + line = self.readNextEndLine(stream) + if line[:9] != b_("startxref"): + raise utils.PdfReadError("startxref not found") + + # read all cross reference tables and their trailers + self.xref = {} + self.xref_objStm = {} + self.trailer = DictionaryObject() + while True: + # load the xref table + stream.seek(startxref, 0) + x = stream.read(1) + if x == b_("x"): + # standard cross-reference table + ref = stream.read(4) + if ref[:3] != b_("ref"): + raise utils.PdfReadError("xref table read error") + readNonWhitespace(stream) + stream.seek(-1, 1) + firsttime = True; # check if the first time looking at the xref table + while True: + num = readObject(stream, self) + if firsttime and num != 0: + self.xrefIndex = num + if self.strict: + warnings.warn("Xref table not zero-indexed. ID numbers for objects will be corrected.", utils.PdfReadWarning) + #if table not zero indexed, could be due to error from when PDF was created + #which will lead to mismatched indices later on, only warned and corrected if self.strict=True + firsttime = False + readNonWhitespace(stream) + stream.seek(-1, 1) + size = readObject(stream, self) + readNonWhitespace(stream) + stream.seek(-1, 1) + cnt = 0 + while cnt < size: + line = stream.read(20) + + # It's very clear in section 3.4.3 of the PDF spec + # that all cross-reference table lines are a fixed + # 20 bytes (as of PDF 1.7). However, some files have + # 21-byte entries (or more) due to the use of \r\n + # (CRLF) EOL's. Detect that case, and adjust the line + # until it does not begin with a \r (CR) or \n (LF). + while line[0] in b_("\x0D\x0A"): + stream.seek(-20 + 1, 1) + line = stream.read(20) + + # On the other hand, some malformed PDF files + # use a single character EOL without a preceeding + # space. Detect that case, and seek the stream + # back one character. 
(0-9 means we've bled into + # the next xref entry, t means we've bled into the + # text "trailer"): + if line[-1] in b_("0123456789t"): + stream.seek(-1, 1) + + offset, generation = line[:16].split(b_(" ")) + offset, generation = int(offset), int(generation) + if generation not in self.xref: + self.xref[generation] = {} + if num in self.xref[generation]: + # It really seems like we should allow the last + # xref table in the file to override previous + # ones. Since we read the file backwards, assume + # any existing key is already set correctly. + pass + else: + self.xref[generation][num] = offset + cnt += 1 + num += 1 + readNonWhitespace(stream) + stream.seek(-1, 1) + trailertag = stream.read(7) + if trailertag != b_("trailer"): + # more xrefs! + stream.seek(-7, 1) + else: + break + readNonWhitespace(stream) + stream.seek(-1, 1) + newTrailer = readObject(stream, self) + for key, value in list(newTrailer.items()): + if key not in self.trailer: + self.trailer[key] = value + if "/Prev" in newTrailer: + startxref = newTrailer["/Prev"] + else: + break + elif x.isdigit(): + # PDF 1.5+ Cross-Reference Stream + stream.seek(-1, 1) + idnum, generation = self.readObjectHeader(stream) + xrefstream = readObject(stream, self) + assert xrefstream["/Type"] == "/XRef" + self.cacheIndirectObject(generation, idnum, xrefstream) + streamData = BytesIO(b_(xrefstream.getData())) + # Index pairs specify the subsections in the dictionary. If + # none create one subsection that spans everything. + idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) + if debug: print(("read idx_pairs=%s"%list(self._pairs(idx_pairs)))) + entrySizes = xrefstream.get("/W") + assert len(entrySizes) >= 3 + if self.strict and len(entrySizes) > 3: + raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes) + + def getEntry(i): + # Reads the correct number of bytes for each entry. See the + # discussion of the W parameter in PDF spec table 17. 
+ if entrySizes[i] > 0: + d = streamData.read(entrySizes[i]) + return convertToInt(d, entrySizes[i]) + + # PDF Spec Table 17: A value of zero for an element in the + # W array indicates...the default value shall be used + if i == 0: return 1 # First value defaults to 1 + else: return 0 + + def used_before(num, generation): + # We move backwards through the xrefs, don't replace any. + return num in self.xref.get(generation, []) or \ + num in self.xref_objStm + + # Iterate through each subsection + last_end = 0 + for start, size in self._pairs(idx_pairs): + # The subsections must increase + assert start >= last_end + last_end = start + size + for num in range(start, start+size): + # The first entry is the type + xref_type = getEntry(0) + # The rest of the elements depend on the xref_type + if xref_type == 0: + # linked list of free objects + next_free_object = getEntry(1) + next_generation = getEntry(2) + elif xref_type == 1: + # objects that are in use but are not compressed + byte_offset = getEntry(1) + generation = getEntry(2) + if generation not in self.xref: + self.xref[generation] = {} + if not used_before(num, generation): + self.xref[generation][num] = byte_offset + if debug: print(("XREF Uncompressed: %s %s"%( + num, generation))) + elif xref_type == 2: + # compressed objects + objstr_num = getEntry(1) + obstr_idx = getEntry(2) + generation = 0 # PDF spec table 18, generation is 0 + if not used_before(num, generation): + if debug: print(("XREF Compressed: %s %s %s"%( + num, objstr_num, obstr_idx))) + self.xref_objStm[num] = (objstr_num, obstr_idx) + elif self.strict: + raise utils.PdfReadError("Unknown xref type: %s"% + xref_type) + + trailerKeys = "/Root", "/Encrypt", "/Info", "/ID" + for key in trailerKeys: + if key in xrefstream and key not in self.trailer: + self.trailer[NameObject(key)] = xrefstream.raw_get(key) + if "/Prev" in xrefstream: + startxref = xrefstream["/Prev"] + else: + break + else: + # bad xref character at startxref. 
Let's see if we can find + # the xref table nearby, as we've observed this error with an + # off-by-one before. + stream.seek(-11, 1) + tmp = stream.read(20) + xref_loc = tmp.find(b_("xref")) + if xref_loc != -1: + startxref -= (10 - xref_loc) + continue + # No explicit xref table, try finding a cross-reference stream. + stream.seek(startxref, 0) + found = False + for look in range(5): + if stream.read(1).isdigit(): + # This is not a standard PDF, consider adding a warning + startxref += look + found = True + break + if found: + continue + # no xref table found at specified location + raise utils.PdfReadError("Could not find xref table at specified location") + #if not zero-indexed, verify that the table is correct; change it if necessary + if self.xrefIndex and not self.strict: + loc = stream.tell() + for gen in self.xref: + if gen == 65535: continue + for id in self.xref[gen]: + stream.seek(self.xref[gen][id], 0) + try: + pid, pgen = self.readObjectHeader(stream) + except ValueError: + break + if pid == id - self.xrefIndex: + self._zeroXref(gen) + break + #if not, then either it's just plain wrong, or the non-zero-index is actually correct + stream.seek(loc, 0) #return to where it was + + def _zeroXref(self, generation): + self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) ) + + def _pairs(self, array): + i = 0 + while True: + yield array[i], array[i+1] + i += 2 + if (i+1) >= len(array): + break + + def readNextEndLine(self, stream): + debug = False + if debug: print(">>readNextEndLine") + line = b_("") + while True: + # Prevent infinite loops in malformed PDFs + if stream.tell() == 0: + raise utils.PdfReadError("Could not read malformed PDF file") + x = stream.read(1) + if debug: print((" x:", x, "%x"%ord(x))) + if stream.tell() < 2: + raise utils.PdfReadError("EOL marker not found") + stream.seek(-2, 1) + if x == b_('\n') or x == b_('\r'): ## \n = LF; \r = CR + crlf = False + while x == b_('\n') or x == 
def decrypt(self, password):
    """
    Try to unlock a document protected by the PDF Standard encryption
    handler.

    The supplied password is tested against both the user password and the
    owner password of the document. Whichever one matches, the resulting
    decryption key is stored on the reader, so the caller does not need to
    know which kind of password it holds.

    :param str password: The password to match.
    :return: ``0`` if the password failed, ``1`` if the password matched the user
        password, and ``2`` if the password matched the owner password.
    :rtype: int
    :raises NotImplementedError: if document uses an unsupported encryption
        method.
    """
    # The override flag is raised only for the duration of the check and is
    # always lowered again, even when _decrypt() raises.
    self._override_encryption = True
    try:
        outcome = self._decrypt(password)
    finally:
        self._override_encryption = False
    return outcome
def _authenticateUserPassword(self, password):
    """
    Check *password* against the ``/U`` entry of the encryption dictionary.

    The expected ``/U`` value is recomputed with ``_alg34`` for revision 2
    and with ``_alg35`` for revision 3 and above, then compared with the
    value stored in the document.

    :param password: the candidate user password.
    :return: a ``(matched, key)`` tuple; ``matched`` is ``True`` when the
        recomputed value equals the stored one and ``key`` is the key
        returned by the algorithm.
    :raises NotImplementedError: if ``/R`` is below 2. Previously this case
        fell through both branches and failed with ``UnboundLocalError``.
    """
    encrypt = self.trailer['/Encrypt'].getObject()
    rev = encrypt['/R'].getObject()
    owner_entry = encrypt['/O'].getObject()
    p_entry = encrypt['/P'].getObject()
    id_entry = self.trailer['/ID'].getObject()
    id1_entry = id_entry[0].getObject()
    real_U = encrypt['/U'].getObject().original_bytes
    if rev == 2:
        U, key = _alg34(password, owner_entry, p_entry, id1_entry)
    elif rev >= 3:
        U, key = _alg35(password, rev,
                        encrypt["/Length"].getObject() // 8, owner_entry,
                        p_entry, id1_entry,
                        encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject())
        # Only the first 16 bytes are compared for these revisions.
        U, real_U = U[:16], real_U[:16]
    else:
        raise NotImplementedError(
            "unsupported security handler revision %s" % rev)
    return U == real_U, key
def getRectangle(self, name, defaults):
    """
    Return the rectangle stored under *name*, trying *defaults* as fallbacks.

    If the entry already is a ``RectangleObject`` it is returned unchanged.
    Otherwise the first non-missing entry among *name* and *defaults* is
    resolved (when it is an ``IndirectObject``), wrapped in a
    ``RectangleObject`` and stored back under *name* via ``setRectangle``
    before being returned.

    :param self: the dictionary-like object holding the entries; its ``pdf``
        attribute is used to resolve indirect references.
    :param name: key of the wanted rectangle.
    :param defaults: iterable of keys to try, in order, when *name* is
        missing.
    :return: the ``RectangleObject`` for *name*.
    """
    retval = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    # Identity checks: "missing" means exactly None, not any object that
    # happens to compare equal to it.
    if retval is None:
        for d in defaults:
            retval = self.get(d)
            if retval is not None:
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.getObject(retval)
    retval = RectangleObject(retval)
    setRectangle(self, name, retval)
    return retval
def rotateClockwise(self, angle):
    """
    Turn the page clockwise by a multiple of 90 degrees.

    :param int angle: Rotation to add, in degrees; must be a multiple
        of 90.
    :return: this page, so that calls can be chained.
    """
    remainder = angle % 90
    assert remainder == 0
    self._rotate(angle)
    return self
def _contentStreamRename(stream, rename, pdf):
    """
    Apply a resource-name mapping to the operands of a content stream.

    :param stream: the content to rewrite; returned untouched when *rename*
        is empty.
    :param rename: mapping of old names to new names.
    :param pdf: passed through to ``ContentStream``.
    :return: *stream* itself when there is nothing to rename, otherwise a
        ``ContentStream`` whose ``NameObject`` operands found in *rename*
        have been replaced.
    """
    if not rename:
        return stream
    stream = ContentStream(stream, pdf)
    for operands, operator in stream.operations:
        # Inline-image operations store a dict (settings/data) instead of an
        # operand list; indexing it by position raised KeyError, so skip it.
        if not isinstance(operands, list):
            continue
        for i, op in enumerate(operands):
            if isinstance(op, NameObject):
                operands[i] = rename.get(op, op)
    return stream
_contentStreamRename = staticmethod(_contentStreamRename)
def getContents(self):
    """
    Return the resolved ``/Contents`` entry of the page.

    :return: the ``/Contents`` object, or ``None`` when the page has no
        such entry (``/Contents`` is optional, see PDF Reference 7.7.3.3).
    """
    if "/Contents" not in self:
        return None
    return self["/Contents"].getObject()
+ newResources[NameObject("/ProcSet")] = ArrayObject( + frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union( + frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject()) + ) + ) + + newContentArray = ArrayObject() + + originalContent = self.getContents() + if originalContent is not None: + newContentArray.append(PageObject._pushPopGS( + originalContent, self.pdf)) + + page2Content = page2.getContents() + if page2Content is not None: + if page2transformation is not None: + page2Content = page2transformation(page2Content) + page2Content = PageObject._contentStreamRename( + page2Content, rename, self.pdf) + page2Content = PageObject._pushPopGS(page2Content, self.pdf) + newContentArray.append(page2Content) + + # if expanding the page to fit a new page, calculate the new media box size + if expand: + corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(), + self.mediaBox.getUpperRight_x().as_numeric(), self.mediaBox.getUpperRight_y().as_numeric()] + corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(), + page2.mediaBox.getUpperLeft_x().as_numeric(), page2.mediaBox.getUpperLeft_y().as_numeric(), + page2.mediaBox.getUpperRight_x().as_numeric(), page2.mediaBox.getUpperRight_y().as_numeric(), + page2.mediaBox.getLowerRight_x().as_numeric(), page2.mediaBox.getLowerRight_y().as_numeric()] + if ctm is not None: + ctm = [float(x) for x in ctm] + new_x = [ctm[0]*corners2[i] + ctm[2]*corners2[i+1] + ctm[4] for i in range(0, 8, 2)] + new_y = [ctm[1]*corners2[i] + ctm[3]*corners2[i+1] + ctm[5] for i in range(0, 8, 2)] + else: + new_x = corners2[0:8:2] + new_y = corners2[1:8:2] + lowerleft = [min(new_x), min(new_y)] + upperright = [max(new_x), max(new_y)] + lowerleft = [min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])] + upperright = [max(corners1[2], upperright[0]), max(corners1[3], upperright[1])] + + 
def mergeScaledPage(self, page2, scale, expand=False):
    """
    Merge *page2* into this page after scaling its content stream by the
    same factor on both axes.

    :param PageObject page2: The page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    :return: whatever ``mergeTransformedPage`` returns.
    """
    # Scaling CTM has the form [ sx 0 0 sy 0 0 ]; here sx == sy == scale.
    scaling_ctm = [scale, 0, 0, scale, 0, 0]
    return self.mergeTransformedPage(page2, scaling_ctm, expand)
def mergeTranslatedPage(self, page2, tx, ty, expand=False):
    """
    Merge *page2* into this page after shifting its content stream by
    ``(tx, ty)``.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    :return: whatever ``mergeTransformedPage`` returns.
    """
    # Identity matrix in the first four operands, offset in the last two.
    translation_ctm = [1, 0, 0, 1, tx, ty]
    return self.mergeTransformedPage(page2, translation_ctm, expand)
def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False):
    """
    Merge *page2* into this page with a combined rotation and uniform
    scaling applied to its content stream.

    The transformation matrix handed to ``mergeTransformedPage`` is the
    product ``rotating x scaling`` computed by ``utils.matrixMultiply``.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float rotation: The angle of the rotation, in degrees
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    :return: whatever ``mergeTransformedPage`` returns.
    """
    theta = math.radians(rotation)
    cos_t = math.cos(theta)
    sin_t = math.sin(theta)
    rotating = [[cos_t, sin_t, 0],
                [-sin_t, cos_t, 0],
                [0, 0, 1]]
    scaling = [[scale, 0, 0],
               [0, scale, 0],
               [0, 0, 1]]
    ctm = utils.matrixMultiply(rotating, scaling)

    # The six CTM operands are the first two columns of the three rows.
    operands = [ctm[row][col] for row in (0, 1, 2) for col in (0, 1)]
    return self.mergeTransformedPage(page2, operands, expand)
def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False):
    """
    Merge *page2* into this page with rotation, uniform scaling and
    translation applied to its content stream.

    The transformation matrix handed to ``mergeTransformedPage`` is the
    product ``rotating x scaling x translation`` computed with
    ``utils.matrixMultiply``.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param float rotation: The angle of the rotation, in degrees
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    :return: whatever ``mergeTransformedPage`` returns.
    """
    theta = math.radians(rotation)
    cos_t = math.cos(theta)
    sin_t = math.sin(theta)
    rotating = [[cos_t, sin_t, 0],
                [-sin_t, cos_t, 0],
                [0, 0, 1]]
    scaling = [[scale, 0, 0],
               [0, scale, 0],
               [0, 0, 1]]
    translation = [[1, 0, 0],
                   [0, 1, 0],
                   [tx, ty, 1]]
    ctm = utils.matrixMultiply(
        utils.matrixMultiply(rotating, scaling), translation)

    # The six CTM operands are the first two columns of the three rows.
    operands = [ctm[row][col] for row in (0, 1, 2) for col in (0, 1)]
    return self.mergeTransformedPage(page2, operands, expand)
def scaleBy(self, factor):
    """
    Scale the page uniformly: the same factor is handed to ``scale`` for
    the horizontal and the vertical axis.

    :param float factor: The scaling factor (for both X and Y axis).
    """
    sx = sy = factor
    self.scale(sx, sy)
def extractText(self):
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text. This works well for some PDF
    files, but poorly for others, depending on the generator used. Do not
    rely on the order of text coming out of this function, as it may change
    if this function is made more sophisticated.

    A page without a ``/Contents`` entry yields an empty string (it used to
    raise ``KeyError``).

    :return: a unicode string object.
    """
    content = self.getContents()
    if content is None:
        return u_("")
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    # Fragments are collected and joined once at the end instead of growing
    # a string with repeated "+=".
    pieces = []
    # Note: we check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    for operands, operator in content.operations:
        if operator == b_("Tj"):
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                pieces.append(_text)
                pieces.append("\n")
        elif operator == b_("T*"):
            pieces.append("\n")
        elif operator == b_("'"):
            pieces.append("\n")
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                pieces.append(_text)
        elif operator == b_('"'):
            # The string is the third operand of this operator.
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                pieces.append("\n")
                pieces.append(_text)
        elif operator == b_("TJ"):
            # The single operand is an array; only its text strings are kept.
            for item in operands[0]:
                if isinstance(item, TextStringObject):
                    pieces.append(item)
            pieces.append("\n")
    return u_("").join(pieces)
class ContentStream(DecodedStreamObject):
    """
    Parsed form of a page content stream.

    ``operations`` holds one ``(operands, operator)`` entry per operator
    found in the stream. Inline images are stored as a single entry whose
    operands are a ``{"settings": ..., "data": ...}`` dict and whose
    operator is ``b"INLINE IMAGE"``.

    :param stream: a stream object, or an array of stream objects whose
        data is concatenated before parsing.
    :param pdf: kept as ``self.pdf`` and passed to ``readObject`` while
        reading inline-image dictionaries.
    """
    def __init__(self, stream, pdf):
        self.pdf = pdf
        self.operations = []
        # stream may be a StreamObject or an ArrayObject containing
        # multiple StreamObjects to be cat'd together.
        stream = stream.getObject()
        if isinstance(stream, ArrayObject):
            data = b_("").join(b_(s.getObject().getData()) for s in stream)
            stream = BytesIO(b_(data))
        else:
            stream = BytesIO(b_(stream.getData()))
        self.__parseContentStream(stream)

    def __parseContentStream(self, stream):
        """Read *stream* from its start and append to ``self.operations``."""
        stream.seek(0, 0)
        operands = []
        while True:
            peek = readNonWhitespace(stream)
            if peek == b_('') or ord_(peek) == 0:
                break
            stream.seek(-1, 1)
            if peek.isalpha() or peek == b_("'") or peek == b_('"'):
                operator = utils.readUntilRegex(stream,
                        NameObject.delimiterPattern, True)
                if operator == b_("BI"):
                    # begin inline image - a completely different parsing
                    # mechanism is required.
                    assert operands == []
                    ii = self._readInlineImage(stream)
                    self.operations.append((ii, b_("INLINE IMAGE")))
                else:
                    self.operations.append((operands, operator))
                    operands = []
            elif peek == b_('%'):
                # If we encounter a comment in the content stream, we have to
                # handle it here.  Typically, readObject will handle
                # encountering a comment -- but readObject assumes that
                # following the comment must be the object we're trying to
                # read.  In this case, it could be an operator instead.
                # An empty read means end of stream: stop there too, or a
                # comment on the last line would spin forever.
                while peek not in (b_('\r'), b_('\n'), b_('')):
                    peek = stream.read(1)
            else:
                operands.append(readObject(stream, None))

    def _readInlineImage(self, stream):
        """
        Read an inline image, starting just after its ``BI`` operator.

        :return: ``{"settings": DictionaryObject, "data": bytes}``.
        :raises utils.PdfReadError: if the stream ends before the image is
            terminated (these cases used to loop forever).
        """
        # first read the dictionary of settings.
        settings = DictionaryObject()
        while True:
            tok = readNonWhitespace(stream)
            if not tok:
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image dictionary")
            stream.seek(-1, 1)
            if tok == b_("I"):
                # "ID" - begin of image data
                break
            key = readObject(stream, self.pdf)
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, self.pdf)
            settings[key] = value
        # left at beginning of ID
        tmp = stream.read(3)
        assert tmp[:2] == b_("ID")
        chunks = []
        while True:
            # Read the inline image, while checking for EI (End Image) operator.
            tok = stream.read(1)
            if not tok:
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image data")
            if tok == b_("E"):
                # Check for End Image
                tok2 = stream.read(1)
                if tok2 == b_("I"):
                    # Data can contain EI, so check for the Q operator.
                    tok3 = stream.read(1)
                    info = tok + tok2
                    # We need to find whitespace between EI and Q.
                    has_q_whitespace = False
                    while tok3 and tok3 in utils.WHITESPACES:
                        has_q_whitespace = True
                        info += tok3
                        tok3 = stream.read(1)
                    if tok3 == b_("Q") and has_q_whitespace:
                        stream.seek(-1, 1)
                        break
                    else:
                        # Step back only over a byte that was really read.
                        if tok3:
                            stream.seek(-1, 1)
                        chunks.append(info)
                else:
                    if tok2:
                        stream.seek(-1, 1)
                    chunks.append(tok)
            else:
                chunks.append(tok)
        return {"settings": settings, "data": b_("").join(chunks)}

    def _getData(self):
        """Serialise ``self.operations`` back into content-stream bytes."""
        newdata = BytesIO()
        for operands, operator in self.operations:
            if operator == b_("INLINE IMAGE"):
                newdata.write(b_("BI"))
                dicttext = BytesIO()
                operands["settings"].writeToStream(dicttext, None)
                # Drop the first and last two bytes of the written dictionary.
                newdata.write(dicttext.getvalue()[2:-2])
                newdata.write(b_("ID "))
                newdata.write(operands["data"])
                newdata.write(b_("EI"))
            else:
                for op in operands:
                    op.writeToStream(newdata, None)
                    newdata.write(b_(" "))
                newdata.write(b_(operator))
            newdata.write(b_("\n"))
        return newdata.getvalue()

    def _setData(self, value):
        """Parse *value*; the operations found are appended to the list."""
        self.__parseContentStream(BytesIO(b_(value)))

    _data = property(_getData, _setData)
+ """ + + def __init__(self): + DictionaryObject.__init__(self) + + def getText(self, key): + retval = self.get(key, None) + if isinstance(retval, TextStringObject): + return retval + return None + + title = property(lambda self: self.getText("/Title")) + """Read-only property accessing the document's **title**. + Returns a unicode string (``TextStringObject``) or ``None`` + if the title is not specified.""" + title_raw = property(lambda self: self.get("/Title")) + """The "raw" version of title; can return a ``ByteStringObject``.""" + + author = property(lambda self: self.getText("/Author")) + """Read-only property accessing the document's **author**. + Returns a unicode string (``TextStringObject``) or ``None`` + if the author is not specified.""" + author_raw = property(lambda self: self.get("/Author")) + """The "raw" version of author; can return a ``ByteStringObject``.""" + + subject = property(lambda self: self.getText("/Subject")) + """Read-only property accessing the document's **subject**. + Returns a unicode string (``TextStringObject``) or ``None`` + if the subject is not specified.""" + subject_raw = property(lambda self: self.get("/Subject")) + """The "raw" version of subject; can return a ``ByteStringObject``.""" + + creator = property(lambda self: self.getText("/Creator")) + """Read-only property accessing the document's **creator**. If the + document was converted to PDF from another format, this is the name of the + application (e.g. OpenOffice) that created the original document from + which it was converted. Returns a unicode string (``TextStringObject``) + or ``None`` if the creator is not specified.""" + creator_raw = property(lambda self: self.get("/Creator")) + """The "raw" version of creator; can return a ``ByteStringObject``.""" + + producer = property(lambda self: self.getText("/Producer")) + """Read-only property accessing the document's **producer**. 
def convertToInt(d, size):
    """
    Decode a big-endian byte field of at most 8 bytes as an unsigned integer.

    The field is left-padded with zero bytes to 8 bytes before unpacking,
    so an empty field decodes to 0. It is unpacked as unsigned: with the
    previous signed format an 8-byte field with its top bit set came out
    negative.

    :param d: the raw field.
    :param int size: declared width of the field in bytes.
    :return: the decoded non-negative integer.
    :raises utils.PdfReadError: if *size* is larger than 8.
    """
    if size > 8:
        raise utils.PdfReadError("invalid size in convertToInt")
    if not isinstance(d, bytes):
        # Text input: one code point per byte.
        d = d.encode("latin-1")
    padded = (b"\x00" * 8 + d)[-8:]
    return struct.unpack(">Q", padded)[0]
+ m.update(id1_entry.original_bytes) + # 6. (Revision 3 or greater) If document metadata is not being encrypted, + # pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function. + if rev >= 3 and not metadata_encrypt: + m.update(b_("\xff\xff\xff\xff")) + # 7. Finish the hash. + md5_hash = m.digest() + # 8. (Revision 3 or greater) Do the following 50 times: Take the output + # from the previous MD5 hash and pass the first n bytes of the output as + # input into a new MD5 hash, where n is the number of bytes of the + # encryption key as defined by the value of the encryption dictionary's + # /Length entry. + if rev >= 3: + for i in range(50): + md5_hash = md5(md5_hash[:keylen]).digest() + # 9. Set the encryption key to the first n bytes of the output from the + # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or + # greater, depends on the value of the encryption dictionary's /Length + # entry. + return md5_hash[:keylen] + + +# Implementation of algorithm 3.3 of the PDF standard security handler, +# section 3.5.2 of the PDF 1.6 reference. +def _alg33(owner_pwd, user_pwd, rev, keylen): + # steps 1 - 4 + key = _alg33_1(owner_pwd, rev, keylen) + # 5. Pad or truncate the user password string as described in step 1 of + # algorithm 3.2. + user_pwd = b_((user_pwd + str_(_encryption_padding))[:32]) + # 6. Encrypt the result of step 5, using an RC4 encryption function with + # the encryption key obtained in step 4. + val = utils.RC4_encrypt(key, user_pwd) + # 7. (Revision 3 or greater) Do the following 19 times: Take the output + # from the previous invocation of the RC4 function and pass it as input to + # a new invocation of the function; use an encryption key generated by + # taking each byte of the encryption key obtained in step 4 and performing + # an XOR operation between that byte and the single-byte value of the + # iteration counter (from 1 to 19). 
+ if rev >= 3: + for i in range(1, 20): + new_key = '' + for l in range(len(key)): + new_key += chr(ord_(key[l]) ^ i) + val = utils.RC4_encrypt(new_key, val) + # 8. Store the output from the final invocation of the RC4 as the value of + # the /O entry in the encryption dictionary. + return val + + +# Steps 1-4 of algorithm 3.3 +def _alg33_1(password, rev, keylen): + # 1. Pad or truncate the owner password string as described in step 1 of + # algorithm 3.2. If there is no owner password, use the user password + # instead. + password = b_((password + str_(_encryption_padding))[:32]) + # 2. Initialize the MD5 hash function and pass the result of step 1 as + # input to this function. + m = md5(password) + # 3. (Revision 3 or greater) Do the following 50 times: Take the output + # from the previous MD5 hash and pass it as input into a new MD5 hash. + md5_hash = m.digest() + if rev >= 3: + for i in range(50): + md5_hash = md5(md5_hash).digest() + # 4. Create an RC4 encryption key using the first n bytes of the output + # from the final MD5 hash, where n is always 5 for revision 2 but, for + # revision 3 or greater, depends on the value of the encryption + # dictionary's /Length entry. + key = md5_hash[:keylen] + return key + + +# Implementation of algorithm 3.4 of the PDF standard security handler, +# section 3.5.2 of the PDF 1.6 reference. +def _alg34(password, owner_entry, p_entry, id1_entry): + # 1. Create an encryption key based on the user password string, as + # described in algorithm 3.2. + key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry) + # 2. Encrypt the 32-byte padding string shown in step 1 of algorithm 3.2, + # using an RC4 encryption function with the encryption key from the + # preceding step. + U = utils.RC4_encrypt(key, _encryption_padding) + # 3. Store the result of step 2 as the value of the /U entry in the + # encryption dictionary. 
+ return U, key + + +# Implementation of algorithm 3.4 of the PDF standard security handler, +# section 3.5.2 of the PDF 1.6 reference. +def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt): + # 1. Create an encryption key based on the user password string, as + # described in Algorithm 3.2. + key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) + # 2. Initialize the MD5 hash function and pass the 32-byte padding string + # shown in step 1 of Algorithm 3.2 as input to this function. + m = md5() + m.update(_encryption_padding) + # 3. Pass the first element of the file's file identifier array (the value + # of the ID entry in the document's trailer dictionary; see Table 3.13 on + # page 73) to the hash function and finish the hash. (See implementation + # note 25 in Appendix H.) + m.update(id1_entry.original_bytes) + md5_hash = m.digest() + # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption + # function with the encryption key from step 1. + val = utils.RC4_encrypt(key, md5_hash) + # 5. Do the following 19 times: Take the output from the previous + # invocation of the RC4 function and pass it as input to a new invocation + # of the function; use an encryption key generated by taking each byte of + # the original encryption key (obtained in step 2) and performing an XOR + # operation between that byte and the single-byte value of the iteration + # counter (from 1 to 19). + for i in range(1, 20): + new_key = b_('') + for l in range(len(key)): + new_key += b_(chr(ord_(key[l]) ^ i)) + val = utils.RC4_encrypt(new_key, val) + # 6. Append 16 bytes of arbitrary padding to the output from the final + # invocation of the RC4 function and store the 32-byte result as the value + # of the U entry in the encryption dictionary. + # (implementator note: I don't know what "arbitrary padding" is supposed to + # mean, so I have used null bytes. 
This seems to match a few other + # people's implementations) + return val + (b_('\x00') * 16), key |