diff options
author | ben | 2018-09-18 10:52:38 +0200 |
---|---|---|
committer | ben | 2018-09-18 10:52:38 +0200 |
commit | f57654b84b4cf0ffa1287034fc9f66ba200bb259 (patch) | |
tree | 5ffb371ce5b5008052e425955f45c8b808ba7fa0 /PdfFileTransformer/PyPDF2/merger.py | |
download | truepolyglot-f57654b84b4cf0ffa1287034fc9f66ba200bb259.tar.gz truepolyglot-f57654b84b4cf0ffa1287034fc9f66ba200bb259.tar.bz2 truepolyglot-f57654b84b4cf0ffa1287034fc9f66ba200bb259.tar.xz |
First public commit
Diffstat (limited to 'PdfFileTransformer/PyPDF2/merger.py')
-rw-r--r-- | PdfFileTransformer/PyPDF2/merger.py | 553 |
1 files changed, 553 insertions, 0 deletions
diff --git a/PdfFileTransformer/PyPDF2/merger.py b/PdfFileTransformer/PyPDF2/merger.py new file mode 100644 index 0000000..c3373e4 --- /dev/null +++ b/PdfFileTransformer/PyPDF2/merger.py @@ -0,0 +1,553 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
from .generic import *
from .utils import isString, str_
from .pdf import PdfFileReader, PdfFileWriter
from .pagerange import PageRange
from sys import version_info
if version_info < ( 3, 0 ):
    from cStringIO import StringIO
    StreamIO = StringIO
else:
    from io import BytesIO
    from io import FileIO as file
    StreamIO = BytesIO


class _MergedPage(object):
    """
    _MergedPage is used internally by PdfFileMerger to collect necessary
    information on each page that is being merged.

    :param pagedata: the page object taken from the source reader.
    :param src: the PdfFileReader the page was read from.
    :param id: the identifier PdfFileMerger assigned to this page; bookmarks
        and named destinations refer to the page through this number until
        the output is written.
    """
    def __init__(self, pagedata, src, id):
        self.src = src
        self.pagedata = pagedata
        # Filled in by PdfFileMerger.write() with the reference to the page
        # inside the output document.
        self.out_pagedata = None
        self.id = id


class PdfFileMerger(object):
    """
    Initializes a PdfFileMerger object. PdfFileMerger merges multiple PDFs
    into a single PDF. It can concatenate, slice, insert, or any combination
    of the above.

    See the functions :meth:`merge()<merge>` (or :meth:`append()<append>`)
    and :meth:`write()<write>` for usage information.

    :param bool strict: Determines whether user should be warned of all
            problems and also causes some correctable problems to be fatal.
            Defaults to ``True``.
    """

    # Coordinate keys carried by each destination fit type, listed in the
    # order they are appended to the /D array built in _write_bookmarks().
    # '/Fit' and '/FitB' carry no coordinates and are therefore absent.
    _DEST_COORD_KEYS = (
        (('/FitH', '/FitBH'), ('/Top',)),
        (('/FitV', '/FitBV'), ('/Left',)),
        (('/XYZ',), ('/Left', '/Top', '/Zoom')),
        (('/FitR',), ('/Left', '/Bottom', '/Right', '/Top')),
    )

    def __init__(self, strict=True):
        self.inputs = []
        self.pages = []
        self.output = PdfFileWriter()
        self.bookmarks = []
        self.named_dests = []
        self.id_count = 0
        self.strict = strict

    def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True):
        """
        Merges the pages from the given file into the output file at the
        specified page number.

        :param int position: The *page number* to insert this file. File will
            be inserted after the given number.

        :param fileobj: A File Object or an object that supports the standard read
            and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.

        :param str bookmark: Optionally, you may specify a bookmark to be applied at
            the beginning of the included file by supplying the text of the bookmark.

        :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple
            to merge only the specified range of pages from the source
            document into the output document.

        :param bool import_bookmarks: You may prevent the source document's bookmarks
            from being imported by specifying this as ``False``.

        :raises TypeError: if ``pages`` is neither ``None``, a PageRange nor
            a tuple.
        """

        # This flag is stored in self.inputs and means that the stream
        # handed to the reader was created in this method, so close() is
        # responsible for closing it.
        my_file = False

        # If the fileobj parameter is a string, assume it is a path
        # and create a file object at that location. If it is a file,
        # copy the file's contents into a BytesIO (or StreamIO) stream object; if
        # it is a PdfFileReader, copy that reader's stream into a
        # BytesIO (or StreamIO) stream.
        # If fileobj is none of the above types, it is not modified
        decryption_key = None
        if isString(fileobj):
            fileobj = file(fileobj, 'rb')
            my_file = True
        elif hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
            fileobj.seek(0)
            filecontent = fileobj.read()
            fileobj = StreamIO(filecontent)
            my_file = True
        elif isinstance(fileobj, PdfFileReader):
            # Read the key from the reader itself, *before* fileobj is
            # rebound to the copied stream; the stream copy never has a
            # _decryption_key attribute.
            if hasattr(fileobj, '_decryption_key'):
                decryption_key = fileobj._decryption_key
            orig_tell = fileobj.stream.tell()
            fileobj.stream.seek(0)
            filecontent = StreamIO(fileobj.stream.read())
            fileobj.stream.seek(orig_tell)  # reset the stream to its original location
            fileobj = filecontent
            my_file = True

        # Create a new PdfFileReader instance using the stream
        # (either file or BytesIO or StringIO) created above
        try:
            pdfr = PdfFileReader(fileobj, strict=self.strict)
        except Exception:
            # The stream is not yet tracked in self.inputs, so close() would
            # never see it; release it here before propagating the error.
            if my_file:
                fileobj.close()
            raise
        if decryption_key is not None:
            pdfr._decryption_key = decryption_key

        # Find the range of pages to merge.
        if pages is None:
            pages = (0, pdfr.getNumPages())
        elif isinstance(pages, PageRange):
            pages = pages.indices(pdfr.getNumPages())
        elif not isinstance(pages, tuple):
            raise TypeError('"pages" must be a tuple of (start, stop[, step])')

        srcpages = []
        if bookmark:
            bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit'))

        outline = []
        if import_bookmarks:
            outline = pdfr.getOutlines()
            outline = self._trim_outline(pdfr, outline, pages)

        if bookmark:
            # The imported outline becomes the child list of the new bookmark.
            self.bookmarks += [bookmark, outline]
        else:
            self.bookmarks += outline

        dests = pdfr.namedDestinations
        dests = self._trim_dests(pdfr, dests, pages)
        self.named_dests += dests

        # Gather all the pages that are going to be merged
        for i in range(*pages):
            pg = pdfr.getPage(i)

            page_id = self.id_count
            self.id_count += 1

            srcpages.append(_MergedPage(pg, pdfr, page_id))

        self._associate_dests_to_pages(srcpages)
        self._associate_bookmarks_to_pages(srcpages)

        # Slice to insert the pages at the specified position
        self.pages[position:position] = srcpages

        # Keep track of our input files so we can close them later
        self.inputs.append((fileobj, pdfr, my_file))

    def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True):
        """
        Identical to the :meth:`merge()<merge>` method, but assumes you want to concatenate
        all pages onto the end of the file instead of specifying a position.

        :param fileobj: A File Object or an object that supports the standard read
            and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.

        :param str bookmark: Optionally, you may specify a bookmark to be applied at
            the beginning of the included file by supplying the text of the bookmark.

        :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple
            to merge only the specified range of pages from the source
            document into the output document.

        :param bool import_bookmarks: You may prevent the source document's bookmarks
            from being imported by specifying this as ``False``.
        """

        self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks)

    def write(self, fileobj):
        """
        Writes all data that has been merged to the given output file.

        :param fileobj: Output file. Can be a filename or any kind of
            file-like object.
        """
        my_file = False
        if isString(fileobj):
            fileobj = file(fileobj, 'wb')
            my_file = True

        try:
            # Add pages to the PdfFileWriter and remember, for each page, the
            # reference to the page object just appended to the output's
            # /Kids array.
            for page in self.pages:
                self.output.addPage(page.pagedata)
                page.out_pagedata = self.output.getReference(self.output._pages.getObject()["/Kids"][-1].getObject())

            # Once all pages are added, create bookmarks to point at those pages
            self._write_dests()
            self._write_bookmarks()

            # Write the output to the file
            self.output.write(fileobj)
        finally:
            # Only close a file this method opened, and do so even when
            # writing fails so the descriptor is not leaked.
            if my_file:
                fileobj.close()

    def close(self):
        """
        Shuts all file descriptors (input and output) and clears all memory
        usage.
        """
        self.pages = []
        for fo, pdfr, mine in self.inputs:
            # Streams supplied by the caller are left open; only the ones
            # merge() created are closed here.
            if mine:
                fo.close()

        self.inputs = []
        self.output = None

    def addMetadata(self, infos):
        """
        Add custom metadata to the output.

        :param dict infos: a Python dictionary where each key is a field
            and each value is your new metadata.
            Example: ``{u'/Title': u'My title'}``
        """
        self.output.addMetadata(infos)

    def setPageLayout(self, layout):
        """
        Set the page layout

        :param str layout: The page layout to be used

        Valid layouts are:
             /NoLayout        Layout explicitly not specified
             /SinglePage      Show one page at a time
             /OneColumn       Show one column at a time
             /TwoColumnLeft   Show pages in two columns, odd-numbered pages on the left
             /TwoColumnRight  Show pages in two columns, odd-numbered pages on the right
             /TwoPageLeft     Show two pages at a time, odd-numbered pages on the left
             /TwoPageRight    Show two pages at a time, odd-numbered pages on the right
        """
        self.output.setPageLayout(layout)

    def setPageMode(self, mode):
        """
        Set the page mode.

        :param str mode: The page mode to use.

        Valid modes are:
            /UseNone         Do not show outlines or thumbnails panels
            /UseOutlines     Show outlines (aka bookmarks) panel
            /UseThumbs       Show page thumbnails panel
            /FullScreen      Fullscreen view
            /UseOC           Show Optional Content Group (OCG) panel
            /UseAttachments  Show attachments panel
        """
        self.output.setPageMode(mode)

    def _trim_dests(self, pdf, dests, pages):
        """
        Removes any named destinations that are not a part of the specified
        page set.

        :param pdf: the reader the destinations were taken from.
        :param dests: mapping of destination name to destination object.
        :param pages: ``(start, stop[, step])`` tuple passed to ``range()``.
        :return: list of the destinations whose page lies in ``pages``; each
            kept destination has its ``/Page`` entry replaced by the resolved
            page object.
        """
        new_dests = []
        for k, o in list(dests.items()):
            for j in range(*pages):
                if pdf.getPage(j).getObject() == o['/Page'].getObject():
                    o[NameObject('/Page')] = o['/Page'].getObject()
                    assert str_(k) == str_(o['/Title'])
                    new_dests.append(o)
                    break
        return new_dests

    def _trim_outline(self, pdf, outline, pages):
        """
        Removes any outline/bookmark entries that are not a part of the
        specified page set.

        Nested lists in ``outline`` are trimmed recursively. When a nested
        list keeps at least one entry but the entry preceding it (its header)
        was itself dropped, that header is re-added so the kept children
        still have a parent.

        :return: the trimmed outline as a new list.
        """
        new_outline = []
        prev_header_added = True
        for i, o in enumerate(outline):
            if isinstance(o, list):
                sub = self._trim_outline(pdf, o, pages)
                if sub:
                    if not prev_header_added:
                        new_outline.append(outline[i-1])
                    new_outline.append(sub)
            else:
                prev_header_added = False
                for j in range(*pages):
                    if pdf.getPage(j).getObject() == o['/Page'].getObject():
                        o[NameObject('/Page')] = o['/Page'].getObject()
                        new_outline.append(o)
                        prev_header_added = True
                        break
        return new_outline

    def _write_dests(self):
        """
        Adds every named destination whose ``/Page`` matches the id of a
        merged page to the output, after pointing its ``/Page`` entry at that
        page's reference in the output document. Destinations that match no
        merged page are skipped.
        """
        for v in self.named_dests:
            matched = False
            if '/Page' in v:
                for p in self.pages:
                    if p.id == v['/Page']:
                        v[NameObject('/Page')] = p.out_pagedata
                        matched = True
                        break
            if matched:
                self.output.addNamedDestinationObject(v)

    def _pop_dest_coords(self, b):
        """
        Collects the coordinate values for bookmark ``b`` according to its
        ``/Type`` and removes the corresponding keys from ``b``.

        A coordinate that is missing or is a NullObject is emitted as
        ``FloatObject(0)``. Keys are only deleted when present, so a
        bookmark lacking a coordinate no longer raises ``KeyError``.

        :return: list of FloatObject values in /D array order (empty for fit
            types that carry no coordinates).
        """
        coords = []
        for fit_types, keys in self._DEST_COORD_KEYS:
            if b['/Type'] in fit_types:
                for key in keys:
                    if key in b and not isinstance(b[key], NullObject):
                        coords.append(FloatObject(b[key]))
                    else:
                        coords.append(FloatObject(0))
                # Delete only after every value has been read.
                for key in keys:
                    if key in b:
                        del b[key]
                break
        return coords

    def _write_bookmarks(self, bookmarks=None, parent=None):
        """
        Writes the bookmark tree to the output document.

        Each bookmark whose ``/Page`` matches the id of a merged page gets a
        ``/GoTo`` action (``/A``) built from its fit type and coordinates and
        is added under ``parent``. A nested list is written recursively with
        the most recently added bookmark as its parent.

        :param bookmarks: list to write; defaults to ``self.bookmarks``.
        :param parent: value passed through to ``addBookmarkDict`` as the
            parent of the bookmarks in this list.
        """
        if bookmarks is None:
            bookmarks = self.bookmarks

        last_added = None
        for b in bookmarks:
            if isinstance(b, list):
                self._write_bookmarks(b, last_added)
                continue

            matched = False
            if '/Page' in b:
                for p in self.pages:
                    if p.id == b['/Page']:
                        args = [NumberObject(p.id), NameObject(b['/Type'])]
                        args.extend(self._pop_dest_coords(b))

                        b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)})

                        matched = True
                        break
            if matched:
                # The destination now lives in /A, so the helper keys go.
                del b['/Page'], b['/Type']
                last_added = self.output.addBookmarkDict(b, parent)

    def _associate_dests_to_pages(self, pages):
        """
        Replaces the ``/Page`` entry of every named destination with the id
        of the matching page in ``pages``.

        Destinations whose ``/Page`` is already a NumberObject are left
        untouched.

        :raises ValueError: if a destination matches none of ``pages``.
        """
        for nd in self.named_dests:
            pageno = None
            np = nd['/Page']

            if isinstance(np, NumberObject):
                continue

            # No break: when several pages compare equal the last one wins.
            for p in pages:
                if np.getObject() == p.pagedata.getObject():
                    pageno = p.id

            if pageno is not None:
                nd[NameObject('/Page')] = NumberObject(pageno)
            else:
                raise ValueError("Unresolved named destination '%s'" % (nd['/Title'],))

    def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
        """
        Replaces the ``/Page`` entry of every bookmark (recursing into nested
        lists) with the id of the matching page in ``pages``.

        Bookmarks whose ``/Page`` is already a NumberObject are left
        untouched.

        :param bookmarks: list to process; defaults to ``self.bookmarks``.
        :raises ValueError: if a bookmark matches none of ``pages``.
        """
        if bookmarks is None:
            bookmarks = self.bookmarks

        for b in bookmarks:
            if isinstance(b, list):
                self._associate_bookmarks_to_pages(pages, b)
                continue

            pageno = None
            bp = b['/Page']

            if isinstance(bp, NumberObject):
                continue

            # No break: when several pages compare equal the last one wins.
            for p in pages:
                if bp.getObject() == p.pagedata.getObject():
                    pageno = p.id

            if pageno is not None:
                b[NameObject('/Page')] = NumberObject(pageno)
            else:
                raise ValueError("Unresolved bookmark '%s'" % (b['/Title'],))

    def findBookmark(self, bookmark, root=None):
        """
        Searches the bookmark tree for ``bookmark``.

        :param bookmark: a bookmark object, or the title to look for.
        :param root: list to search; defaults to ``self.bookmarks``.
        :return: list of indices leading to the first match (one index per
            nesting level), or ``None`` if nothing matches.
        """
        if root is None:
            root = self.bookmarks

        for i, b in enumerate(root):
            if isinstance(b, list):
                res = self.findBookmark(bookmark, b)
                if res:
                    return [i] + res
            elif b == bookmark or b['/Title'] == bookmark:
                return [i]

        return None

    def addBookmark(self, title, pagenum, parent=None):
        """
        Add a bookmark to this PDF file.

        :param str title: Title to use for this bookmark.
        :param int pagenum: Page number this bookmark will point to.
        :param parent: A reference to a parent bookmark to create nested
            bookmarks. May also be a list of indices as returned by
            :meth:`findBookmark()<findBookmark>`.
        :return: the newly created bookmark.
        :raises ValueError: if ``parent`` is given but cannot be found.
        """
        dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))

        if parent is None:
            self.bookmarks.append(dest)
            return dest

        if isinstance(parent, list):
            iloc = parent
        else:
            iloc = self.findBookmark(parent)
            if iloc is None:
                # Previously this fell through to a TypeError on None[:-1].
                raise ValueError("Parent bookmark %r not found" % (parent,))

        # Walk down to the list that contains the parent bookmark.
        bmparent = self.bookmarks
        for i in iloc[:-1]:
            bmparent = bmparent[i]

        # Children live in the list that directly follows their parent.
        npos = iloc[-1]+1
        if npos < len(bmparent) and isinstance(bmparent[npos], list):
            bmparent[npos].append(dest)
        else:
            bmparent.insert(npos, [dest])
        return dest

    def addNamedDestination(self, title, pagenum):
        """
        Add a destination to the output.

        :param str title: Title to use
        :param int pagenum: Page number this destination points at.
        """

        dest = Destination(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
        self.named_dests.append(dest)


class OutlinesObject(list):
    """
    A list that is kept alongside an outline tree object.

    :param pdf: object providing ``getObject``, ``_pages`` and ``_addObject``
        (presumably a PdfFileWriter; confirm against callers).
    :param tree: outline tree providing ``addChild``, ``removeChild`` and
        ``children``.
    :param parent: stored on the instance; not used by the methods here.
    """
    def __init__(self, pdf, tree, parent=None):
        list.__init__(self)
        self.tree = tree
        self.pdf = pdf
        self.parent = parent

    def remove(self, index):
        """
        Removes the entry at ``index`` from this list and from the tree.

        Note that, unlike ``list.remove``, the argument is an index.
        """
        obj = self[index]
        del self[index]
        self.tree.removeChild(obj)

    def add(self, title, pagenum):
        """
        Creates a bookmark titled ``title`` with a ``/GoTo`` action pointing
        at page ``pagenum`` (``/FitH`` with the value 826) and adds it to the
        tree. The list itself is not modified.
        """
        pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]
        action = DictionaryObject()
        action.update({
            NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
            NameObject('/S') : NameObject('/GoTo')
        })
        actionRef = self.pdf._addObject(action)
        bookmark = TreeObject()

        bookmark.update({
            NameObject('/A'): actionRef,
            NameObject('/Title'): createStringObject(title),
        })

        self.pdf._addObject(bookmark)

        self.tree.addChild(bookmark)

    def removeAll(self):
        """
        Removes every child from the tree, popping one entry off this list
        for each child removed.
        """
        # Snapshot the children first because the tree is mutated in the loop.
        for child in list(self.tree.children()):
            self.tree.removeChild(child)
            self.pop()