File: //lib/python3/dist-packages/pikepdf/_methods.py
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
"""
In several cases the implementation of some higher levels features might as
well be in Python. Fortunately we can attach Python methods to C++ class
bindings after the fact.
We can also move the implementation to C++ if desired.
"""
import inspect
from collections import namedtuple
from collections.abc import KeysView
from io import BytesIO
from subprocess import PIPE, run
from tempfile import NamedTemporaryFile
from . import Array, Dictionary, Name, Object, Page, Pdf, Stream
from ._qpdf import PdfError, StreamParser, Token, _ObjectMapping
from .models import EncryptionInfo, PdfMetadata, Permissions
# pylint: disable=no-member,unsupported-membership-test,unsubscriptable-object
__all__ = []
def augments(cls_cpp):
"""Attach methods of a Python support class to an existing class
This monkeypatches all methods defined in the support class onto an
existing class. Example:
.. code-block:: python
@augments(ClassDefinedInCpp)
class SupportClass:
def foo(self):
pass
The Python method 'foo' will be monkeypatched on ClassDefinedInCpp. SupportClass
has no meaning on its own and should not be used, but gets returned from
this function so IDE code inspection doesn't get too confused.
We don't subclass because it's much more convenient to monkeypatch Python
methods onto the existing Python binding of the C++ class. For one thing,
this allows the implementation to be moved from Python to C++ or vice
versa. It saves having to implement an intermediate Python subclass and then
ensures that the C++ superclass never 'leaks' to pikepdf users. Finally,
wrapper classes and subclasses can become problematic if the call stack
crosses the C++/Python boundary multiple times.
Any existing methods may be used, regardless of whether they defined
elsewhere in the support class or in the target class.
The target class does not have to be C++ or derived from pybind11.
THIS DOES NOT work for static methods or class methods. pybind11 does not seem
to support this sort of runtime modification.
"""
def class_augment(cls, cls_cpp=cls_cpp):
for name, member in inspect.getmembers(cls):
if inspect.isfunction(member):
member.__qualname__ = member.__qualname__.replace(
cls.__name__, cls_cpp.__name__
)
setattr(cls_cpp, name, member)
elif inspect.isdatadescriptor(member):
setattr(cls_cpp, name, member)
def block_init(self):
# Prevent initialization of the support class
raise NotImplementedError(self.__class__.__name__ + '.__init__')
cls.__init__ = block_init
return cls
return class_augment
def _single_page_pdf(page):
"""Construct a single page PDF from the provided page in memory"""
pdf = Pdf.new()
pdf.pages.append(page)
bio = BytesIO()
pdf.save(bio)
bio.seek(0)
return bio.read()
def _mudraw(buffer, fmt):
"""Use mupdf draw to rasterize the PDF in the memory buffer"""
with NamedTemporaryFile(suffix='.pdf') as tmp_in:
tmp_in.write(buffer)
tmp_in.seek(0)
tmp_in.flush()
proc = run(
['mudraw', '-F', fmt, '-o', '-', tmp_in.name], stdout=PIPE, stderr=PIPE
)
if proc.stderr:
raise RuntimeError(proc.stderr.decode())
return proc.stdout
@augments(Object)
class Extend_Object:
def _repr_mimebundle_(self, include, exclude, **kwargs):
"""Present options to IPython for rich display of this object
See https://ipython.readthedocs.io/en/stable/config/integrating.html#rich-display
"""
if (
isinstance(self, Dictionary)
and Name.Type in self
and self.Type == Name.Page
):
return Page(self)._repr_mimebundle_(include, exclude, **kwargs)
return None
def _ipython_key_completions_(self):
if isinstance(self, Dictionary):
return self.keys()
elif isinstance(self, Stream):
return self.stream_dict.keys()
return None
def emplace(self, other):
"""Copy all items from other without making a new object.
Particularly when working with pages, it may be desirable to remove all
of the existing page's contents and emplace (insert) a new page on top
of it, in a way that preserves all links and references to the original
page. (Or similarly, for other Dictionary objects in a PDF.)
When a page is assigned (``pdf.pages[0] = new_page``), only the
application knows if references to the original the original page are
still valid. For example, a PDF optimizer might restructure a page
object into another visually similar one, and references would be valid;
but for a program that reorganizes page contents such as a N-up
compositor, references may not be valid anymore.
This method takes precautions to ensure that child objects in common
with ``self`` and ``other`` are not inadvertently deleted.
Example:
>>> pdf.pages[0].objgen
(16, 0)
>>> pdf.pages[0].emplace(pdf.pages[1])
>>> pdf.pages[0].objgen
(16, 0) # Same object
"""
del_keys = set(self.keys()) - set(other.keys())
for k in other.keys():
self[k] = other[k] # pylint: disable=unsupported-assignment-operation
for k in del_keys:
del self[k] # pylint: disable=unsupported-delete-operation
def write(self, data, *, filter=None, decode_parms=None, type_check=True):
"""
Replace stream object's data with new (possibly compressed) `data`.
`filter` and `decode_parms` specify that compression that is present on
the input `data`.
When writing the PDF in :meth:`pikepdf.Pdf.save`,
pikepdf may change the compression or apply compression to data that was
not compressed, depending on the parameters given to that function. It
will never change lossless to lossy encoding.
PNG and TIFF images, even if compressed, cannot be directly inserted
into a PDF and displayed as images.
Args:
data (bytes): the new data to use for replacement
filter (pikepdf.Name or pikepdf.Array): The filter(s) with which the
data is (already) encoded
decode_parms (pikepdf.Dictionary or pikepdf.Array): Parameters for the
filters with which the object is encode
type_check (bool): Check arguments; use False only if you want to
intentionally create malformed PDFs.
If only one `filter` is specified, it may be a name such as
`Name('/FlateDecode')`. If there are multiple filters, then array
of names should be given.
If there is only one filter, `decode_parms` is a Dictionary of
parameters for that filter. If there are multiple filters, then
`decode_parms` is an Array of Dictionary, where each array index
is corresponds to the filter.
"""
if type_check and filter is not None:
if isinstance(filter, list):
filter = Array(filter)
filter = filter.wrap_in_array()
if isinstance(decode_parms, list):
decode_parms = Array(decode_parms)
elif decode_parms is None:
decode_parms = Array([])
else:
decode_parms = decode_parms.wrap_in_array()
if not all(isinstance(item, Name) for item in filter):
raise TypeError(
"filter must be: pikepdf.Name or pikepdf.Array([pikepdf.Name])"
)
if not all(
(isinstance(item, Dictionary) or item is None) for item in decode_parms
):
raise TypeError(
"decode_parms must be: pikepdf.Dictionary or "
"pikepdf.Array([pikepdf.Dictionary])"
)
if len(decode_parms) != 0:
if len(filter) != len(decode_parms):
raise ValueError(
(
"filter ({}) and decode_parms ({}) must be arrays of "
" same length"
).format(repr(filter), repr(decode_parms))
)
if len(filter) == 1:
filter = filter[0]
if len(decode_parms) == 0:
decode_parms = None
elif len(decode_parms) == 1:
decode_parms = decode_parms[0]
self._write(data, filter=filter, decode_parms=decode_parms)
@augments(Pdf)
class Extend_Pdf:
def _repr_mimebundle_(self, **_kwargs):
"""
Present options to IPython or Jupyter for rich display of this object
See https://ipython.readthedocs.io/en/stable/config/integrating.html#rich-display
"""
bio = BytesIO()
self.save(bio)
bio.seek(0)
data = {'application/pdf': bio.read()}
return data
def open_metadata(
self, set_pikepdf_as_editor=True, update_docinfo=True, strict=False
):
"""
Open the PDF's XMP metadata for editing
Recommend for use in a ``with`` block. Changes are committed to the
PDF when the block exits. (The ``Pdf`` must still be opened.)
Example:
>>> with pdf.open_metadata() as meta:
meta['dc:title'] = 'Set the Dublic Core Title'
meta['dc:description'] = 'Put the Abstract here'
Args:
set_pikepdf_as_editor (bool): Update the metadata to show that this
version of pikepdf is the most recent software to modify the metadata.
Recommended, except for testing.
update_docinfo (bool): Update the deprecated PDF DocumentInfo block
to be consistent with XMP.
strict (bool): If ``False`` (the default), we aggressively attempt
to recover from any parse errors in XMP, and if that fails we
overwrite the XMP with an empty XMP record. If ``True``, raise
errors when either metadata bytes are not valid and well-formed
XMP (and thus, XML). Some trivial cases that are equivalent to
empty or incomplete "XMP skeletons" are never treated as errors,
and always replaced with a proper empty XMP block. Certain
errors may be logged.
Returns:
pikepdf.models.PdfMetadata
"""
return PdfMetadata(
self,
pikepdf_mark=set_pikepdf_as_editor,
sync_docinfo=update_docinfo,
overwrite_invalid_xml=not strict,
)
def make_stream(self, data):
"""
Create a new pikepdf.Stream object that is attached to this PDF.
Args:
data (bytes): Binary data for the stream object
"""
return Stream(self, data)
def add_blank_page(self, *, page_size=(612, 792)):
"""
Add a blank page to this PD. If pages already exist, the page will be added to
the end. Pages may be reordered using ``Pdf.pages``.
The caller may add content to the page by modifying its objects after creating
it.
Args:
page_size (tuple): The size of the page in PDF units (1/72 inch or 0.35mm).
Default size is set to a US Letter 8.5" x 11" page.
"""
for dim in page_size:
if not (3 <= dim <= 14400):
raise ValueError('Page size must be between 3 and 14400 PDF units')
page_dict = Dictionary(
Type=Name.Page,
MediaBox=Array([0, 0, page_size[0], page_size[1]]),
Contents=self.make_stream(b''),
Resources=Dictionary(),
)
page = self.make_indirect(page_dict)
self._add_page(page, first=False)
return page
def close(self):
"""
Close a Pdf object and release resources acquired by pikepdf.
If pikepdf opened the file handle it will close it (e.g. when opened with a file
path). If the caller opened the file for pikepdf, the caller close the file.
pikepdf lazily loads data from PDFs, so some :class:`pikepdf.Object` may
implicitly depend on the :class:`pikepdf.Pdf` being open. This is always the
case for :class:`pikepdf.Stream` but can be true for any object. Do not close
the `Pdf` object if you might still be accessing content from it.
When an ``Object`` is copied from one ``Pdf`` to another, the ``Object`` is copied into
the destination ``Pdf`` immediately, so after accessing all desired information
from the source ``Pdf`` it may be closed.
Caution:
Closing the ``Pdf`` is currently implemented by resetting it to an empty
sentinel. It is currently possible to edit the sentinel as if it were a live
object. This behavior should not be relied on and is subject to change.
"""
# Can use QPDF::closeInputSource() but that requires libqpdf 9.0.0
EMPTY_PDF = (
b"%PDF-1.3\n"
b"1 0 obj\n"
b"<< /Type /Catalog /Pages 2 0 R >>\n"
b"endobj\n"
b"2 0 obj\n"
b"<< /Type /Pages /Kids [] /Count 0 >>\n"
b"endobj\n"
b"xref\n"
b"0 3\n"
b"0000000000 65535 f \n"
b"0000000009 00000 n \n"
b"0000000058 00000 n \n"
b"trailer << /Size 3 /Root 1 0 R >>\n"
b"startxref\n"
b"110\n"
b"%%EOF\n"
)
if self.filename:
description = "closed file: " + self.filename
else:
description = "closed object"
self._process(description, EMPTY_PDF)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
@property
def allow(self):
"""
Report permissions associated with this PDF.
By default these permissions will be replicated when the PDF is
saved. Permissions may also only be changed when a PDF is being saved,
and are only available for encrypted PDFs. If a PDF is not encrypted,
all operations are reported as allowed.
pikepdf has no way of enforcing permissions.
Returns:
pikepdf.models.Permissions
"""
results = {}
for field in Permissions.fields():
results[field] = getattr(self, '_allow_' + field)
return Permissions(**results)
@property
def encryption(self):
"""
Report encryption information for this PDF.
Encryption settings may only be changed when a PDF is saved.
Returns: pikepdf.models.EncryptionInfo
"""
return EncryptionInfo(self._encryption_data)
def check(self):
"""
Check if PDF is well-formed. Similar to ``qpdf --check``.
Returns:
list of strings describing errors of warnings in the PDF
"""
class DiscardingParser(StreamParser):
def __init__(self): # pylint: disable=useless-super-delegation
super().__init__() # required for C++
def handle_object(self, obj):
pass
def handle_eof(self):
pass
problems = []
try:
self._decode_all_streams_and_discard()
except PdfError as e:
problems.append(str(e))
discarding_parser = DiscardingParser()
for basic_page in self.pages:
page = Page(basic_page)
try:
page.parse_contents(discarding_parser)
except PdfError as e:
problems.append(str(e))
for warning in self.get_warnings():
problems.append("WARNING: " + warning)
return problems
def _attach(self, *, basename, filebytes, mime=None, desc=''): # pragma: no cover
"""
Attach a file to this PDF
Args:
basename (str): The basename (filename withouth path) to name the
file. Not necessarily the name of the file on disk. Will be s
hown to the user by the PDF viewer. filebytes (bytes): The file
contents.
mime (str or None): A MIME type for the filebytes. If omitted, we try
to guess based on the standard library's
:func:`mimetypes.guess_type`. If this cannot be determined, the
generic value `application/octet-stream` is used. This value is
used by PDF viewers to decide how to present the information to
the user.
desc (str): A extended description of the file contents. PDF viewers
also display this information to the user. In Acrobat DC this is
hidden in a context menu.
The PDF will also be modified to request the PDF viewer to display the
list of attachments when opened, as opposed to other viewing modes. Some
PDF viewers will not make it obvious to the user that attachments are
present unless this is done. This behavior may be overridden by changing
``pdf.Root.PageMode`` to some other valid value.
"""
if '/Names' not in self.Root:
self.Root.Names = self.make_indirect(Dictionary())
if '/EmbeddedFiles' not in self.Root:
self.Root.Names.EmbeddedFiles = self.make_indirect(Dictionary())
if '/Names' not in self.Root.Names.EmbeddedFiles:
self.Root.Names.EmbeddedFiles.Names = Array()
if '/' in basename or '\\' in basename:
raise ValueError("basename should be a basename (no / or \\)")
if not mime:
from mimetypes import guess_type
mime, _encoding = guess_type(basename)
if not mime:
mime = 'application/octet-stream'
filestream = Stream(self, filebytes)
filestream.Subtype = Name('/' + mime)
filespec = Dictionary(
{
'/Type': Name.Filespec,
'/F': basename,
'/UF': basename,
'/Desc': desc,
'/EF': Dictionary({'/F': filestream}),
}
)
# names = self.Root.Names.EmbeddedFiles.Names.as_list()
# names.append(filename) # Key
# names.append(self.make_indirect(filespec))
self.Root.Names.EmbeddedFiles.Names = Array(
[basename, self.make_indirect(filespec)] # key
)
if '/PageMode' not in self.Root:
self.Root.PageMode = Name.UseAttachments
@augments(_ObjectMapping)
class Extend_ObjectMapping:
def __contains__(self, key):
try:
self[key]
except KeyError:
return False
else:
return True
def get(self, key, default=None):
try:
return self[key]
except KeyError:
return default
def keys(self):
return KeysView(self)
def values(self):
return (v for _k, v in self.items())
def check_is_box(obj):
try:
if obj.is_rectangle:
return True
except AttributeError:
pass
try:
pdfobj = Array(obj)
if pdfobj.is_rectangle:
return True
except Exception:
pass
raise ValueError("object is not a rectangle")
@augments(Page)
class Extend_Page:
@property
def mediabox(self):
return self._get_mediabox(True)
@mediabox.setter
def mediabox(self, value):
check_is_box(value)
self.obj['/MediaBox'] = value
@property
def cropbox(self):
return self._get_cropbox(True)
@cropbox.setter
def cropbox(self, value):
check_is_box(value)
self.obj['/CropBox'] = value
@property
def trimbox(self):
return self._get_trimbox(True)
@trimbox.setter
def trimbox(self, value):
check_is_box(value)
self.obj['/TrimBox'] = value
def __repr__(self):
return repr(self.obj).replace('Dictionary', 'Page')
def _repr_mimebundle_(self, include, exclude, **kwargs):
data = {}
bundle = {'application/pdf', 'image/png'}
if include:
bundle = {k for k in bundle if k in include}
if exclude:
bundle = {k for k in bundle if k not in exclude}
pagedata = _single_page_pdf(self.obj)
if 'application/pdf' in bundle:
data['application/pdf'] = pagedata
if 'image/png' in bundle:
try:
data['image/png'] = _mudraw(pagedata, 'png')
except (FileNotFoundError, RuntimeError):
pass
return data
@augments(Token)
class Extend_Token:
def __repr__(self):
return 'pikepdf.Token({}, {})'.format(self.type_, self.raw_value)