File: //usr/lib/python3/dist-packages/ocrmypdf/pdfa.py
# © 2015 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
"""
Generate a PDFMARK file for Ghostscript >= 9.14, for PDF/A conversion
pdfmark is an extension to the Postscript language that describes some PDF
features like bookmarks and annotations. It was originally specified for Adobe
Distiller, for Postscript to PDF conversion:
https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdfmark_reference.pdf
Ghostscript uses pdfmark for PDF to PDF/A conversion as well. To use Ghostscript
to create a PDF/A, we need to create a pdfmark file with the necessary metadata.
This takes care of the many version-specific bugs and peculiarities in
Ghostscript's handling of pdfmark.
"""
import base64
from pathlib import Path
from string import Template
import pikepdf
import pkg_resources
# Location of the sRGB ICC profile, relative to the root of the 'ocrmypdf'
# package.
ICC_PROFILE_RELPATH = 'data/sRGB.icc'
# Filesystem path to that profile, resolved through pkg_resources at import
# time.
SRGB_ICC_PROFILE = pkg_resources.resource_filename('ocrmypdf', ICC_PROFILE_RELPATH)
# This is a template written in PostScript which is needed to create PDF/A
# files, from the Ghostscript documentation. Lines beginning with % are
# PostScript comments. Python substitution variables have a '$' prefix and are
# filled in with string.Template:
#   $icc_profile     - the ICC profile data, bound to /ICCProfile
#   $icc_identifier  - the text placed in /OutputConditionIdentifier
pdfa_def_template = u"""%!
% Define an ICC profile :
/ICCProfile $icc_profile
def
[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark
[{icc_PDFA} << /N 3 >> /PUT pdfmark
[{icc_PDFA} ICCProfile /PUT pdfmark
% Define the output intent dictionary :
[/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark
[{OutputIntent_PDFA} <<
/Type /OutputIntent % Must be so (the standard requires).
/S /GTS_PDFA1 % Must be so (the standard requires).
/DestOutputProfile {icc_PDFA} % Must be so (see above).
/OutputConditionIdentifier ($icc_identifier)
>> /PUT pdfmark
[{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark
"""
def generate_pdfa_ps(target_filename, icc='sRGB'):
    """Write the PostScript pdfmark file used for Ghostscript PDF/A conversion.

    The file is a short PostScript program rendered from
    ``pdfa_def_template``. It embeds an ICC profile and declares a PDF/A
    output intent that points at that profile, which tells Ghostscript that
    we want a PDF/A and which colorspace to convert to when it decides an
    object must be converted.

    See the Adobe pdfmark Reference for details:
    https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdfmark_reference.pdf

    :param target_filename: filename to save
    :param icc: ICC identifier; only 'sRGB' is accepted
    :return: ``target_filename``, as passed in
    :raises NotImplementedError: if ``icc`` is anything other than 'sRGB'
    """
    # Guard first: sRGB is the only profile this module knows how to locate.
    if icc != 'sRGB':
        raise NotImplementedError("Only supporting sRGB")
    profile_path = SRGB_ICC_PROFILE

    # Embed the binary profile as Adobe-framed ASCII85 text so the rendered
    # PostScript consists of ASCII characters only.
    raw_profile = Path(profile_path).read_bytes()
    encoded_profile = base64.a85encode(raw_profile, adobe=True).decode('ascii')

    postscript = Template(pdfa_def_template).substitute(
        icc_profile=encoded_profile,
        icc_identifier=icc,
    )

    # Everything should be pure ASCII by now; writing with the ASCII codec
    # enforces that instead of assuming it.
    Path(target_filename).write_text(postscript, encoding='ascii')
    return target_filename
def file_claims_pdfa(filename):
    """Determine whether the file claims to be PDF/A compliant.

    This only checks if the XMP metadata contains a PDF/A marker. It does not
    do full PDF/A validation.

    :param filename: path of the PDF file to inspect
    :return: a dict that always has the keys ``'pass'`` (bool), ``'output'``
        (``'pdfa'`` when the claim is recognized, otherwise ``'pdf'``) and
        ``'conformance'`` (human-readable description of what was found)
    """
    valid_part_conforms = {'1A', '1B', '2A', '2B', '2U', '3A', '3B', '3U'}

    # Only the status value is needed, so read it and let the file close.
    with pikepdf.open(filename) as pdf:
        pdfa_status = pdf.open_metadata().pdfa_status

    if not pdfa_status:
        return {
            'pass': False,
            'output': 'pdf',
            'conformance': 'No PDF/A metadata in XMP',
        }

    # A marker is present. If its part/conformance level is not one we
    # recognize, still report every key (as a failure) so callers that index
    # result['pass'] or result['output'] do not hit a KeyError.
    recognized = pdfa_status in valid_part_conforms
    return {
        'pass': recognized,
        'output': 'pdfa' if recognized else 'pdf',
        'conformance': f'PDF/A-{pdfa_status}',
    }