# File: //usr/lib/python3/dist-packages/ocrmypdf/_graft.py
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import os
from contextlib import suppress
from pathlib import Path
import pikepdf
# OcrGrafter.graft_page saves and reopens the working PDF whenever its
# emplacement counter is a multiple of this value.
MAX_REPLACE_PAGES = 100
def _update_page_resources(*, page, font, font_key, procset):
    """Register the text-layer font and a catch-all /ProcSet on ``page``.

    The page's /Resources dictionary and its /Font sub-dictionary are created
    when missing. ``font`` is stored under ``font_key`` only when ``font_key``
    is not None and no entry with that key exists yet, so an existing entry is
    never overwritten. /ProcSet is always replaced by ``procset``.
    """
    if '/Resources' in page:
        page_resources = page['/Resources']
    else:
        page['/Resources'] = pikepdf.Dictionary({})
        page_resources = page['/Resources']

    try:
        font_table = page_resources['/Font']
    except KeyError:
        font_table = pikepdf.Dictionary({})

    needs_entry = font_key is not None and font_key not in font_table
    if needs_entry:
        font_table[font_key] = font
    page_resources['/Font'] = font_table

    # /ProcSet is obsolete and has no real effect, but listing every
    # procedure set is still recommended for the benefit of old viewers.
    page_resources['/ProcSet'] = procset
def strip_invisible_text(pdf, page):
    """Rewrite the content stream of ``page`` without invisible text objects.

    A text object is the run of instructions from ``BT`` through ``ET``. The
    render mode is taken to be 0 at every ``BT`` and is updated by each ``Tr``
    seen inside the text object; if it equals 3 when ``ET`` is reached, the
    whole text object is dropped. All other instructions are kept in their
    original order, serialized, and installed as the page's new ``Contents``
    stream (created in ``pdf``).
    """
    op_begin_text = pikepdf.Operator('BT')
    op_end_text = pikepdf.Operator('ET')
    op_render_mode = pikepdf.Operator('Tr')
    op_inline_image = pikepdf.Operator('INLINE IMAGE')

    kept = []  # instructions that will be written back
    pending = []  # instructions of the text object currently being read
    inside_text = False
    mode = 0

    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        if inside_text:
            if operator == op_render_mode:
                mode = operands[0]
            pending.append((operands, operator))
            if operator == op_end_text:
                # Text object complete: keep it unless it was invisible
                inside_text = False
                if mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == op_begin_text:
            inside_text = True
            mode = 0
            pending.append((operands, operator))
        else:
            kept.append((operands, operator))

    def _as_bytes(operand):
        # Operands without an unparse() method are written via their str()
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode('ascii')

    serialized = []
    for operands, operator in kept:
        if operator == op_inline_image:
            # An inline image carries its own complete serialization
            serialized.append(operands[0].unparse())
        else:
            serialized.append(
                b' '.join(map(_as_bytes, operands)) + b' ' + operator.unparse()
            )

    page.Contents = pikepdf.Stream(pdf, b'\n'.join(serialized))
def _graft_text_layer(
    *, pdf_base, page_num, text, font, font_key, procset, rotation, strip_old_text, log
):
    """Insert the text layer from page 0 of the file ``text`` into ``pdf_base``.

    Args:
        pdf_base: open PDF that receives the text layer.
        page_num: 1-based number of the target page in ``pdf_base``.
        text: path of the single-page PDF holding the text layer. Nothing is
            done when this file is empty (0 bytes).
        font: font object registered in the target page's resources.
        font_key: resource name under which ``font`` is registered.
        procset: /ProcSet object assigned to the target page's resources.
        rotation: clockwise angle, in degrees, to apply to the text layer so
            it matches the orientation of the page content.
        strip_old_text: when true, existing invisible text is removed from
            the target page before the new layer is added.
        log: logger used for debug output.

    Returns:
        None. ``pdf_base`` is modified in place.
    """
    log.debug("Grafting")
    if Path(text).stat().st_size == 0:
        return

    # Pull everything we need out of the text PDF as plain Python values so
    # the file is closed again right away, even if reading it raises.
    with pikepdf.open(text) as pdf_text:
        text_page = pdf_text.pages[0]
        pdf_text_contents = text_page.Contents.read_bytes()
        text_box = [float(text_page.MediaBox[v]) for v in range(4)]
    wt, ht = text_box[2] - text_box[0], text_box[3] - text_box[1]

    base_page = pdf_base.pages.p(page_num)

    # The text page always will be oriented up by this stage but the original
    # content may have a rotation applied. Wrap the text stream with a rotation
    # so it will be oriented the same way as the rest of the page content.
    # (Previous versions of OCRmyPDF rotated the content layer to match the
    # text.)
    mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
    wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
    untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
    corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])

    # -rotation because the input is a clockwise angle and this formula
    # uses CCW
    rotation = -rotation % 360
    rotate = pikepdf.PdfMatrix().rotated(rotation)

    # Because of rounding of DPI, we might get a text layer that is not
    # identically sized to the target page. Scale to adjust. Normally this
    # is within 0.998.
    if rotation in (90, 270):
        wt, ht = ht, wt
    scale_x = wp / wt
    scale_y = hp / ht
    scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

    # Translate the text so it is centered at (0, 0), rotate it there, adjust
    # for a size difference between initial and text PDF, then untranslate,
    # and finally move the lower left corner to match the mediabox
    ctm = translate @ rotate @ scale @ untranslate @ corner

    pdf_text_contents = b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'
    new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)

    if strip_old_text:
        strip_invisible_text(pdf_base, base_page)

    base_page.page_contents_add(new_text_layer, prepend=True)

    _update_page_resources(
        page=base_page, font=font, font_key=font_key, procset=procset
    )
def _find_font(text, pdf_base):
    """Copy the text-layer font from the PDF file ``text`` into ``pdf_base``.

    The /Font resources of page 0 are searched for the keys '/f-0-0' and
    '/F1', in that order; the first one present is used.

    Returns a ``(font, font_key)`` pair. Both are None when the file does not
    exist, cannot be opened as a PDF, when page 0 or its resources cannot be
    read, or when neither key is present.
    """
    candidate_keys = ('/f-0-0', '/F1')
    try:
        with pikepdf.open(text) as pdf_text:
            try:
                available = pdf_text.pages[0].Resources.get('/Font', {})
            except (AttributeError, IndexError, KeyError):
                return None, None

            found_key = None
            for key in candidate_keys:
                source_font = available.get(key, None)
                if source_font is None:
                    continue
                found_key = key
                break

            copied = pdf_base.copy_foreign(source_font) if source_font else None
            return copied, found_key
    except (FileNotFoundError, pikepdf.PdfError):
        # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
        return None, None
class OcrGrafter:
    """Graft OCR results, one page at a time, onto a working copy of a PDF.

    Call graft_page() for each page result and finalize() to write the
    output file. To keep memory usage in check on very large files, the
    working PDF is periodically saved and reopened (see save_and_reload()).
    """

    def __init__(self, context):
        """Open the PDF at ``context.origin`` and set up grafting state.

        ``context`` must provide ``log``, ``origin``, ``pdfinfo``,
        ``get_path()`` and ``options``.
        """
        self.context = context
        self.log = context.log
        self.path_base = Path(context.origin).resolve()
        self.pdf_base = pikepdf.open(self.path_base)
        # Text-layer font; looked up lazily from the first text PDF seen
        self.font, self.font_key = None, None
        self.pdfinfo = context.pdfinfo
        self.output_file = context.get_path('graft_layers.pdf')
        self.procset = self.pdf_base.make_indirect(
            pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
        )
        # Emplacement counter (starts at 1); drives the periodic save
        self.emplacements = 1
        # Number of interim "_working" files written so far
        self.interim_count = 0

    def graft_page(self, page_result):
        """Apply one page's OCR result to the working PDF.

        ``page_result`` unpacks to ``(pageno, image, text, sidecar,
        autorotate_correction)``: ``pageno`` is the 0-based page index,
        ``image`` is the path of a single-page PDF that replaces the page
        content (or a false value to keep the page), ``text`` is the path of
        the text-layer PDF (or a false value for none), the sidecar entry is
        not used here, and ``autorotate_correction`` is an angle in degrees.
        """
        pageno, image, text, _sidecar, autorotate_correction = page_result
        if text and not self.font:
            self.font, self.font_key = _find_font(text, self.pdf_base)

        emplaced_page = False
        content_rotation = self.pdfinfo[pageno].rotation
        path_image = Path(image).resolve() if image else None
        if path_image is not None and path_image != self.path_base:
            # We are updating the old page with a rasterized PDF of the new
            # page (without changing objgen, to preserve references)
            self.log.debug("Emplacement update")
            with pikepdf.open(image) as pdf_image:
                self.emplacements += 1
                foreign_image_page = pdf_image.pages[0]
                self.pdf_base.pages.append(foreign_image_page)
                local_image_page = self.pdf_base.pages[-1]
                self.pdf_base.pages[pageno].emplace(local_image_page)
                del self.pdf_base.pages[-1]
            emplaced_page = True

        if emplaced_page:
            content_rotation = autorotate_correction
        text_rotation = autorotate_correction
        text_misaligned = (text_rotation - content_rotation) % 360
        self.log.debug(
            f"Rotations for page {pageno}: [text, auto, misalign, content] = "
            f"{text_rotation}, {autorotate_correction}, "
            f"{text_misaligned}, {content_rotation}"
        )

        if text and self.font:
            # Graft the text layer onto this page, whether new or old
            strip_old = self.context.options.redo_ocr
            _graft_text_layer(
                pdf_base=self.pdf_base,
                page_num=pageno + 1,
                text=text,
                font=self.font,
                font_key=self.font_key,
                rotation=text_misaligned,
                procset=self.procset,
                strip_old_text=strip_old,
                log=self.log,
            )

        # Correct the rotation if applicable
        self.pdf_base.pages[pageno].Rotate = (
            content_rotation - autorotate_correction
        ) % 360

        # Only consider a save right after the counter actually advanced.
        # Otherwise, once the counter rests on a multiple of
        # MAX_REPLACE_PAGES, every following page without an emplacement
        # would save and reload the whole file again.
        if emplaced_page and self.emplacements % MAX_REPLACE_PAGES == 0:
            self.save_and_reload()

    def save_and_reload(self):
        """Save the working PDF to a new interim file and reopen it.

        Resets ``font``/``font_key`` so they are looked up again for the
        reopened PDF, and advances ``interim_count``.
        """
        # Periodically save and reload the Pdf object. This will keep a
        # lid on our memory usage for very large files. Attach the font to
        # page 1 even if page 1 doesn't use it, so we have a way to get it
        # back.
        page0 = self.pdf_base.pages[0]
        _update_page_resources(
            page=page0, font=self.font, font_key=self.font_key, procset=self.procset
        )

        # We cannot read and write the same file, that will corrupt it,
        # but we don't want to keep more copies than we need to. Delete
        # intermediates.
        # {interim_count} is the opened file we were updating
        # {interim_count - 1} can be deleted
        # {interim_count + 1} is the new file we will produce and open
        old_file = self.output_file + f'_working{self.interim_count - 1}.pdf'
        if not self.context.options.keep_temporary_files:
            with suppress(FileNotFoundError):
                os.unlink(old_file)

        next_file = self.output_file + f'_working{self.interim_count + 1}.pdf'
        self.pdf_base.save(next_file)
        self.pdf_base.close()

        self.pdf_base = pikepdf.open(next_file)
        self.procset = self.pdf_base.pages[0].Resources.ProcSet
        self.font, self.font_key = None, None  # Ensure we reacquire this information
        self.interim_count += 1

    def finalize(self):
        """Write the working PDF to ``output_file``, close it, return the path."""
        try:
            self.pdf_base.save(self.output_file)
        finally:
            # Release the open PDF even when saving fails
            self.pdf_base.close()
        return self.output_file