HEX

File: //usr/lib/python3/dist-packages/ocrmypdf/pdfinfo/ghosttext.py
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

import logging
import re
import xml.etree.ElementTree as ET

from ..exec import ghostscript

gslog = logging.getLogger()

# Forgive me for I have sinned
# I am using regular expressions to parse XML. However the XML in this case,
# generated by Ghostscript, is self-consistent enough to be parseable.
regex_remove_char_tags = re.compile(
    br"""
    <char\b
    (?:   [^>]   # anything single character but >
        | \">\"  # special case: trap ">"
    )*
    />           # terminate with '/>'
""",
    re.VERBOSE,
)


def page_get_textblocks(infile, pageno, xmltext, height):
    """Get text boxes out of Ghostscript txtwrite xml"""

    root = xmltext
    if not hasattr(xmltext, 'findall'):
        return []

    def blocks():
        for span in root.findall('.//span'):
            bbox_str = span.attrib['bbox']
            font_size = span.attrib['size']
            pts = [int(pt) for pt in bbox_str.split()]
            pts[1] = pts[1] - int(float(font_size) + 0.5)
            bbox_topdown = tuple(pts)
            bb = bbox_topdown
            bbox_bottomup = (bb[0], height - bb[3], bb[2], height - bb[1])
            yield bbox_bottomup

    def joined_blocks():
        prev = None
        for bbox in blocks():
            if prev is None:
                prev = bbox
            if bbox[1] == prev[1] and bbox[3] == prev[3]:
                gap = prev[2] - bbox[0]
                height = abs(bbox[3] - bbox[1])
                if gap < height:
                    # Join boxes
                    prev = (prev[0], prev[1], bbox[2], bbox[3])
                    continue
            # yield previously joined bboxes and start anew
            yield prev
            prev = bbox
        if prev is not None:
            yield prev

    return [block for block in joined_blocks()]


def extract_text_xml(infile, pdf, pageno=None, log=gslog):
    existing_text = ghostscript.extract_text(infile, pageno=None)
    existing_text = regex_remove_char_tags.sub(b' ', existing_text)

    try:
        root = ET.fromstringlist([b'<document>\n', existing_text, b'</document>\n'])
        page_xml = root.findall('page')
    except ET.ParseError as e:
        log.error(
            "An error occurred while attempting to retrieve existing text in "
            "the input file. Will attempt to continue assuming that there is "
            "no existing text in the file. The error was:"
        )
        log.error(e)
        page_xml = [None] * len(pdf.pages)

    page_count_difference = len(pdf.pages) - len(page_xml)
    if page_count_difference != 0:
        log.error("The number of pages in the input file is inconsistent.")
        log.error(f"Expected {len(pdf.pages)}, txtwrite says {len(page_xml)}")
        if page_count_difference > 0:
            page_xml.extend([None] * page_count_difference)
    return page_xml