# HEX

# File: //usr/lib/python3/dist-packages/ocrmypdf/exec/tesseract.py
# © 2017 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

"""Interface to Tesseract executable"""

import logging
import os
import shutil
from collections import namedtuple
from contextlib import suppress
from os import fspath
from subprocess import PIPE, STDOUT, CalledProcessError, TimeoutExpired

from ..exceptions import (
    MissingDependencyError,
    SubprocessOutputError,
    TesseractConfigError,
)
from ..helpers import page_number, safe_symlink
from . import get_version, run

# Result record for orientation detection: the detected page angle and the
# confidence Tesseract reported for it.
OrientationConfidence = namedtuple('OrientationConfidence', 'angle confidence')

# hOCR document containing a single ocr_page element with no text inside it.
# The {0} and {1} placeholders in the page bbox are filled with str.format();
# _generate_null_hocr passes the image width and height, in that order.
HOCR_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
  <title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
  <meta name='ocr-system' content='tesseract 4.0.0' />
  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
</head>
<body>
  <div class='ocr_page' id='page_1' title='image "_blank.png"; bbox 0 0 {0} {1}; ppageno 0'>
  </div>
 </body>
</html>
"""


class TesseractLoggerAdapter(logging.LoggerAdapter):
    """Logger adapter that tags every message as originating from Tesseract."""

    def process(self, msg, kwargs):
        """Prefix *msg* with ``[tesseract]`` and attach the adapter's extra.

        Returns the ``(message, kwargs)`` pair expected by
        ``logging.LoggerAdapter``; ``kwargs['extra']`` is overwritten with
        this adapter's ``extra``.
        """
        kwargs['extra'] = self.extra
        # Wrap msg in a one-element tuple: with a bare ``% (msg)`` a
        # tuple-valued msg would be unpacked as the format arguments.
        return '[tesseract] %s' % (msg,), kwargs


def version(tesseract_env=None):
    """Return the version of the ``tesseract`` executable.

    The value is whatever ``get_version`` extracts using the pattern below;
    *tesseract_env* is passed through as the environment for the lookup.
    """
    version_pattern = r'tesseract\s(.+)'
    return get_version('tesseract', regex=version_pattern, env=tesseract_env)


def v4(tesseract_env=None):
    """Is this Tesseract v4.0 or newer?

    The major version number is compared numerically. A plain string
    comparison would rank a version such as ``10.0`` below ``4``. If the
    reported version contains no digits at all, the original string
    comparison is used as a fallback.
    """
    import re

    reported = version(tesseract_env)
    major = re.search(r'\d+', reported)
    if major is None:
        return reported >= '4'
    return int(major.group()) >= 4


def has_textonly_pdf(tesseract_env=None, langs=None):
    """Report whether Tesseract supports the ``textonly_pdf`` parameter.

    Available in v4.00.00alpha since January 2017. Rather than guessing
    from the version, ask Tesseract to print its parameter list for the
    ``pdf`` config and look for the parameter name in the output.

    Raises MissingDependencyError if Tesseract exits with an error.
    """
    query = tess_base_args(langs, engine_mode=None)
    query += ['--print-parameters', 'pdf']
    try:
        completed = run(
            query,
            env=tesseract_env,
            stdout=PIPE,
            stderr=STDOUT,
            universal_newlines=True,
            check=True,
        )
    except CalledProcessError as e:
        raise MissingDependencyError(
            "Could not --print-parameters from tesseract"
        ) from e
    return 'textonly_pdf' in completed.stdout


def has_user_words(tesseract_env=None):
    """Does Tesseract have --user-words capability?

    Not available in 4.0, but available in 4.1. Also available in 3.x, but
    we no longer support 3.x.

    The (major, minor) numbers are compared numerically. A plain string
    comparison would rank a version such as ``10.0`` below ``4.1``. A
    missing minor number counts as 0. If the reported version contains no
    digits at all, the original string comparison is used as a fallback.
    """
    import re

    reported = version(tesseract_env)
    found = re.search(r'(\d+)(?:\.(\d+))?', reported)
    if found is None:
        return reported >= '4.1'
    major = int(found.group(1))
    minor = int(found.group(2) or 0)
    return (major, minor) >= (4, 1)


def languages(tesseract_env=None):
    """Return the set of languages listed by ``tesseract --list-langs``.

    The first line of Tesseract's output is treated as a header and
    discarded; every remaining line, stripped of surrounding whitespace,
    becomes one entry in the returned set.

    Raises MissingDependencyError if Tesseract exits with an error, prints
    nothing at all, or prints any line beginning with ``Error``.
    """

    def lang_error(output):
        # Build the message shown to the user, including Tesseract's output.
        return (
            "Tesseract failed to report available languages.\n"
            "Output from Tesseract:\n"
            "-----------\n"
        ) + (output or '')

    args_tess = ['tesseract', '--list-langs']
    try:
        proc = run(
            args_tess,
            universal_newlines=True,
            stdout=PIPE,
            stderr=STDOUT,
            check=True,
            env=tesseract_env,
        )
    except CalledProcessError as e:
        raise MissingDependencyError(lang_error(e.output)) from e

    output = proc.stdout
    lines = output.splitlines()
    # Empty output has no header line to discard; report it as a failure
    # instead of letting tuple unpacking raise a bare ValueError.
    if not lines or any(line.startswith('Error') for line in lines):
        raise MissingDependencyError(lang_error(output))
    return {lang.strip() for lang in lines[1:]}


def tess_base_args(langs, engine_mode):
    """Build the leading part of a Tesseract command line.

    Returns a new list starting with ``'tesseract'``. When *langs* is
    non-empty, ``-l`` and the languages joined with ``+`` follow. When
    *engine_mode* is not None, ``--oem`` and its string form follow.
    """
    lang_opts = ['-l', '+'.join(langs)] if langs else []
    oem_opts = [] if engine_mode is None else ['--oem', str(engine_mode)]
    return ['tesseract', *lang_opts, *oem_opts]


def get_orientation(input_file, engine_mode, timeout: float, log, tesseract_env=None):
    """Ask Tesseract for the orientation of the page image *input_file*.

    Runs Tesseract with the ``osd`` language and ``--psm 0``, writing to
    stdout, and parses the ``key: value`` lines of its output.

    Returns an OrientationConfidence built from the ``Orientation in
    degrees`` and ``Orientation confidence`` entries, each defaulting to 0
    when absent. An angle of 0 with zero confidence is also returned when
    Tesseract times out, or when it fails reporting too few characters or
    an image that is too large.

    Raises SubprocessOutputError for any other Tesseract failure.
    """
    args_tesseract = tess_base_args(['osd'], engine_mode) + [
        '--psm',
        '0',
        fspath(input_file),
        'stdout',
    ]

    try:
        p = run(
            args_tesseract,
            stdout=PIPE,
            stderr=STDOUT,
            timeout=timeout,
            check=True,
            env=tesseract_env,
        )
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        if (
            b'Too few characters. Skipping this page' in e.output
            or b'Image too large' in e.output
        ):
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e

    # stderr is merged into stdout, so the bytes may not be clean UTF-8.
    # Drop undecodable bytes, as tesseract_log_output does, rather than fail.
    text = p.stdout.decode('utf-8', 'ignore')

    osd = {}
    for line in text.splitlines():
        # Split on the first colon only, so a value that itself contains a
        # colon is kept instead of the whole line being discarded.
        key, sep, value = line.partition(':')
        if sep:
            osd[key.strip()] = value.strip()

    return OrientationConfidence(
        angle=int(osd.get('Orientation in degrees', 0)),
        confidence=float(osd.get('Orientation confidence', 0)),
    )


def tesseract_log_output(mainlog, stdout, input_file):
    """Relay Tesseract's captured output to *mainlog*, line by line.

    *stdout* is the raw bytes captured from Tesseract. Each line is
    dropped, rewritten or logged at a level chosen from its content; the
    branch order below matters because several patterns overlap.
    *input_file* is accepted for the callers' benefit but is not used here.

    Raises TesseractConfigError when a line reports an unknown parameter,
    carrying the text that follows ``found: `` on that line.
    """
    import re

    log = TesseractLoggerAdapter(
        mainlog, extra=mainlog.extra if hasattr(mainlog, 'extra') else None
    )

    try:
        text = stdout.decode()
    except UnicodeDecodeError:
        text = stdout.decode('utf-8', 'ignore')

    for line in text.splitlines():
        lowered = line.lower()
        if line.startswith("Tesseract Open Source"):
            continue
        elif line.startswith("Warning in pixReadMem"):
            continue
        elif 'diacritics' in line:
            log.warning("lots of diacritics - possibly poor OCR")
        elif line.startswith('OSD: Weak margin'):
            log.warning("unsure about page orientation")
        elif 'Error in pixScanForForeground' in line:
            pass  # Appears to be spurious/problem with nonwhite borders
        elif 'Error in boxClipToRectangle' in line:
            pass  # Always appears with pixScanForForeground message
        elif 'parameter not found: ' in lowered:
            log.error(line.strip())
            # The test above ignores case, so the split must too; a
            # case-sensitive split raised IndexError on e.g. "Not Found: ".
            pieces = re.split('found: ', line, flags=re.IGNORECASE)
            problem = pieces[1] if len(pieces) > 1 else line.strip()
            raise TesseractConfigError(problem)
        elif 'error' in lowered or 'exception' in lowered:
            log.error(line.strip())
        elif 'warning' in lowered:
            log.warning(line.strip())
        elif 'read_params_file' in lowered:
            log.error(line.strip())
        else:
            log.info(line.strip())


def page_timedout(log, input_file, timeout):
    """Warn that OCR of *input_file* was abandoned because it took too long.

    Nothing is logged when *timeout* is 0.
    """
    if timeout != 0:
        page_tag = f"{(page_number(input_file)):4d}: [tesseract] "
        log.warning(page_tag + " took too long to OCR - skipping")


def _generate_null_hocr(output_hocr, output_sidecar, image):
    """Write placeholder OCR output for a page on which no text is reported.

    *output_hocr* receives HOCR_TEMPLATE filled in with the size of
    *image*, and *output_sidecar* receives the marker text
    ``[skipped page]``.
    """
    from PIL import Image

    with Image.open(image) as picture:
        width, height = picture.size

    with open(output_hocr, 'w', encoding="utf-8") as hocr_file:
        hocr_file.write(HOCR_TEMPLATE.format(width, height))
    with open(output_sidecar, 'w', encoding='utf-8') as sidecar_file:
        sidecar_file.write('[skipped page]')


def generate_hocr(
    input_file,
    output_files,
    language: list,
    engine_mode,
    tessconfig: list,
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
    tesseract_env,
    log,
):
    """Run Tesseract on *input_file* to produce hOCR and sidecar text.

    input_file -- image to analyze
    output_files -- output paths; the first ending in ``.hocr`` and the
        first ending in ``.txt`` are used
    language -- list of languages to consider
    engine_mode -- passed as ``--oem`` unless None
    tessconfig -- extra arguments appended to the command line
    timeout -- timeout handed to the subprocess call
    pagesegmode -- passed as ``--psm`` unless None
    user_words, user_patterns -- passed to Tesseract when truthy
    tesseract_env -- environment for the Tesseract process
    log -- logger object

    If Tesseract times out, or fails reporting that the image is too
    large, a placeholder hOCR file and sidecar are written instead. Any
    other failure raises SubprocessOutputError.
    """
    output_hocr = next(
        path for path in output_files if fspath(path).endswith('.hocr')
    )
    output_sidecar = next(
        path for path in output_files if fspath(path).endswith('.txt')
    )
    # Tesseract is given the output name without its extension
    prefix = os.path.splitext(output_hocr)[0]

    cmd = tess_base_args(language, engine_mode)
    if pagesegmode is not None:
        cmd += ['--psm', str(pagesegmode)]
    if user_words:
        cmd += ['--user-words', user_words]
    if user_patterns:
        cmd += ['--user-patterns', user_patterns]

    # Reminder: test suite tesseract spoofers will break after any changes
    # to the number or order of parameters here
    cmd += [input_file, prefix, 'hocr', 'txt'] + tessconfig

    try:
        completed = run(
            cmd,
            stdout=PIPE,
            stderr=STDOUT,
            timeout=timeout,
            check=True,
            env=tesseract_env,
        )
    except TimeoutExpired:
        # Temporary workaround: hocrTransform cannot function without a
        # valid hOCR file, so write one with no recognized text.
        page_timedout(log, input_file, timeout)
        _generate_null_hocr(output_hocr, output_sidecar, input_file)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        if b'Image too large' not in e.output:
            raise SubprocessOutputError() from e
        _generate_null_hocr(output_hocr, output_sidecar, input_file)
    else:
        tesseract_log_output(log, completed.stdout, input_file)
        # The sidecar text file gets the suffix .txt; rename it to
        # whatever the caller wants it named
        tesseract_sidecar = prefix + '.txt'
        if os.path.exists(tesseract_sidecar):
            shutil.move(tesseract_sidecar, output_sidecar)


def use_skip_page(text_only, skip_pdf, output_pdf, output_text):
    """Write the outputs that stand in for a page that was not OCRed.

    *output_text* always receives the marker ``[skipped page]``. When
    *skip_pdf* is given and *text_only* is false, *output_pdf* is replaced
    by a link to *skip_pdf* made with ``safe_symlink``; otherwise
    *output_pdf* is written as a 0 byte file to indicate a skip.
    """
    with open(output_text, 'w') as sidecar:
        sidecar.write('[skipped page]')

    if text_only or not skip_pdf:
        # The normal case: an empty output file signals the skip
        with open(output_pdf, 'wb') as placeholder:
            placeholder.write(b'')
        return

    # Substitute a "skipped page", clearing any partially created output
    with suppress(FileNotFoundError):
        os.remove(output_pdf)
    safe_symlink(skip_pdf, output_pdf)


def generate_pdf(
    *,
    input_image,
    skip_pdf=None,
    output_pdf,
    output_text,
    language: list,
    engine_mode,
    text_only: bool,
    tessconfig: list,
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
    tesseract_env,
    log,
):
    """Use Tesseract to render a PDF and a sidecar text file.

    input_image -- image to analyze
    skip_pdf -- if we time out, use this file as output
    output_pdf -- file to generate
    output_text -- OCR text file
    language -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    text_only -- enable tesseract text only mode?
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- passed as ``--psm`` unless None
    user_words, user_patterns -- passed to Tesseract when truthy
    tesseract_env -- environment for the Tesseract process
    log -- logger object

    If Tesseract times out, or fails reporting that the image is too
    large, skip-page output is written instead. Any other failure raises
    SubprocessOutputError.
    """
    cmd = tess_base_args(language, engine_mode)
    if pagesegmode is not None:
        cmd += ['--psm', str(pagesegmode)]
    if text_only and has_textonly_pdf(tesseract_env, language):
        cmd += ['-c', 'textonly_pdf=1']
    if user_words:
        cmd += ['--user-words', user_words]
    if user_patterns:
        cmd += ['--user-patterns', user_patterns]

    # Tesseract appends suffixes, so hand it the name without extension
    prefix = os.path.splitext(output_pdf)[0]

    # Reminder: test suite tesseract spoofers might break after any changes
    # to the number or order of parameters here
    cmd += [input_image, prefix, 'pdf', 'txt'] + tessconfig

    try:
        completed = run(
            cmd,
            stdout=PIPE,
            stderr=STDOUT,
            timeout=timeout,
            check=True,
            env=tesseract_env,
        )
    except TimeoutExpired:
        page_timedout(log, input_image, timeout)
        use_skip_page(text_only, skip_pdf, output_pdf, output_text)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_image)
        if b'Image too large' not in e.output:
            raise SubprocessOutputError() from e
        use_skip_page(text_only, skip_pdf, output_pdf, output_text)
    else:
        # Move the sidecar Tesseract wrote to where the caller wants it,
        # then relay Tesseract's output to the log
        tesseract_sidecar = prefix + '.txt'
        if os.path.exists(tesseract_sidecar):
            shutil.move(tesseract_sidecar, output_text)
        tesseract_log_output(log, completed.stdout, input_image)