File: //usr/lib/python3/dist-packages/ocrmypdf/api.py
# © 2019 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import logging
import os
import sys
from contextlib import suppress
from enum import IntEnum
from pathlib import Path
from typing import Dict, List
from tqdm import tqdm
from ._sync import run_pipeline
from ._validation import check_options
from .cli import parser
class TqdmConsole:
    """File-like adapter that sends log text through ``tqdm.write``.

    Routing messages through tqdm lets it print them above an active progress
    bar and then redraw the bar, instead of the message overwriting the bar,
    which looks messy.

    Python 3.6 occasionally emits extra empty messages, so on that version
    whitespace-only messages are discarded.
    """

    def __init__(self, file):
        # Detect Python 3.6 once so write() can filter its blank messages.
        self.py36 = sys.version_info[0:2] == (3, 6)
        self.file = file

    def write(self, msg):
        """Emit *msg* (trailing whitespace trimmed) followed by a newline."""
        # Only Python 3.6 gets the blank-message filter.
        if self.py36 and msg.strip() == '':
            return
        # When no progress bar is active, tqdm.write() routes to print()
        tqdm.write(msg.rstrip(), end='\n', file=self.file)

    def flush(self):
        """Flush the wrapped file; objects without ``flush`` are tolerated."""
        try:
            self.file.flush()
        except AttributeError:
            pass
class Verbosity(IntEnum):
    """Console log detail levels understood by configure_logging."""

    quiet = -1  #: Console handler only shows errors
    default = 0  #: Console handler shows informational messages and above
    debug = 1  #: Console handler also shows debug messages
    debug_all = 2  #: Debug output with logger names; pdfminer/PIL not quieted
def configure_logging(verbosity, progress_bar_friendly=True, manage_root_logger=False):
    """Attach a stderr console handler to the ocrmypdf (or root) logger.

    Library users can call this to get log output that resembles the ocrmypdf
    command line interface. Applications that do not call it are expected to
    configure logging themselves, and may apply further configuration after
    calling it.

    ocrmypdf logs under the ``"ocrmypdf"`` namespace. It also imports
    pdfminer, which logs under ``"pdfminer"`` and is extremely chatty at
    ``logging.INFO``; unless ``verbosity`` is 2 or more, this function raises
    the ``"pdfminer"`` logger to ``ERROR`` and the ``"PIL"`` logger to ``INFO``.

    Args:
        verbosity (Verbosity): Verbosity level. Negative values limit the
            console to errors, 0 shows info and above, 1 or more shows debug
            messages, and 2 or more also prefixes each line with the logger
            name.
        progress_bar_friendly (bool): Wrap stderr in TqdmConsole so messages
            do not overwrite the tqdm progress bar.
        manage_root_logger (bool): Configure the process's root logger instead
            of ``"ocrmypdf"``, and redirect ``warnings`` into logging.

    Returns:
        The logger that was configured: the ``"ocrmypdf"`` logger, or the root
        logger when ``manage_root_logger`` is true.
    """
    logger_name = '' if manage_root_logger else 'ocrmypdf'
    log = logging.getLogger(logger_name)
    # The logger itself passes everything; the handler level does the filtering.
    log.setLevel(logging.DEBUG)

    stream = TqdmConsole(sys.stderr) if progress_bar_friendly else sys.stderr
    console = logging.StreamHandler(stream=stream)

    if verbosity < 0:
        console_level = logging.ERROR
    elif verbosity >= 1:
        console_level = logging.DEBUG
    else:
        console_level = logging.INFO
    console.setLevel(console_level)

    line_format = '%(levelname)7s - %(message)s'
    if verbosity >= 2:
        line_format = '%(name)s - %(levelname)7s - %(message)s'
    console.setFormatter(logging.Formatter(line_format))

    log.addHandler(console)

    if verbosity <= 1:
        # Quiet the noisy third-party loggers unless full debugging is requested.
        logging.getLogger('pdfminer').setLevel(logging.ERROR)
        logging.getLogger('PIL').setLevel(logging.INFO)

    if manage_root_logger:
        logging.captureWarnings(True)

    return log
def create_options(*, input_file, output_file, **kwargs):
    """Build a parsed options namespace from API keyword arguments.

    Each keyword argument is converted to its command line equivalent
    (``some_option`` becomes ``--some-option``) and the resulting argument
    list is handed to the command line ``parser``, so that API calls follow
    the same parsing path as the command line interface.

    Conversion rules:

    * ``None`` values are omitted entirely.
    * ``tesseract_env`` and ``progress_bar`` bypass the parser and are set
      directly on the returned namespace.
    * ``bool`` values add the bare flag when true and nothing when false.
    * ``list``/``tuple`` values repeat the flag once per item, e.g.
      ``language=['eng', 'deu']`` yields ``--language eng --language deu``.
    * ``int``, ``float``, ``str`` and path-like values are passed as text.

    Args:
        input_file: Input path; converted with ``str()`` and passed as the
            first positional argument.
        output_file: Output path; converted with ``str()`` and passed as the
            second positional argument.
        **kwargs: Option values keyed by the underscore form of the command
            line option name.

    Returns:
        The namespace returned by ``parser.parse_args``, with the deferred
        keywords added as attributes.

    Raises:
        TypeError: If a value (or a sequence item) has an unsupported type.
    """
    cmdline = []
    deferred = []

    for arg, val in kwargs.items():
        if val is None:
            continue

        # These arguments get special handling: they bypass argparse and are
        # attached to the parsed namespace afterwards.
        if arg in {'tesseract_env', 'progress_bar'}:
            deferred.append((arg, val))
            continue

        flag = f"--{arg.replace('_', '-')}"

        # Booleans are special: add only if True, omit for False
        if isinstance(val, bool):
            if val:
                cmdline.append(flag)
            continue

        # ocr() documents list-valued options (language, tesseract_config);
        # repeat the flag for every item. An empty sequence adds nothing.
        # NOTE(review): assumes the parser declares these options with
        # action='append' -- confirm against cli.py.
        items = val if isinstance(val, (list, tuple)) else (val,)

        for item in items:
            # We have a parameter
            cmdline.append(flag)
            if isinstance(item, (int, float)):
                cmdline.append(str(item))
            elif isinstance(item, str):
                cmdline.append(item)
            elif isinstance(item, os.PathLike):
                # Accept any path-like object, not just pathlib.Path; for a
                # Path this gives the same text as str(item).
                cmdline.append(os.fsdecode(item))
            else:
                raise TypeError(f"{arg}: {item} ({type(item)})")

    cmdline.append(str(input_file))
    cmdline.append(str(output_file))

    parser.api_mode = True
    options = parser.parse_args(cmdline)
    for keyword, val in deferred:
        setattr(options, keyword, val)

    # If we are running a Tesseract spoof, ensure it knows what the input file is
    if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env:
        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)

    return options
def ocr(  # pylint: disable=unused-argument
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    language: List[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: List[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    tesseract_env: Dict[str, str] = None,
):
    """Run OCRmyPDF on one PDF or image.

    All arguments are collected and passed to create_options, which converts
    them to options; arguments left as ``None`` are omitted there. The options
    are then validated with check_options and the pipeline is run.

    For most arguments, see documentation for the equivalent command line
    parameter.

    A few specific arguments are discussed here:

    Args:
        use_threads (bool): Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        tesseract_env (dict): Override environment variables for Tesseract

    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # locals() is evaluated here, before any other local name is bound, so it
    # holds exactly this call's parameters. Keep this as the first statement;
    # this is also why the parameters look unused to pylint.
    options = create_options(**locals())
    check_options(options)
    return run_pipeline(options, api=True)