HEX

File: //usr/lib/python3/dist-packages/ocrmypdf/quality.py
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

import re
from typing import Iterable

"""Utilities to measure OCR quality"""


class OcrQualityDictionary:
    """Manages a dictionary for simple OCR quality checks."""

    def __init__(self, *, wordlist: Iterable[str] = []):
        """Construct a dictionary from a list of words.

        Words for which capitalization is important should be capitalized in the
        dictionary. Words that contain spaces or other punctuation will never match.
        """
        self.dictionary = set()
        self.dictionary.update(w for w in wordlist)

    def measure_words_matched(self, ocr_text: str) -> float:
        """Check how many unique words in the OCR text match a dictionary.

        Words with mixed capitalized are only considered a match if the test word
        matches that capitalization.

        Returns:
            number of words that match / number
        """
        text = re.sub(r"[0-9_]+", ' ', ocr_text)
        text = re.sub(r'\W+', ' ', text)
        text_words_list = re.split(r'\s+', text)
        text_words = {w for w in text_words_list if len(w) >= 3}

        matches = 0
        for w in text_words:
            if w in self.dictionary or (
                w != w.lower() and w.lower() in self.dictionary
            ):
                matches += 1
        if matches > 0:
            hit_ratio = matches / len(text_words)
        else:
            hit_ratio = 0.0
        return hit_ratio