HEX
Server: Apache
System: Linux srv1.prosuiteplus.com 5.4.0-216-generic #236-Ubuntu SMP Fri Apr 11 19:53:21 UTC 2025 x86_64
User: prosuiteplus (1001)
PHP: 8.3.20
Disabled: NONE
Upload Files
File: //lib/python3/dist-packages/ocrmypdf/__pycache__/hocrtransform.cpython-38.pyc
U

��Z^�6�@s6ddlZddlZddlmZddlmZmZmZddlm	Z	ddl
mZddlm
Z
eddd	d
dg�ZGdd
�d
e�ZGdd�d�Zedk�r2ejdd�Zejdddddd�ejddeddd�ejdddd d!�ejd"ddd#d�ejd$d%d&�ejd'd(d&�e��Zeejej�Zejejejejej d)�dS)*�N)�
namedtuple)�atan�cos�sin)�ElementTree)�inch)�Canvas�Rect�x1�y1�x2�y2c@seZdZdS)�HocrTransformErrorN)�__name__�
__module__�__qualname__�rr�8/usr/lib/python3/dist-packages/ocrmypdf/hocrtransform.pyr+src@s�eZdZdZe�d�Ze�dej�Ze	�
dddddd	��Zd
d�Zdd
�Z
dd�Zedd��Zedd��Zdd�Zedd��Zd!dd�Zedd��Zdd �ZdS)"�
HocrTransformz�
    A class for converting documents from the hOCR format.
    For details of the hOCR format, see:
    http://kba.cloud/hocr-spec/
    zbbox((\s+\d+){4})zs
        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+  # +/- decimal float
        ([\-\+]?\d+)            # +/- intZffu	f‌f‌iu	f‌f‌lZfiZfl)uffuffiufflufiuflcCs�||_t�|�|_t�d|j��j�}d|_|r<|�	d�|_d\|_
|_|j�d|j�D]8}|�
|�}|�|�}|j|j|_
|j|j|_q�qZ|j
dks�|jdkr�td��dS)Nz
({.*})html��)NNz.//%sdiv[@class='ocr_page']z$hocr file is missing page dimensions)�dpir�parse�hocr�re�matchZgetroot�tag�xmlns�group�width�height�findall�element_coordinates�
pt_from_pixelrr
r
rr)�selfZhocrFileNamer�matchesZdiv�coordsZ	pt_coordsrrr�__init__Cs

zHocrTransform.__init__cCs6|jdkrdS|j�d|j�}|r.|�|�SdSdS)z=
        Return the textual content of the HTML body
        Nrz	.//%sbody)r�findr�_get_element_text)r$Zbodyrrr�__str__[s

zHocrTransform.__str__cCsLd}|jdk	r||j7}|��D]}||�|�7}q |jdk	rH||j7}|S)zL
        Return the textual content of the element and its children
        rN)�textZgetchildrenr)�tail)r$�elementr+Zchildrrrr)gs



zHocrTransform._get_element_textcsRd}d|jkrN|j�|jd�}|rN|�d����t��fdd�td�D��}|S)zj
        Returns a tuple containing the coordinates of the bounding box around
        an element
        )rrrr�titlerc3s|]}t�|�VqdS�N)�int)�.0�n�r&rr�	<genexpr>sz4HocrTransform.element_coordinates.<locals>.<genexpr>�)�attrib�box_pattern�searchr�splitr	�_make�range)�clsr-�outr%rr3rr"ts
z!HocrTransform.element_coordinatescCs@d|jkr<|j�|jd�}|r<t|�d��t|�d��fSdS)zN
        Returns a tuple containing the baseline slope and intercept.
        r.r�)rr)r6�baseline_patternr8�floatrr0)r<r-r%rrr�baseline�s

zHocrTransform.baselinecst��fdd�|D��S)zQ
        Returns the quantity in PDF units (pt) given quantity in pixels
        c3s|]}|�jtVqdSr/)rr)r1�c�r$rrr4�sz.HocrTransform.pt_from_pixel.<locals>.<genexpr>)r	r:)r$ZpxlrrCrr#�szHocrTransform.pt_from_pixelcCs|�|j�S)z�
        Given an input string, returns the corresponding string that:
        - is available in the helvetica facetype
        - does not contain any ligature (to allow easy search in the PDF file)
        )�	translate�	ligatures)r<�srrr�replace_unsupported_chars�sz'HocrTransform.replace_unsupported_charsNF�	Helveticac
Cs\t||j|jfdd�}|�ddd�|�ddd�|�d�|j�d|jdf�D]f}|�	|��
�}	t|	�dkrrqR|�|�}
|�
|
�}|rR|j|j|j|j|j|j|j|jdd�qRd}|j�d|jd	f�D]}
d
}|�||
d||||�q�|�s&|j�d|jd
f�}|�||d||||�|dk	�rH|j|dd|j|jd�|��|��dS)aQ
        Creates a PDF file with an image superimposed on top of the text.
        Text is positioned according to the bounding box of the lines in
        the hOCR file.
        The image need not be identical to the image used to create the hOCR
        file.
        It can have a lower resolution, different color mode, etc.
        r)ZpagesizeZpageCompressionrz.//%sp[@class='%s']Zocr_par�ZfillF�.//%sspan[@class='%s']Zocr_lineTZ	ocrx_wordz.//%sdiv[@class='%s']Zocr_pageN)rr )rrr �setStrokeColorRGB�setFillColorRGB�setLineWidthrr!rr)�rstrip�lenr"r#�rectr
r
rr�_do_liner(Z	drawImageZshowPageZsave)r$ZoutFileNameZ
imageFileName�showBoundingboxes�fontname�
invisibleText�interwordSpaces�pdf�elem�elemtxt�
pxl_coordsZptZfound_lines�line�rootrrr�to_pdf�s^





���
�

zHocrTransform.to_pdfcCs||d|dS)Nrrr)r<Zpoly�xrrr�polyval�szHocrTransform.polyvalcCsj|�|�}|�|�}	|	j|	j}
|�|�\}}t|�dkr>d}t|�}
t|
�t|
�}}|�	�}||j
t}|
t|�|}|�||�|r�|�
d�|j|	j|}|�r|��|�ddd�|�d�|�|	j||	j|�||f|	j|	j��|�dd�|�dd	d	�|�|||||	j|�|�d	d	d	�|�d
|j|f�}|D�]}|�|���}|�|�}|dk�r|�qP|�|�}|�|�}|�r�|d7}t�|j|	j|j|�d||
�|	jf�}|j|j}|�|||�}|�r|j |j|j|	j||
d	d
�|�!�}|j|d	}||d}|�"||�|d	k�rP|�#d||�|�$|��qP|�%|�dS)Ng{�G�zt?g�gffffff�?g�������?g�?�rrrJr� rI�d)&r"r#r
rrA�absrrrZ	beginTextrrZsetFontZsetTextRenderModer ZsetDashrKrMrZr
rr^ZsetTextTransformrLr!rr)�striprGr	r:ZstringWidthrPZgetStartOfLineZ
moveCursorZ
setHorizScaleZtextOutZdrawText)r$rVrZZ	elemclassrSrTrUrRZpxl_line_coordsZline_boxZline_heightZslopeZ
pxl_interceptZangleZcos_aZsin_ar+Z	interceptZfontsizeZbaseline_y2�elementsrWrXrYZboxZ	box_widthZ
font_widthZcursorZdxZdyrrrrQ�s�




�




��
�
zHocrTransform._do_line)NFrHFF)rrr�__doc__r�compiler7�VERBOSEr?�str�	maketransrEr'r*r)�classmethodr"rAr#rGr\r^rQrrrrr/s8
��





�
M
r�__main__zConvert hocr file to PDF)Zdescriptionz-bz--boundingboxes�
store_trueFzShow bounding boxes borders)�action�default�helpz-rz--resolutioni,z&Resolution of the image that was OCRed)�typerorpz-iz--imagez-Path to the image to be placed above the text)rorpz--interword-spaceszAdd spaces between words�hocrfilez"Path to the hocr file to be parsed)rp�
outputfilez$Path to the PDF file to be generated)rU)!�argparser�collectionsrZmathrrrZ	xml.etreerZreportlab.lib.unitsrZreportlab.pdfgen.canvasrr	�	Exceptionrrr�ArgumentParser�parser�add_argumentr0�
parse_args�argsrrZ
resolutionrr\rsZimageZ
boundingboxesZinterword_spacesrrrr�<module>sb2
�����