Source code for layoutparser.ocr.tesseract_agent

# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import csv
import pickle

import pandas as pd

from .base import BaseOCRAgent, BaseOCRElementType
from ..io import load_dataframe
from ..file_utils import is_pytesseract_available

if is_pytesseract_available():
    import pytesseract


class TesseractFeatureType(BaseOCRElementType):
    """
    Granularity levels reported by the Tesseract detection API.

    Members are ordered from coarsest (``PAGE``) to finest (``WORD``); the
    integer value of a member is its depth in that hierarchy.
    """

    PAGE = 0
    BLOCK = 1
    PARA = 2
    LINE = 3
    WORD = 4

    @property
    def attr_name(self):
        """Name of the Tesseract data column that numbers this level."""
        column_by_level = {
            TesseractFeatureType.PAGE: "page_num",
            TesseractFeatureType.BLOCK: "block_num",
            TesseractFeatureType.PARA: "par_num",
            TesseractFeatureType.LINE: "line_num",
            TesseractFeatureType.WORD: "word_num",
        }
        return column_by_level[self]

    @property
    def group_levels(self):
        """Column names from ``page_num`` down to this level, inclusive."""
        hierarchy = ["page_num", "block_num", "par_num", "line_num", "word_num"]
        # The member value doubles as a list position, so slicing up to
        # value + 1 keeps this level and every coarser one.
        depth = self + 1
        return hierarchy[:depth]
class TesseractAgent(BaseOCRAgent):
    """
    A wrapper for `Tesseract <https://github.com/tesseract-ocr/tesseract>`_ Text
    Detection APIs based on `PyTesseract <https://github.com/madmaze/pytesseract>`_.
    """

    DEPENDENCIES = ["pytesseract"]

    def __init__(self, languages="eng", **kwargs):
        """Create a Tesseract OCR Agent.

        Args:
            languages (:obj:`list` or :obj:`str`, optional):
                You can specify the language code(s) of the documents to detect
                to improve accuracy. The supported language and their code can
                be found on
                `its github repo <https://github.com/tesseract-ocr/langdata>`_.
                It supports two formats: 1) you can pass in the languages code
                as a string of format like `"eng+fra"`, or 2) you can pack them
                as a list of strings `["eng", "fra"]`.
                Defaults to 'eng'.
            **kwargs:
                Extra keyword arguments, stored as-is and forwarded to both
                ``pytesseract.image_to_string`` and
                ``pytesseract.image_to_data`` on every detection call.
        """
        # Tesseract expects several languages joined with "+", e.g. "eng+fra".
        self.lang = languages if isinstance(languages, str) else "+".join(languages)
        self.configs = kwargs

    @classmethod
    def with_tesseract_executable(cls, tesseract_cmd_path, **kwargs):
        """Create an agent that uses a specific Tesseract executable.

        Note that this assigns the module-level
        ``pytesseract.pytesseract.tesseract_cmd`` attribute, so the new path is
        not limited to the returned agent.

        Args:
            tesseract_cmd_path (:obj:`str`):
                Path of the Tesseract executable to use.
            **kwargs:
                Passed through to the constructor.

        Returns:
            A new instance of this class.
        """
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd_path
        return cls(**kwargs)

    def _detect(self, img_content):
        """Run Tesseract on the image and collect the raw outputs.

        Returns:
            :obj:`dict`: ``"text"`` holds the string returned by
            ``pytesseract.image_to_string``; ``"data"`` holds the tab-separated
            output of ``pytesseract.image_to_data`` parsed into a
            :obj:`pandas.DataFrame`.
        """
        res = {}
        res["text"] = pytesseract.image_to_string(
            img_content, lang=self.lang, **self.configs
        )
        _data = pytesseract.image_to_data(img_content, lang=self.lang, **self.configs)
        res["data"] = pd.read_csv(
            io.StringIO(_data),
            # Recognised text may contain quote characters; they must be kept
            # as literal text instead of being interpreted as CSV quoting.
            quoting=csv.QUOTE_NONE,
            encoding="utf-8",
            sep="\t",
            # Keep the text column as strings so that tokens are not parsed
            # into numbers.
            converters={"text": str},
        )
        return res

    def detect(
        self, image, return_response=False, return_only_text=True, agg_output_level=None
    ):
        """Send the input image for OCR.

        The flags are checked in the order ``return_response``,
        ``return_only_text``, ``agg_output_level``; the first one that applies
        decides the return value. Because ``return_only_text`` defaults to
        `True`, ``agg_output_level`` only takes effect when
        ``return_only_text=False`` is passed explicitly.

        Args:
            image (:obj:`np.ndarray` or :obj:`str`):
                The input image array or the name of the image file
            return_response (:obj:`bool`, optional):
                Whether directly return all output (string and boxes
                info) from Tesseract.
                Defaults to `False`.
            return_only_text (:obj:`bool`, optional):
                Whether return only the texts in the OCR results.
                Defaults to `True`.
            agg_output_level (:obj:`~TesseractFeatureType`, optional):
                When set, aggregate the Tesseract output with respect to the
                specified aggregation level. Defaults to `None`.

        Returns:
            The raw response :obj:`dict` if ``return_response`` is set, the
            output of :meth:`gather_data` if aggregation applies, and the
            recognised text (:obj:`str`) otherwise.
        """
        res = self._detect(image)

        if return_response:
            return res

        if return_only_text:
            return res["text"]

        if agg_output_level is not None:
            return self.gather_data(res, agg_output_level)

        return res["text"]

    @staticmethod
    def gather_data(response, agg_level):
        """
        Gather the OCR'ed text, bounding boxes, and confidence
        in a given aggregation level.

        Args:
            response (:obj:`dict`):
                A raw response whose ``"data"`` entry is the Tesseract data
                table (columns ``left``, ``top``, ``width``, ``height``,
                ``conf``, ``text`` and the ``*_num`` level columns).
            agg_level (:obj:`~TesseractFeatureType`):
                The level at which rows are grouped.

        Returns:
            The result of ``load_dataframe`` applied to a table with one row
            per group and the columns ``id``, ``x_1``, ``y_1``, ``score``,
            ``text``, ``x_2``, ``y_2`` and ``block_type``.
        """
        assert isinstance(
            agg_level, TesseractFeatureType
        ), f"Invalid agg_level {agg_level}"

        def _summarize(group):
            # The box of a group is the union of its member boxes, so the far
            # edges are the largest right/bottom coordinates. Adding the
            # largest width to the smallest left would misplace the edge
            # whenever the widest element is not also the left-most one.
            return pd.Series(
                {
                    "x_1": group["left"].min(),
                    "y_1": group["top"].min(),
                    "x_2": (group["left"] + group["width"]).max(),
                    "y_2": (group["top"] + group["height"]).max(),
                    "score": group["conf"].mean(),
                    "text": group["text"].str.cat(sep=" "),
                }
            )

        res = response["data"]
        df = (
            res[~res.text.isna()]
            .groupby(agg_level.group_levels)
            .apply(_summarize)
            # Discard the group keys, then expose a fresh running index as the
            # element id.
            .reset_index(drop=True)
            .reset_index()
            .rename(columns={"index": "id"})
            .assign(block_type="rectangle")
        )

        # Same column order as produced before the bounding-box fix.
        column_order = ["id", "x_1", "y_1", "score", "text", "x_2", "y_2", "block_type"]
        return load_dataframe(df[column_order])

    @staticmethod
    def load_response(filename):
        """Load a response previously written by :meth:`save_response`.

        Args:
            filename (:obj:`str`): Path of the pickle file to read.

        Returns:
            The unpickled object.
        """
        # SECURITY: unpickling can execute arbitrary code. Only load files
        # that come from a trusted source.
        with open(filename, "rb") as fp:
            res = pickle.load(fp)
        return res

    @staticmethod
    def save_response(res, file_name):
        """Pickle a response to disk.

        Args:
            res: The response object to store.
            file_name (:obj:`str`): Path of the file to write.
        """
        with open(file_name, "wb") as fp:
            pickle.dump(res, fp, protocol=pickle.HIGHEST_PROTOCOL)