Source code for layoutparser.io.basic

# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast
import json
from typing import List, Union, Dict, Dict, Any

import pandas as pd

from ..elements import (
    BaseLayoutElement,
    TextBlock,
    Layout,
    BASECOORD_ELEMENT_NAMEMAP,
)


def load_json(filename: str) -> Union[BaseLayoutElement, Layout]:
    """Load a JSON file and convert it into a layout object.

    The file is decoded as UTF-8 (the encoding used for JSON interchange)
    rather than the platform's locale encoding, so files containing
    non-ASCII text load the same way on every operating system.

    Args:
        filename (str):
            The name of the JSON file.

    Returns:
        Union[BaseLayoutElement, Layout]:
            The parsed JSON content is passed to `load_dict`, which
            chooses the output type from the structure of the data.
    """
    # Explicit encoding: the default for open() depends on the locale and
    # is not UTF-8 on every platform.
    with open(filename, "r", encoding="utf-8") as fp:
        res = json.load(fp)

    return load_dict(res)
def load_dict(data: Union[Dict, List[Dict]]) -> Union[BaseLayoutElement, Layout]:
    """Load a dict or a list of dicts describing layout data.

    The structure of `data` decides what is built:

    * a list is loaded element by element and wrapped in a `Layout`;
    * a dict containing a `page_data` key is treated as a serialized
      layout whose elements are stored under the `blocks` key;
    * any other dict is treated as a single layout element whose class
      is selected by its `block_type` entry. If the dict contains any of
      the `TextBlock._features` keys, a `TextBlock` is built instead.

    Args:
        data (Union[Dict, List]):
            A dict or a list of dicts representing the layout data.

    Raises:
        ValueError:
            If `data` is neither a dict nor a list.
        ValueError:
            If a dict has `page_data` but no `blocks` entry, or an
            element dict has no `block_type` entry.
        ValueError:
            If the `block_type` name is not one of the layout element
            names defined in `BASECOORD_ELEMENT_NAMEMAP`.

    Returns:
        Union[BaseLayoutElement, Layout]:
            A `Layout` for list or layout-style input, otherwise a single
            layout element.
    """
    if isinstance(data, list):
        return Layout([load_dict(ele) for ele in data])

    if not isinstance(data, dict):
        raise ValueError("Invalid input JSON structure.")

    if "page_data" in data:
        # It is a layout instance: the elements live under "blocks".
        if "blocks" not in data:
            raise ValueError(
                "Invalid layout data: `page_data` is present but `blocks` is missing."
            )
        return Layout(load_dict(data["blocks"])._blocks, page_data=data["page_data"])

    # Report malformed element dicts with the documented ValueError instead
    # of letting a bare KeyError escape from the lookup below.
    if "block_type" not in data:
        raise ValueError("Invalid layout element data: `block_type` is missing.")

    block_type = data["block_type"]
    if block_type not in BASECOORD_ELEMENT_NAMEMAP:
        raise ValueError(f"Invalid block_type {block_type}")

    # Any TextBlock-specific key means the element must be loaded as a
    # TextBlock rather than as the bare coordinate element.
    if any(ele in data for ele in TextBlock._features):
        return TextBlock.from_dict(data)

    return BASECOORD_ELEMENT_NAMEMAP[block_type].from_dict(data)
def load_csv(filename: str, block_type: str = None) -> Layout:
    """Read a CSV file and build a Layout object from its rows.

    Args:
        filename (str):
            The name of the CSV file. A row of the table represents
            an individual layout element.

        block_type (str):
            If there's no block_type column in the CSV file, you must
            pass in a block_type variable such that layout parser can
            appropriately detect the type of the layout elements.

    Returns:
        Layout:
            The parsed Layout object from the CSV file.
    """
    # Parse the file into a table first, then delegate the conversion.
    table = pd.read_csv(filename)
    return load_dataframe(table, block_type=block_type)
def load_dataframe(df: pd.DataFrame, block_type: str = None) -> Layout:
    """Load the Layout object from the given dataframe.

    Each row is converted to a dict (cells that are NaN are dropped) and
    the resulting list of dicts is loaded with `load_dict`. The input
    dataframe is copied first and is never modified.

    Args:
        df (pd.DataFrame):
            A table in which each row represents an individual layout
            element. String cells in an object-dtype `points` column are
            parsed as Python literals; cells that already hold parsed
            values (e.g. lists) or are missing are kept as they are.

        block_type (str):
            If there's no block_type column in the dataframe, you must
            pass in a block_type variable such that layout parser can
            appropriately detect the type of the layout elements. When
            given, it overrides any existing `block_type` column.

    Raises:
        ValueError:
            If `block_type` is neither passed as an argument nor present
            as a column of the dataframe.

    Returns:
        Layout:
            The parsed Layout object from the dataframe.
    """
    # Work on a copy so the caller's dataframe is left untouched.
    df = df.copy()

    if "points" in df.columns and df["points"].dtype == object:
        # Only textual cells need parsing. Checking for `str` (rather than
        # "not NaN") keeps already-parsed lists/tuples intact; those would
        # otherwise break both `pd.isna` truthiness and `ast.literal_eval`.
        df["points"] = df["points"].map(
            lambda x: ast.literal_eval(x) if isinstance(x, str) else x
        )

    if block_type is None:
        if "block_type" not in df.columns:
            raise ValueError(
                "`block_type` not specified both in dataframe and arguments"
            )
    else:
        df["block_type"] = block_type

    if any(col in TextBlock._features for col in df.columns):
        # Automatically setting index for textblock
        if "id" not in df.columns:
            df["id"] = df.index

    records = df.apply(lambda x: x.dropna().to_dict(), axis=1).to_list()
    return load_dict(records)