Skip to content

Commit 5980c40

Browse files
authored
Add export_kv_to_pandas to Document
2 parents e75f0b8 + 189e31c commit 5980c40

File tree

1 file changed

+45
-2
lines changed

1 file changed

+45
-2
lines changed

textractor/entities/document.py

Lines changed: 45 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
import xlsxwriter
1010
import io
1111
from pathlib import Path
12-
from typing import List, IO, Union, AnyStr, Tuple
12+
from typing import List, IO, Union, AnyStr, Tuple, Optional
1313
from copy import deepcopy
1414
from collections import defaultdict
1515
from PIL import Image
@@ -23,7 +23,7 @@
2323
from textractor.entities.query import Query
2424
from textractor.entities.signature import Signature
2525
from textractor.entities.layout import Layout
26-
from textractor.exceptions import InputError
26+
from textractor.exceptions import InputError, MissingDependencyException
2727
from textractor.entities.key_value import KeyValue
2828
from textractor.entities.bbox import SpatialObject
2929
from textractor.utils.s3_utils import download_from_s3
@@ -608,6 +608,49 @@ def export_kv_to_csv(
608608
f"csv file stored at location {os.path.join(os.getcwd(),filepath)}"
609609
)
610610

611+
def export_kv_to_pandas(
    self,
    trim: bool = True,
    confidence_threshold: float = 0.0,
    drop_confidence: bool = True,
):
    """
    Export the document's key-value pairs to a pandas DataFrame, optionally filtered by confidence.

    :param trim: When True, leading/trailing colons and spaces and then any remaining surrounding
                 whitespace are stripped from the key text, and surrounding whitespace is stripped
                 from the value text. When False, both texts are kept verbatim. Default is True.
    :type trim: bool
    :param confidence_threshold: Minimum confidence (inclusive) a key-value pair must have to be
                                 included in the DataFrame. Default is 0.0.
    :type confidence_threshold: float
    :param drop_confidence: Flag to exclude the confidence column from the DataFrame. Default is True.
    :type drop_confidence: bool

    :return: A pandas DataFrame with one row per retained key-value pair, in the order of
             ``self.key_values``. It has 'Key' and 'Value' columns, plus a 'Confidence' column
             if `drop_confidence` is False.
    :rtype: pd.DataFrame
    :raises MissingDependencyException: If pandas cannot be imported.
    """
    # Imported lazily so that a missing pandas install only surfaces when this method is called.
    try:
        from pandas import DataFrame
    except ImportError as err:
        raise MissingDependencyException(
            "The pandas library is required for exporting key-values to DataFrame objects. "
            "Please install it with `pip install pandas`."
        ) from err

    # The threshold is documented as a minimum, so a pair sitting exactly on it is kept.
    selected = [kv for kv in self.key_values if kv.confidence >= confidence_threshold]

    data = {
        # Keys commonly end with ": " in forms, hence the extra colon/space strip for keys only.
        "Key": [kv.key.text.strip(": ").strip() if trim else kv.key.text for kv in selected],
        "Value": [kv.value.text.strip() if trim else kv.value.text for kv in selected],
    }
    if not drop_confidence:
        data["Confidence"] = [kv.confidence for kv in selected]

    return DataFrame(data)
653+
611654
def export_kv_to_txt(
612655
self,
613656
include_kv: bool = True,

0 commit comments

Comments
 (0)