|
9 | 9 | import xlsxwriter |
10 | 10 | import io |
11 | 11 | from pathlib import Path |
12 | | -from typing import List, IO, Union, AnyStr, Tuple |
| 12 | +from typing import List, IO, Union, AnyStr, Tuple, Optional |
13 | 13 | from copy import deepcopy |
14 | 14 | from collections import defaultdict |
15 | 15 | from PIL import Image |
|
23 | 23 | from textractor.entities.query import Query |
24 | 24 | from textractor.entities.signature import Signature |
25 | 25 | from textractor.entities.layout import Layout |
26 | | -from textractor.exceptions import InputError |
| 26 | +from textractor.exceptions import InputError, MissingDependencyException |
27 | 27 | from textractor.entities.key_value import KeyValue |
28 | 28 | from textractor.entities.bbox import SpatialObject |
29 | 29 | from textractor.utils.s3_utils import download_from_s3 |
@@ -608,6 +608,49 @@ def export_kv_to_csv( |
608 | 608 | f"csv file stored at location {os.path.join(os.getcwd(),filepath)}" |
609 | 609 | ) |
610 | 610 |
|
def export_kv_to_pandas(
    self,
    trim: bool = True,
    confidence_threshold: float = 0.0,
    drop_confidence: bool = True,
):
    """
    Convert the key-value pairs in ``self.key_values`` into a pandas DataFrame,
    optionally filtering them by confidence.

    :param trim: When True, leading/trailing colons and whitespace are stripped from the key text
                 and leading/trailing whitespace is stripped from the value text. Default is True.
    :type trim: bool
    :param confidence_threshold: Minimum confidence (inclusive) a key-value pair must have to be
                                 included in the DataFrame. Default is 0.0.
    :type confidence_threshold: float
    :param drop_confidence: Flag to exclude the confidence column from the DataFrame. Default is True.
    :type drop_confidence: bool

    :return: A pandas DataFrame with 'Key' and 'Value' columns, plus a 'Confidence' column
             if `drop_confidence` is False. One row per retained key-value pair, in iteration order.
    :rtype: pd.DataFrame
    :raises MissingDependencyException: If pandas is not installed.
    """
    # pandas is an optional dependency, so it is imported lazily here.
    try:
        from pandas import DataFrame
    except ImportError as err:
        raise MissingDependencyException(
            "The pandas library is required for exporting key-value pairs to DataFrame objects. Please install it with `pip install pandas`."
        ) from err

    keys: List[str] = []
    values: List[str] = []
    # None means "do not collect confidences at all".
    confidences: Optional[List[float]] = None if drop_confidence else []

    for row in self.key_values:
        # Inclusive comparison: the threshold is documented as a minimum, so a
        # pair sitting exactly on it (including 0.0 with the default) is kept.
        if row.confidence >= confidence_threshold:
            key_text = row.key.text
            value_text = row.value.text
            if trim:
                # Keys commonly end with a colon separator; drop it along with
                # any surrounding whitespace.
                key_text = key_text.strip(": ").strip()
                value_text = value_text.strip()
            keys.append(key_text)
            values.append(value_text)
            if confidences is not None:
                confidences.append(row.confidence)

    data = {'Key': keys, 'Value': values}
    if confidences is not None:
        data['Confidence'] = confidences

    return DataFrame(data)
| 653 | + |
611 | 654 | def export_kv_to_txt( |
612 | 655 | self, |
613 | 656 | include_kv: bool = True, |
|
0 commit comments