Skip to content

Commit 9dc9ee8

Browse files
fdosanimparaz
andauthored
feat: add per-column tolerance support for comparison operations (#426)
* feat: add per-column tolerance validation and comparison functionality for pandas * Snowflake per-column tolerances (#417) * Add Snowflake per-column tolerances. * Use the original abs_tol and rel_tol parameters but take both the float and the dict options. * Remove docstring for removed parameters. * Add tests for _all_mismatch, fix ruff lint. * Change report format to include per-column tolerances and change the row summary to default. * Ruff formatting. * Remove breakpoint() . * Use orig_col_name without _MATCH as the key for the tolerances. --------- Co-authored-by: Faisal <[email protected]> * feat: enhance report template with default tolerances and remove unused snowflake row summary template * refactor tolerance validation and enhance column tolerance retrieval functionality * implement per-column tolerance validation and comparison in PolarsCompare * enhance SparkSQLCompare to support per-column absolute and relative tolerances * make relative and absolute tolerances available in fugue report output * small cleanup and improvements * add tests for get_column_tolerance --------- Co-authored-by: Miguel Paraz <[email protected]>
1 parent a0a145e commit 9dc9ee8

File tree

14 files changed

+928
-95
lines changed

14 files changed

+928
-95
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,10 @@ pip install datacompy[snowflake]
4646

4747
### LegacySparkCompare and SparkPandasCompare removal
4848

49-
With version ``v0.17.0`` the ``LegacySparkCompare`` and ``SparkPandasCompare`` have been removed.
49+
Starting with v0.17.0, both `LegacySparkCompare` and `SparkPandasCompare` have been removed.
5050

5151

52-
#### Supported versions and dependncies
52+
#### Supported versions and dependencies
5353

5454
Different versions of Spark, Pandas, and Python interact differently. Below is a matrix of what we test with.
5555
With the move to the Pandas on Spark API and compatibility issues with Pandas 2+, we will for the time being not support Pandas 2

datacompy/base.py

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import logging
2525
from abc import ABC, abstractmethod
2626
from pathlib import Path
27-
from typing import Any
27+
from typing import Any, Dict
2828

2929
from jinja2 import Environment, FileSystemLoader, select_autoescape
3030
from ordered_set import OrderedSet
@@ -348,3 +348,98 @@ def df_to_str(df: Any, sample_count: int | None = None, on_index: bool = False)
348348

349349
# Fallback to str() if we can't determine the type
350350
return str(df)
351+
352+
353+
def get_column_tolerance(column: str, tol_dict: Dict[str, float]) -> float:
    """Look up the tolerance to apply for a given column.

    Parameters
    ----------
    column : str
        Column name whose tolerance is requested.
    tol_dict : dict of str to float
        Mapping of column names to tolerance values. A "default" key,
        if present, supplies the fallback for columns not listed.

    Returns
    -------
    float
        The column's tolerance, the "default" tolerance when the column
        is absent, or 0.0 when neither key exists.
    """
    # Resolve the fallback first so a missing column and a missing
    # "default" both degrade gracefully to 0.0.
    fallback = tol_dict.get("default", 0.0)
    return tol_dict.get(column, fallback)
372+
373+
374+
def _validate_tolerance_parameter(
375+
param_value: float | Dict[str, float],
376+
param_name: str,
377+
case_mode: str = "lower",
378+
) -> Dict[str, float]:
379+
"""Validate and normalize tolerance parameter input.
380+
381+
Parameters
382+
----------
383+
param_value : float or dict
384+
The tolerance value to validate. Can be either a float or a dictionary mapping
385+
column names to float values.
386+
param_name : str
387+
Name of the parameter being validated ('abs_tol' or 'rel_tol')
388+
case_mode : str
389+
How to handle column name case. Options are:
390+
- "lower": convert to lowercase
391+
- "upper": convert to uppercase
392+
- "preserve": keep original case
393+
394+
Returns
395+
-------
396+
dict
397+
Normalized dictionary of tolerance values
398+
399+
Raises
400+
------
401+
TypeError
402+
If param_value is not a float or dict
403+
ValueError
404+
If any tolerance values are not numeric or negative or if case_mode is invalid
405+
"""
406+
if case_mode not in ["lower", "upper", "preserve"]:
407+
raise ValueError("case_mode must be 'lower', 'upper', or 'preserve'")
408+
409+
# If float, convert to dict with default value
410+
if isinstance(param_value, int | float):
411+
if param_value < 0:
412+
raise ValueError(f"{param_name} cannot be negative")
413+
return {"default": float(param_value)}
414+
415+
# If dict, validate values and format
416+
if isinstance(param_value, dict):
417+
result = {}
418+
419+
# Convert all values to float and validate
420+
for col, value in param_value.items():
421+
if not isinstance(value, int | float):
422+
raise ValueError(
423+
f"Value for column '{col}' in {param_name} must be numeric"
424+
)
425+
if value < 0:
426+
raise ValueError(
427+
f"Value for column '{col}' in {param_name} cannot be negative"
428+
)
429+
430+
# Handle column name case according to case_mode
431+
col_key = str(col)
432+
if case_mode == "lower":
433+
col_key = col_key.lower()
434+
elif case_mode == "upper":
435+
col_key = col_key.upper()
436+
437+
result[col_key] = float(value)
438+
439+
# If no default provided, add 0.0
440+
if "default" not in result:
441+
result["default"] = 0.0
442+
443+
return result
444+
445+
raise TypeError(f"{param_name} must be a float or dictionary")

datacompy/core.py

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@
3030

3131
from datacompy.base import (
3232
BaseCompare,
33+
_validate_tolerance_parameter,
3334
df_to_str,
35+
get_column_tolerance,
3436
render,
3537
save_html_report,
3638
temp_column_name,
@@ -59,10 +61,14 @@ class Compare(BaseCompare):
5961
If True, the index will be used to join the two dataframes. If both
6062
``join_columns`` and ``on_index`` are provided, an exception will be
6163
raised.
62-
abs_tol : float, optional
63-
Absolute tolerance between two values.
64-
rel_tol : float, optional
65-
Relative tolerance between two values.
64+
abs_tol : float or dict, optional
65+
Absolute tolerance between two values. Can be either a float value applied to all columns,
66+
or a dictionary mapping column names to specific tolerance values. The special key "default"
67+
in the dictionary specifies the tolerance for columns not explicitly listed.
68+
rel_tol : float or dict, optional
69+
Relative tolerance between two values. Can be either a float value applied to all columns,
70+
or a dictionary mapping column names to specific tolerance values. The special key "default"
71+
in the dictionary specifies the tolerance for columns not explicitly listed.
6672
df1_name : str, optional
6773
A string name for the first dataframe. This allows the reporting to
6874
print out an actual name instead of "df1", and allows human users to
@@ -91,15 +97,24 @@ def __init__(
9197
df2: pd.DataFrame,
9298
join_columns: List[str] | str | None = None,
9399
on_index: bool = False,
94-
abs_tol: float = 0,
95-
rel_tol: float = 0,
100+
abs_tol: float | Dict[str, float] = 0,
101+
rel_tol: float | Dict[str, float] = 0,
96102
df1_name: str = "df1",
97103
df2_name: str = "df2",
98104
ignore_spaces: bool = False,
99105
ignore_case: bool = False,
100106
cast_column_names_lower: bool = True,
101107
) -> None:
102108
self.cast_column_names_lower = cast_column_names_lower
109+
110+
# Validate tolerance parameters first
111+
self._abs_tol_dict = _validate_tolerance_parameter(
112+
abs_tol, "abs_tol", "lower" if cast_column_names_lower else "preserve"
113+
)
114+
self._rel_tol_dict = _validate_tolerance_parameter(
115+
rel_tol, "rel_tol", "lower" if cast_column_names_lower else "preserve"
116+
)
117+
103118
if on_index and join_columns is not None:
104119
raise Exception("Only provide on_index or join_columns")
105120
elif on_index:
@@ -369,12 +384,12 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
369384
[
370385
self.intersect_rows,
371386
columns_equal(
372-
self.intersect_rows[col_1],
373-
self.intersect_rows[col_2],
374-
self.rel_tol,
375-
self.abs_tol,
376-
ignore_spaces,
377-
ignore_case,
387+
col_1=self.intersect_rows[col_1],
388+
col_2=self.intersect_rows[col_2],
389+
rel_tol=get_column_tolerance(column, self._rel_tol_dict),
390+
abs_tol=get_column_tolerance(column, self._abs_tol_dict),
391+
ignore_spaces=ignore_spaces,
392+
ignore_case=ignore_case,
378393
).to_frame(name=col_match),
379394
],
380395
axis=1,
@@ -414,6 +429,8 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
414429
),
415430
"max_diff": max_diff,
416431
"null_diff": null_diff,
432+
"rel_tol": get_column_tolerance(column, self._rel_tol_dict),
433+
"abs_tol": get_column_tolerance(column, self._abs_tol_dict),
417434
}
418435
)
419436

@@ -589,12 +606,12 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> pd.DataFrame:
589606
orig_col_name = col[:-6]
590607

591608
col_comparison = columns_equal(
592-
self.intersect_rows[orig_col_name + "_" + self.df1_name],
593-
self.intersect_rows[orig_col_name + "_" + self.df2_name],
594-
self.rel_tol,
595-
self.abs_tol,
596-
self.ignore_spaces,
597-
self.ignore_case,
609+
col_1=self.intersect_rows[orig_col_name + "_" + self.df1_name],
610+
col_2=self.intersect_rows[orig_col_name + "_" + self.df2_name],
611+
rel_tol=get_column_tolerance(orig_col_name, self._rel_tol_dict),
612+
abs_tol=get_column_tolerance(orig_col_name, self._abs_tol_dict),
613+
ignore_spaces=self.ignore_spaces,
614+
ignore_case=self.ignore_case,
598615
)
599616

600617
if not ignore_matching_cols or (
@@ -717,6 +734,8 @@ def _get_mismatch_stats(self, sample_count: int) -> dict:
717734
"unequal_cnt": column["unequal_cnt"],
718735
"max_diff": column["max_diff"],
719736
"null_diff": column["null_diff"],
737+
"rel_tol": column["rel_tol"],
738+
"abs_tol": column["abs_tol"],
720739
}
721740
)
722741
if column["unequal_cnt"] > 0:
@@ -961,6 +980,7 @@ def columns_equal(
961980
col_2, ignore_spaces=ignore_spaces, ignore_case=ignore_case
962981
)
963982

983+
# Rest of comparison logic using rel_tol and abs_tol
964984
# short circuit if comparing mixed type columns. Check list/arrays or just return false for everything else.
965985
if pd.api.types.infer_dtype(col_1).startswith("mixed") or pd.api.types.infer_dtype(
966986
col_2

datacompy/fugue.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,8 @@ def _any(col: str) -> int:
625625
"unequal_cnt": col["unequal_cnt"],
626626
"max_diff": col["max_diff"],
627627
"null_diff": col["null_diff"],
628+
"rel_tol": rel_tol,
629+
"abs_tol": abs_tol,
628630
}
629631
for col in column_stats
630632
if not col["all_match"]

datacompy/polars.py

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@
3131

3232
from datacompy.base import (
3333
BaseCompare,
34+
_validate_tolerance_parameter,
3435
df_to_str,
36+
get_column_tolerance,
3537
render,
3638
save_html_report,
3739
temp_column_name,
@@ -59,10 +61,13 @@ class PolarsCompare(BaseCompare):
5961
join_columns : list or str
6062
Column(s) to join dataframes on. If a string is passed in, that one
6163
column will be used.
62-
abs_tol : float, optional
63-
Absolute tolerance between two values.
64-
rel_tol : float, optional
65-
Relative tolerance between two values.
64+
abs_tol : float or dict, optional
65+
Absolute tolerance between two values. Can be either a float value applied to all columns,
66+
or a dictionary mapping column names to specific tolerance values. The special key "default"
67+
in the dictionary specifies the tolerance for columns not explicitly listed.
68+
rel_tol : float or dict, optional
69+
Relative tolerance between two values. Can be either a float value applied to all columns,
70+
or a dictionary mapping column names to specific tolerance values. The special key "default"
in the dictionary specifies the tolerance for columns not explicitly listed.
6671
df1_name : str, optional
6772
A string name for the first dataframe. This allows the reporting to
6873
print out an actual name instead of "df1", and allows human users to
@@ -90,8 +95,8 @@ def __init__(
9095
df1: pl.DataFrame,
9196
df2: pl.DataFrame,
9297
join_columns: List[str] | str,
93-
abs_tol: float = 0,
94-
rel_tol: float = 0,
98+
abs_tol: float | Dict[str, float] = 0,
99+
rel_tol: float | Dict[str, float] = 0,
95100
df1_name: str = "df1",
96101
df2_name: str = "df2",
97102
ignore_spaces: bool = False,
@@ -100,6 +105,14 @@ def __init__(
100105
) -> None:
101106
self.cast_column_names_lower = cast_column_names_lower
102107

108+
# Validate tolerance parameters first
109+
self._abs_tol_dict = _validate_tolerance_parameter(
110+
abs_tol, "abs_tol", "lower" if cast_column_names_lower else "preserve"
111+
)
112+
self._rel_tol_dict = _validate_tolerance_parameter(
113+
rel_tol, "rel_tol", "lower" if cast_column_names_lower else "preserve"
114+
)
115+
103116
if isinstance(join_columns, str):
104117
self.join_columns = [
105118
str(join_columns).lower()
@@ -371,12 +384,12 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
371384
col_match = column + "_match"
372385
self.intersect_rows = self.intersect_rows.with_columns(
373386
columns_equal(
374-
self.intersect_rows[col_1],
375-
self.intersect_rows[col_2],
376-
self.rel_tol,
377-
self.abs_tol,
378-
ignore_spaces,
379-
ignore_case,
387+
col_1=self.intersect_rows[col_1],
388+
col_2=self.intersect_rows[col_2],
389+
rel_tol=get_column_tolerance(column, self._rel_tol_dict),
390+
abs_tol=get_column_tolerance(column, self._abs_tol_dict),
391+
ignore_spaces=ignore_spaces,
392+
ignore_case=ignore_case,
380393
).alias(col_match)
381394
)
382395
match_cnt = self.intersect_rows[col_match].sum()
@@ -409,6 +422,8 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
409422
),
410423
"max_diff": max_diff,
411424
"null_diff": null_diff,
425+
"rel_tol": get_column_tolerance(column, self._rel_tol_dict),
426+
"abs_tol": get_column_tolerance(column, self._abs_tol_dict),
412427
}
413428
)
414429

@@ -588,12 +603,12 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> pl.DataFrame:
588603
orig_col_name = col[:-6]
589604

590605
col_comparison = columns_equal(
591-
self.intersect_rows[orig_col_name + "_" + self.df1_name],
592-
self.intersect_rows[orig_col_name + "_" + self.df2_name],
593-
self.rel_tol,
594-
self.abs_tol,
595-
self.ignore_spaces,
596-
self.ignore_case,
606+
col_1=self.intersect_rows[orig_col_name + "_" + self.df1_name],
607+
col_2=self.intersect_rows[orig_col_name + "_" + self.df2_name],
608+
rel_tol=get_column_tolerance(orig_col_name, self._rel_tol_dict),
609+
abs_tol=get_column_tolerance(orig_col_name, self._abs_tol_dict),
610+
ignore_spaces=self.ignore_spaces,
611+
ignore_case=self.ignore_case,
597612
)
598613

599614
if not ignore_matching_cols or (
@@ -717,6 +732,8 @@ def _get_mismatch_stats(self, sample_count: int) -> dict:
717732
"unequal_cnt": column["unequal_cnt"],
718733
"max_diff": column["max_diff"],
719734
"null_diff": column["null_diff"],
735+
"rel_tol": column["rel_tol"],
736+
"abs_tol": column["abs_tol"],
720737
}
721738
)
722739
if column["unequal_cnt"] > 0:
@@ -768,7 +785,9 @@ def _get_unique_rows_data(self, sample_count: int, column_count: int) -> dict:
768785
"df1_unique_rows": {
769786
"has_rows": min_sample_count_df1 > 0,
770787
"rows": df_to_str(
771-
self.df1_unq_rows[:, :min_column_count_df1],
788+
self.df1_unq_rows.select(
789+
self.df1_unq_rows.columns[:min_column_count_df1]
790+
),
772791
sample_count=min_sample_count_df1,
773792
)
774793
if self.df1_unq_rows.shape[0] > 0
@@ -780,7 +799,9 @@ def _get_unique_rows_data(self, sample_count: int, column_count: int) -> dict:
780799
"df2_unique_rows": {
781800
"has_rows": min_sample_count_df2 > 0,
782801
"rows": df_to_str(
783-
self.df2_unq_rows[:, :min_column_count_df2],
802+
self.df2_unq_rows.select(
803+
self.df2_unq_rows.columns[:min_column_count_df2]
804+
),
784805
sample_count=min_sample_count_df2,
785806
)
786807
if self.df2_unq_rows.shape[0] > 0

0 commit comments

Comments
 (0)