Merge pull request #421 from capitalone/develop

fdosani · web-flow · commit 10eeaed98c9d · 2025-06-20T17:12:46.000-03:00
Release v0.16.8
diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml
@@ -34,16 +34,16 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ['3.10', '3.11', '3.12']
-        spark-version: [3.2.4, 3.3.4, 3.4.2, 3.5.1]
-        pandas-version: [2.2.3, 1.5.3]
-        numpy-version: [2.1.2, 1.26.4]
+        spark-version: [3.2.4, 3.3.4, 3.4.4, 3.5.6]
+        pandas-version: [2.3.0, 1.5.3]
+        numpy-version: [2.2.6, 1.26.4]
         exclude:
           - python-version: '3.11'
             spark-version: 3.2.4
           - python-version: '3.11'
             spark-version: 3.3.4
           - pandas-version: 1.5.3
-            numpy-version: 2.1.2
+            numpy-version: 2.2.6
     env:
       PYTHON_VERSION: ${{ matrix.python-version }}
       SPARK_VERSION: ${{ matrix.spark-version }}
diff --git a/datacompy/__init__.py b/datacompy/__init__.py
@@ -18,7 +18,7 @@
 Then extended to carry that functionality over to Spark Dataframes.
 """
 
-__version__ = "0.16.7"
+__version__ = "0.16.8"
 
 import platform
 from warnings import warn
diff --git a/datacompy/core.py b/datacompy/core.py
@@ -66,9 +66,9 @@ class Compare(BaseCompare):
         A string name for the second dataframe
     ignore_spaces : bool, optional
         Flag to strip whitespace (including newlines) from string columns (including any join
-        columns)
+        columns). Excludes categoricals.
     ignore_case : bool, optional
-        Flag to ignore the case of string columns
+        Flag to ignore the case of string columns. Excludes categoricals.
     cast_column_names_lower: bool, optional
         Boolean indicator that controls of column names will be cast into lower case
 
@@ -285,16 +285,13 @@ def _dataframe_merge(self, ignore_spaces: bool) -> None:
         else:
             params = {"on": self.join_columns}
 
-        if ignore_spaces:
-            for column in self.join_columns:
-                if self.df1[column].dtype.kind == "O" and pd.api.types.is_string_dtype(
-                    self.df1[column]
-                ):
-                    self.df1[column] = self.df1[column].str.strip()
-                if self.df2[column].dtype.kind == "O" and pd.api.types.is_string_dtype(
-                    self.df2[column]
-                ):
-                    self.df2[column] = self.df2[column].str.strip()
+        for column in self.join_columns:
+            self.df1[column] = normalize_string_column(
+                self.df1[column], ignore_spaces=ignore_spaces, ignore_case=False
+            )
+            self.df2[column] = normalize_string_column(
+                self.df2[column], ignore_spaces=ignore_spaces, ignore_case=False
+            )
 
         outer_join = self.df1.merge(
             self.df2,
@@ -870,16 +867,13 @@ def columns_equal(
     """
     default_value = "DATACOMPY_NULL"
     compare: pd.Series[bool]
-    if ignore_spaces:
-        if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
-            col_1 = col_1.str.strip()
-        if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
-            col_2 = col_2.str.strip()
-    if ignore_case:
-        if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
-            col_1 = col_1.str.upper()
-        if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
-            col_2 = col_2.str.upper()
+
+    col_1 = normalize_string_column(
+        col_1, ignore_spaces=ignore_spaces, ignore_case=ignore_case
+    )
+    col_2 = normalize_string_column(
+        col_2, ignore_spaces=ignore_spaces, ignore_case=ignore_case
+    )
 
     # short circuit if comparing mixed type columns. Check list/arrrays or just return false for everything else.
     if pd.api.types.infer_dtype(col_1).startswith("mixed") or pd.api.types.infer_dtype(
@@ -1054,3 +1048,35 @@ def generate_id_within_group(
         )
     else:
         return dataframe[join_columns].groupby(join_columns).cumcount()
+
+
+def normalize_string_column(
+    column: pd.Series, ignore_spaces: bool, ignore_case: bool
+) -> pd.Series:
+    """Normalize a string column by converting to upper case and stripping whitespace.
+
+    Parameters
+    ----------
+    column : pd.Series
+        The column to normalize
+    ignore_spaces : bool
+        Whether to strip leading and trailing whitespace from the values
+    ignore_case : bool
+        Whether to convert the values to upper case so comparisons ignore case
+
+    Returns
+    -------
+    pd.Series
+        The normalized column
+
+    Notes
+    -----
+    Will not operate on categorical columns.
+    """
+    if (column.dtype.kind == "O" and pd.api.types.infer_dtype(column) == "string") or (
+        pd.api.types.is_string_dtype(column)
+        and not isinstance(column.dtype, pd.CategoricalDtype)
+    ):
+        column = column.str.strip() if ignore_spaces else column
+        column = column.str.upper() if ignore_case else column
+    return column
diff --git a/datacompy/polars.py b/datacompy/polars.py
@@ -66,9 +66,9 @@ class PolarsCompare(BaseCompare):
         A string name for the second dataframe
     ignore_spaces : bool, optional
         Flag to strip whitespace (including newlines) from string columns (including any join
-        columns)
+        columns). Excludes categoricals.
     ignore_case : bool, optional
-        Flag to ignore the case of string columns
+        Flag to ignore the case of string columns. Excludes categoricals.
     cast_column_names_lower: bool, optional
         Boolean indicator that controls of column names will be cast into lower case
 
@@ -233,10 +233,19 @@ def intersect_columns(self) -> OrderedSet[str]:
         return OrderedSet(self.df1.columns) & OrderedSet(self.df2.columns)
 
     def _dataframe_merge(self, ignore_spaces: bool) -> None:
-        """Merge df1 to df2 on the join columns.
+        """Perform an outer join between two dataframes and categorize rows into unique and intersecting groups based on the join columns.
 
-        To get df1 - df2, df2 - df1
-        and df1 & df2.
+        Parameters
+        ----------
+        ignore_spaces : bool
+            If True, strips leading and trailing whitespace from string join columns before joining.
+
+        Notes
+        -----
+        - Temporary columns may be added to the dataframes during processing
+          and are cleaned up before final output.
+        - The method assumes that `self.df1`, `self.df2`, and `self.join_columns`
+          are properly initialized before calling this method.
         """
         params: Dict[str, Any]
         LOG.debug("Outer joining")
@@ -261,10 +270,16 @@ def _dataframe_merge(self, ignore_spaces: bool) -> None:
 
         if ignore_spaces:
             for column in self.join_columns:
-                if str(df1[column].dtype) in STRING_TYPE:
-                    df1 = df1.with_columns(pl.col(column).str.strip_chars())
-                if str(df2[column].dtype) in STRING_TYPE:
-                    df2 = df2.with_columns(pl.col(column).str.strip_chars())
+                df1 = df1.with_columns(
+                    normalize_string_column(
+                        df1[column], ignore_spaces=ignore_spaces, ignore_case=False
+                    )
+                )
+                df2 = df2.with_columns(
+                    normalize_string_column(
+                        df2[column], ignore_spaces=ignore_spaces, ignore_case=False
+                    )
+                )
 
         df1_non_join_columns = OrderedSet(df1.columns) - OrderedSet(temp_join_columns)
         df2_non_join_columns = OrderedSet(df2.columns) - OrderedSet(temp_join_columns)
@@ -841,17 +856,8 @@ def columns_equal(
     """
     compare: pl.Series
 
-    if ignore_spaces:
-        if str(col_1.dtype) in STRING_TYPE:
-            col_1 = col_1.str.strip_chars()
-        if str(col_2.dtype) in STRING_TYPE:
-            col_2 = col_2.str.strip_chars()
-
-    if ignore_case:
-        if str(col_1.dtype) in STRING_TYPE:
-            col_1 = col_1.str.to_uppercase()
-        if str(col_2.dtype) in STRING_TYPE:
-            col_2 = col_2.str.to_uppercase()
+    col_1 = normalize_string_column(col_1, ignore_spaces, ignore_case)
+    col_2 = normalize_string_column(col_2, ignore_spaces, ignore_case)
 
     if (
         (str(col_1.dtype) in STRING_TYPE and str(col_2.dtype) in STRING_TYPE)
@@ -895,8 +901,7 @@ def columns_equal(
 def compare_string_and_date_columns(col_1: pl.Series, col_2: pl.Series) -> pl.Series:
     """Compare a string column and date column, value-wise.
 
-    This tries to
-    convert a string column to a date column and compare that way.
+    This tries to convert a string column to a date column and compare that way.
 
     Parameters
     ----------
@@ -1021,3 +1026,34 @@ def generate_id_within_group(
         return dataframe.select(
             rn=pl.col(dataframe.columns[0]).cum_count().over(join_columns)
         ).to_series()
+
+
+def normalize_string_column(
+    column: pl.Series, ignore_spaces: bool, ignore_case: bool
+) -> pl.Series:
+    """Normalize a string column by converting to upper case and stripping whitespace.
+
+    Parameters
+    ----------
+    column : pl.Series
+        The column to normalize
+    ignore_spaces : bool
+        Whether to strip leading and trailing whitespace from the values
+    ignore_case : bool
+        Whether to convert the values to upper case so comparisons ignore case
+
+    Returns
+    -------
+    pl.Series
+        The normalized column
+
+    Notes
+    -----
+    Will not operate on categorical columns.
+    """
+    if str(column.dtype.base_type()) in STRING_TYPE:
+        if ignore_spaces:
+            column = column.str.strip_chars()
+        if ignore_case:
+            column = column.str.to_uppercase()
+    return column
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,7 +13,7 @@ maintainers = [
   { name="Raymond Haffar", email="raymond.haffar@capitalone.com" },
 ]
 license = {text = "Apache Software License"}
-dependencies = ["pandas<=2.2.3,>=0.25.0", "numpy<=2.2.5,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "polars[pandas]<=1.27.1,>=0.20.4"]
+dependencies = ["pandas<=2.3.0,>=0.25.0", "numpy<=2.2.6,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "polars[pandas]<=1.31.0,>=0.20.4"]
 requires-python = ">=3.10.0"
 classifiers = [
     "Intended Audience :: Developers",
@@ -56,7 +56,7 @@ python-tag = "py3"
 
 [project.optional-dependencies]
 fugue = ["fugue[dask,duckdb,ray]<=0.9.1,>=0.8.7"]
-spark = ["pyspark[connect]>=3.1.1; python_version < \"3.11\"", "pyspark[connect]>=3.4; python_version >= \"3.11\""]
+spark = ["pyspark[connect]>=3.1.1,<=3.5.6; python_version < \"3.11\"", "pyspark[connect]>=3.4,<=3.5.6; python_version >= \"3.11\""]
 snowflake = ["snowflake-connector-python", "snowflake-snowpark-python"]
 docs = ["sphinx", "furo", "myst-parser"]
 tests = ["pytest", "pytest-cov"]
diff --git a/tests/test_core.py b/tests/test_core.py
diff --git a/tests/test_polars.py b/tests/test_polars.py