Skip to content

Commit 10eeaed

Browse files
authored
Merge pull request #421 from capitalone/develop
Release v0.16.8
2 parents ee838fa + ac940d4 commit 10eeaed

File tree

7 files changed

+375
-54
lines changed

7 files changed

+375
-54
lines changed

.github/workflows/test-package.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,16 +34,16 @@ jobs:
3434
fail-fast: false
3535
matrix:
3636
python-version: ['3.10', '3.11', '3.12']
37-
spark-version: [3.2.4, 3.3.4, 3.4.2, 3.5.1]
38-
pandas-version: [2.2.3, 1.5.3]
39-
numpy-version: [2.1.2, 1.26.4]
37+
spark-version: [3.2.4, 3.3.4, 3.4.4, 3.5.6]
38+
pandas-version: [2.3.0, 1.5.3]
39+
numpy-version: [2.2.6, 1.26.4]
4040
exclude:
4141
- python-version: '3.11'
4242
spark-version: 3.2.4
4343
- python-version: '3.11'
4444
spark-version: 3.3.4
4545
- pandas-version: 1.5.3
46-
numpy-version: 2.1.2
46+
numpy-version: 2.2.6
4747
env:
4848
PYTHON_VERSION: ${{ matrix.python-version }}
4949
SPARK_VERSION: ${{ matrix.spark-version }}

datacompy/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
Then extended to carry that functionality over to Spark Dataframes.
1919
"""
2020

21-
__version__ = "0.16.7"
21+
__version__ = "0.16.8"
2222

2323
import platform
2424
from warnings import warn

datacompy/core.py

Lines changed: 48 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,9 @@ class Compare(BaseCompare):
6666
A string name for the second dataframe
6767
ignore_spaces : bool, optional
6868
Flag to strip whitespace (including newlines) from string columns (including any join
69-
columns)
69+
columns). Excludes categoricals.
7070
ignore_case : bool, optional
71-
Flag to ignore the case of string columns
71+
Flag to ignore the case of string columns. Excludes categoricals.
7272
cast_column_names_lower: bool, optional
7373
Boolean indicator that controls whether column names will be cast into lower case
7474
@@ -285,16 +285,13 @@ def _dataframe_merge(self, ignore_spaces: bool) -> None:
285285
else:
286286
params = {"on": self.join_columns}
287287

288-
if ignore_spaces:
289-
for column in self.join_columns:
290-
if self.df1[column].dtype.kind == "O" and pd.api.types.is_string_dtype(
291-
self.df1[column]
292-
):
293-
self.df1[column] = self.df1[column].str.strip()
294-
if self.df2[column].dtype.kind == "O" and pd.api.types.is_string_dtype(
295-
self.df2[column]
296-
):
297-
self.df2[column] = self.df2[column].str.strip()
288+
for column in self.join_columns:
289+
self.df1[column] = normalize_string_column(
290+
self.df1[column], ignore_spaces=ignore_spaces, ignore_case=False
291+
)
292+
self.df2[column] = normalize_string_column(
293+
self.df2[column], ignore_spaces=ignore_spaces, ignore_case=False
294+
)
298295

299296
outer_join = self.df1.merge(
300297
self.df2,
@@ -870,16 +867,13 @@ def columns_equal(
870867
"""
871868
default_value = "DATACOMPY_NULL"
872869
compare: pd.Series[bool]
873-
if ignore_spaces:
874-
if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
875-
col_1 = col_1.str.strip()
876-
if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
877-
col_2 = col_2.str.strip()
878-
if ignore_case:
879-
if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
880-
col_1 = col_1.str.upper()
881-
if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
882-
col_2 = col_2.str.upper()
870+
871+
col_1 = normalize_string_column(
872+
col_1, ignore_spaces=ignore_spaces, ignore_case=ignore_case
873+
)
874+
col_2 = normalize_string_column(
875+
col_2, ignore_spaces=ignore_spaces, ignore_case=ignore_case
876+
)
883877

884878
# short circuit if comparing mixed type columns. Check list/arrays or just return false for everything else.
885879
if pd.api.types.infer_dtype(col_1).startswith("mixed") or pd.api.types.infer_dtype(
@@ -1054,3 +1048,35 @@ def generate_id_within_group(
10541048
)
10551049
else:
10561050
return dataframe[join_columns].groupby(join_columns).cumcount()
1051+
1052+
1053+
def normalize_string_column(
    column: pd.Series, ignore_spaces: bool, ignore_case: bool
) -> pd.Series:
    """Normalize a string column by stripping whitespace and/or upper-casing it.

    Parameters
    ----------
    column : pd.Series
        The column to normalize
    ignore_spaces : bool
        Whether to strip leading/trailing whitespace from the values
    ignore_case : bool
        Whether to convert the values to upper case

    Returns
    -------
    pd.Series
        The normalized column. The input object itself is returned when no
        normalization is requested or the column is not a string column.

    Notes
    -----
    Will not operate on categorical columns.
    """
    # Nothing to do: return before any dtype inspection. ``infer_dtype``
    # looks at the column's values, which is wasted work when neither
    # normalization was requested.
    if not (ignore_spaces or ignore_case):
        return column

    # Either an object column whose values infer as strings, or a
    # string-typed column that is not categorical.
    is_string_column = (
        column.dtype.kind == "O" and pd.api.types.infer_dtype(column) == "string"
    ) or (
        pd.api.types.is_string_dtype(column)
        and not isinstance(column.dtype, pd.CategoricalDtype)
    )
    if not is_string_column:
        return column

    if ignore_spaces:
        column = column.str.strip()
    if ignore_case:
        column = column.str.upper()
    return column

datacompy/polars.py

Lines changed: 58 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,9 @@ class PolarsCompare(BaseCompare):
6666
A string name for the second dataframe
6767
ignore_spaces : bool, optional
6868
Flag to strip whitespace (including newlines) from string columns (including any join
69-
columns)
69+
columns). Excludes categoricals.
7070
ignore_case : bool, optional
71-
Flag to ignore the case of string columns
71+
Flag to ignore the case of string columns. Excludes categoricals.
7272
cast_column_names_lower: bool, optional
7373
Boolean indicator that controls whether column names will be cast into lower case
7474
@@ -233,10 +233,19 @@ def intersect_columns(self) -> OrderedSet[str]:
233233
return OrderedSet(self.df1.columns) & OrderedSet(self.df2.columns)
234234

235235
def _dataframe_merge(self, ignore_spaces: bool) -> None:
236-
"""Merge df1 to df2 on the join columns.
236+
"""Perform an outer join between two dataframes and categorize rows into unique and intersecting groups based on the join columns.
237237
238-
To get df1 - df2, df2 - df1
239-
and df1 & df2.
238+
Parameters
239+
----------
240+
ignore_spaces : bool
241+
If True, normalizes string columns by ignoring spaces during the join operation.
242+
243+
Notes
244+
-----
245+
- Temporary columns may be added to the dataframes during processing
246+
and are cleaned up before final output.
247+
- The method assumes that `self.df1`, `self.df2`, and `self.join_columns`
248+
are properly initialized before calling this method.
240249
"""
241250
params: Dict[str, Any]
242251
LOG.debug("Outer joining")
@@ -261,10 +270,16 @@ def _dataframe_merge(self, ignore_spaces: bool) -> None:
261270

262271
if ignore_spaces:
263272
for column in self.join_columns:
264-
if str(df1[column].dtype) in STRING_TYPE:
265-
df1 = df1.with_columns(pl.col(column).str.strip_chars())
266-
if str(df2[column].dtype) in STRING_TYPE:
267-
df2 = df2.with_columns(pl.col(column).str.strip_chars())
273+
df1 = df1.with_columns(
274+
normalize_string_column(
275+
df1[column], ignore_spaces=ignore_spaces, ignore_case=False
276+
)
277+
)
278+
df2 = df2.with_columns(
279+
normalize_string_column(
280+
df2[column], ignore_spaces=ignore_spaces, ignore_case=False
281+
)
282+
)
268283

269284
df1_non_join_columns = OrderedSet(df1.columns) - OrderedSet(temp_join_columns)
270285
df2_non_join_columns = OrderedSet(df2.columns) - OrderedSet(temp_join_columns)
@@ -841,17 +856,8 @@ def columns_equal(
841856
"""
842857
compare: pl.Series
843858

844-
if ignore_spaces:
845-
if str(col_1.dtype) in STRING_TYPE:
846-
col_1 = col_1.str.strip_chars()
847-
if str(col_2.dtype) in STRING_TYPE:
848-
col_2 = col_2.str.strip_chars()
849-
850-
if ignore_case:
851-
if str(col_1.dtype) in STRING_TYPE:
852-
col_1 = col_1.str.to_uppercase()
853-
if str(col_2.dtype) in STRING_TYPE:
854-
col_2 = col_2.str.to_uppercase()
859+
col_1 = normalize_string_column(col_1, ignore_spaces, ignore_case)
860+
col_2 = normalize_string_column(col_2, ignore_spaces, ignore_case)
855861

856862
if (
857863
(str(col_1.dtype) in STRING_TYPE and str(col_2.dtype) in STRING_TYPE)
@@ -895,8 +901,7 @@ def columns_equal(
895901
def compare_string_and_date_columns(col_1: pl.Series, col_2: pl.Series) -> pl.Series:
896902
"""Compare a string column and date column, value-wise.
897903
898-
This tries to
899-
convert a string column to a date column and compare that way.
904+
This tries to convert a string column to a date column and compare that way.
900905
901906
Parameters
902907
----------
@@ -1021,3 +1026,34 @@ def generate_id_within_group(
10211026
return dataframe.select(
10221027
rn=pl.col(dataframe.columns[0]).cum_count().over(join_columns)
10231028
).to_series()
1029+
1030+
1031+
def normalize_string_column(
    column: pl.Series, ignore_spaces: bool, ignore_case: bool
) -> pl.Series:
    """Strip whitespace from and/or upper-case a string column.

    Parameters
    ----------
    column : pl.Series
        The column to normalize
    ignore_spaces : bool
        When true, apply ``str.strip_chars()`` to the column
    ignore_case : bool
        When true, apply ``str.to_uppercase()`` to the column

    Returns
    -------
    pl.Series
        The normalized column, or the input unchanged when its base dtype
        is not listed in ``STRING_TYPE``

    Notes
    -----
    Only columns whose base dtype name appears in ``STRING_TYPE`` are
    modified; per the original notes this is meant to leave categorical
    columns untouched.
    """
    # Guard clause: anything that is not a recognised string dtype passes through.
    if str(column.dtype.base_type()) not in STRING_TYPE:
        return column

    normalized = column
    if ignore_spaces:
        normalized = normalized.str.strip_chars()
    if ignore_case:
        normalized = normalized.str.to_uppercase()
    return normalized

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ maintainers = [
1313
{ name="Raymond Haffar", email="[email protected]" },
1414
]
1515
license = {text = "Apache Software License"}
16-
dependencies = ["pandas<=2.2.3,>=0.25.0", "numpy<=2.2.5,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "polars[pandas]<=1.27.1,>=0.20.4"]
16+
dependencies = ["pandas<=2.3.0,>=0.25.0", "numpy<=2.2.6,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "polars[pandas]<=1.31.0,>=0.20.4"]
1717
requires-python = ">=3.10.0"
1818
classifiers = [
1919
"Intended Audience :: Developers",
@@ -56,7 +56,7 @@ python-tag = "py3"
5656

5757
[project.optional-dependencies]
5858
fugue = ["fugue[dask,duckdb,ray]<=0.9.1,>=0.8.7"]
59-
spark = ["pyspark[connect]>=3.1.1; python_version < \"3.11\"", "pyspark[connect]>=3.4; python_version >= \"3.11\""]
59+
spark = ["pyspark[connect]>=3.1.1,<=3.5.6; python_version < \"3.11\"", "pyspark[connect]>=3.4,<=3.5.6; python_version >= \"3.11\""]
6060
snowflake = ["snowflake-connector-python", "snowflake-snowpark-python"]
6161
docs = ["sphinx", "furo", "myst-parser"]
6262
tests = ["pytest", "pytest-cov"]

0 commit comments

Comments
 (0)