Skip to content

Conversation

nameexhaustion
Copy link
Collaborator

@nameexhaustion nameexhaustion commented Jun 23, 2025

This PR introduces logic to identify columns that are guaranteed to be non-NULL after applying a filter expression, and then uses that information to potentially re-write the join type. Once the join type has been re-written, the filters can be pushed past the join.

# Possible re-writes:
FULL -> (LEFT | RIGHT | INNER)
LEFT -> INNER
RIGHT -> INNER

Example: Full -> Right join

lhs = pl.LazyFrame(
    {"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, None], "c": ["a", "b", "c", "d", "e"]}
)
rhs = pl.LazyFrame(
    {"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, None, 5], "c": ["A", "B", "C", "D", "E"]}
)

q = lhs.join(rhs, left_on="a", right_on="b", how="full", coalesce=True).filter(
    pl.col("c_right") <= "B", pl.col("a") <= 2
)
print(q.explain())

### Before
# FILTER [([(col("c_right")) <= ("B")]) & ([(col("a")) <= (2)])]
# FROM
#   FULL JOIN:
#   LEFT PLAN ON: [col("a")]
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   RIGHT PLAN ON: [col("b")]
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   END FULL JOIN
### After
# SELECT [col("b_right").alias("a"), col("b"), col("c"), col("a").alias("a_right"), col("c_right")]
#   RIGHT JOIN:
#   LEFT PLAN ON: [col("a")]
#     FILTER [(col("a")) <= (2)]
#     FROM
#       DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   RIGHT PLAN ON: [col("b")]
#     FILTER [([(col("c")) <= ("B")]) & ([(col("b")) <= (2)])]
#     FROM
#       DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   END RIGHT JOIN

Example

lhs = pl.LazyFrame(
    {
        "foo": [1, 2, 3],
        "bar": [6.0, 7.0, 8.0],
        "ham": ["a", "b", "c"],
    }
)

rhs = pl.LazyFrame(
    {
        "apple": ["x", "y", "z"],
        "ham": ["a", "b", "d"],
        "bar": ["a", "b", "c"],
        "foo2": [1, 2, 3],
    }
)

print(
    lhs.join(rhs, how="left", on="ham")
    .filter(pl.col.ham == "a", pl.col.apple == "x", pl.col.foo == "3")
    .explain()
)
  • Release 1.31.0
# FILTER [([(col("ham")) == ("a")]) & ([(col("apple")) == ("x")])]
# FROM
#   LEFT JOIN:
#   LEFT PLAN ON: [col("ham")]
#     FILTER [(col("foo")) == ("3")]
#     FROM
#       DF ["foo", "bar", "ham"]; PROJECT */3 COLUMNS
#   RIGHT PLAN ON: [col("ham")]
#     DF ["apple", "ham", "bar", "foo2"]; PROJECT */4 COLUMNS
#   END LEFT JOIN
# FILTER [(col("apple")) == ("x")]
# FROM
#   LEFT JOIN:
#   LEFT PLAN ON: [col("ham")]
#     FILTER [([(col("foo")) == ("3")]) & ([(col("ham")) == ("a")])]
#     FROM
#       DF ["foo", "bar", "ham"]; PROJECT */3 COLUMNS
#   RIGHT PLAN ON: [col("ham")]
#     FILTER [(col("ham")) == ("a")]
#     FROM
#       DF ["apple", "ham", "bar", "foo2"]; PROJECT */4 COLUMNS
#   END LEFT JOIN
  • After this PR
# INNER JOIN:
# LEFT PLAN ON: [col("ham")]
#   FILTER [([(col("foo")) == ("3")]) & ([(col("ham")) == ("a")])]
#   FROM
#     DF ["foo", "bar", "ham"]; PROJECT */3 COLUMNS
# RIGHT PLAN ON: [col("ham")]
#   FILTER [([(col("apple")) == ("x")]) & ([(col("ham")) == ("a")])]
#   FROM
#     DF ["apple", "ham", "bar", "foo2"]; PROJECT */4 COLUMNS
# END INNER JOIN

More examples

Full -> Left join
q = lhs.join(rhs, left_on="a", right_on="b", how="full", coalesce=True).filter(
    pl.col("b") <= 3
)
print(q.explain())

### Before
# FILTER [(col("b")) <= (3)]
# FROM
#   FULL JOIN:
#   LEFT PLAN ON: [col("a")]
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   RIGHT PLAN ON: [col("b")]
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   END FULL JOIN
### After
# LEFT JOIN:
# LEFT PLAN ON: [col("a")]
#   FILTER [(col("b")) <= (3)]
#   FROM
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
# RIGHT PLAN ON: [col("b")]
#   DF ["a", "b", "c"]; PROJECT */3 COLUMNS
# END LEFT JOIN
Full -> Inner join
q = lhs.join(rhs, left_on="a", right_on="b", how="full", coalesce=True).filter(
    pl.col("b") <= 3, pl.col("c_right") <= "B"
)
print(q.explain())

### Before
# FILTER [([(col("b")) <= (3)]) & ([(col("c_right")) <= ("B")])]
# FROM
#   FULL JOIN:
#   LEFT PLAN ON: [col("a")]
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   RIGHT PLAN ON: [col("b")]
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   END FULL JOIN
### After
# INNER JOIN:
# LEFT PLAN ON: [col("a")]
#   FILTER [(col("b")) <= (3)]
#   FROM
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
# RIGHT PLAN ON: [col("b")]
#   FILTER [(col("c")) <= ("B")]
#   FROM
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
# END INNER JOIN
Left -> Inner join
q = lhs.join(rhs, left_on="a", right_on="b", how="left", coalesce=True).filter(
    pl.col("c_right") <= "B"
)
print(q.explain())

### Before
# FILTER [(col("c_right")) <= ("B")]
# FROM
#   LEFT JOIN:
#   LEFT PLAN ON: [col("a")]
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   RIGHT PLAN ON: [col("b")]
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   END LEFT JOIN
### After
# INNER JOIN:
# LEFT PLAN ON: [col("a")]
#   DF ["a", "b", "c"]; PROJECT */3 COLUMNS
# RIGHT PLAN ON: [col("b")]
#   FILTER [(col("c")) <= ("B")]
#   FROM
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
# END INNER JOIN
Right -> Inner join
q = lhs.join(rhs, left_on="a", right_on="b", how="right", coalesce=True).filter(
    pl.col("c") <= "B"
)
print(q.explain())

### Before
# FILTER [(col("c")) <= ("B")]
# FROM
#   RIGHT JOIN:
#   LEFT PLAN ON: [col("a")]
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   RIGHT PLAN ON: [col("b")]
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   END RIGHT JOIN
### After
# SELECT [col("b"), col("c"), col("a_right").alias("a"), col("a").alias("b_right"), col("c_right")]
#   INNER JOIN:
#   LEFT PLAN ON: [col("a")]
#     FILTER [(col("c")) <= ("B")]
#     FROM
#       DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   RIGHT PLAN ON: [col("b")]
#     DF ["a", "b", "c"]; PROJECT */3 COLUMNS
#   END INNER JOIN

@github-actions github-actions bot added performance Performance issues or improvements python Related to Python Polars rust Related to Rust Polars labels Jun 23, 2025
Copy link

codecov bot commented Jun 23, 2025

Codecov Report

Attention: Patch coverage is 93.05556% with 20 lines in your changes missing coverage. Please review.

Project coverage is 80.74%. Comparing base (0f615b2) to head (8562d84).
Report is 7 commits behind head on main.

Files with missing lines Patch % Lines
...lan/src/plans/optimizer/predicate_pushdown/join.rs 95.14% 12 Missing ⚠️
crates/polars-plan/src/plans/aexpr/properties.rs 80.00% 8 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##             main   #23275      +/-   ##
==========================================
- Coverage   80.75%   80.74%   -0.01%     
==========================================
  Files        1643     1644       +1     
  Lines      220327   220892     +565     
  Branches     2786     2786              
==========================================
+ Hits       177919   178355     +436     
- Misses      41744    41873     +129     
  Partials      664      664              

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@nameexhaustion nameexhaustion changed the title perf: Re-write join types to allow pushing more filters perf: Re-write join types during filter pushdown Jun 23, 2025
// Projects the new join output table back into the original join output table.
let mut project_to_original: Option<Vec<ExprIR>> = None;

if options.args.should_coalesce() {
Copy link
Collaborator Author

@nameexhaustion nameexhaustion Jun 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The few hundred lines of code in this branch are there specifically to handle the differing output schema of coalescing right-joins 🥲.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We want to extract coalescing into a separate step in the query plan rather than keeping it as a join property. That would be nice for this code as well.

@@ -652,9 +652,6 @@ def test_predicate_pushdown_join_19772(

q = left.join(right, on="k", how=join_type).filter(predicate) # type: ignore[arg-type]

plan = q.explain()
assert plan.startswith("FILTER")
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The filter is now pushed down.

@nameexhaustion nameexhaustion marked this pull request as ready for review June 23, 2025 11:35
@ritchie46 ritchie46 merged commit f250201 into pola-rs:main Jun 24, 2025
30 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
performance Performance issues or improvements python Related to Python Polars rust Related to Rust Polars
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Filter on join column not pushed down
3 participants