Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/test-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ jobs:
run: |
pytest -n auto -m "not release and not benchmark and not docs" tests/unit/io/test_multiscan.py
pytest -n auto -m "not release and not benchmark and not docs" tests/unit/io/test_scan_row_deletion.py
pytest -n auto -m "not release and not benchmark and not docs" tests/unit/io/test_iceberg.py

- name: Check import without optional dependencies
if: github.ref_name != 'main' && matrix.os == 'ubuntu-latest' && (matrix.python-version == '3.13' || matrix.python-version == '3.13t')
Expand Down
26 changes: 20 additions & 6 deletions crates/polars-core/src/schema/iceberg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::sync::Arc;

use arrow::datatypes::{ArrowDataType, ArrowSchema, Field as ArrowField};
use polars_error::{PolarsResult, feature_gated, polars_bail, polars_err};
use polars_utils::aliases::InitHashMaps;
use polars_utils::pl_str::PlSmallStr;

use crate::prelude::{DataType, Field, PlIndexMap};
Expand All @@ -30,12 +31,25 @@ impl IcebergSchema {
where
I: IntoIterator<Item = &'a ArrowField>,
{
Ok(Self(
iter.into_iter()
.map(|x| arrow_field_to_iceberg_column_rec(x, None))
.map(|x| x.map(|x| (x.physical_id, x)))
.collect::<PolarsResult<PlIndexMap<u32, IcebergColumn>>>()?,
))
let iter = iter.into_iter();
let size_hint = iter.size_hint();

let mut out = PlIndexMap::with_capacity(size_hint.1.unwrap_or(size_hint.0));

for arrow_field in iter {
let col: IcebergColumn = arrow_field_to_iceberg_column_rec(arrow_field, None)?;
let existing = out.insert(col.physical_id, col);

if let Some(existing) = existing {
polars_bail!(
Duplicate:
"IcebergSchema: duplicate physical ID {:?}",
existing,
)
}
}

Ok(Self(out))
}
}

Expand Down
2 changes: 1 addition & 1 deletion crates/polars-stream/src/nodes/io_sources/batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ impl FileReader for BatchFnReader {
args: BeginReadArgs,
) -> PolarsResult<(FileReaderOutputRecv, JoinHandle<PolarsResult<()>>)> {
let BeginReadArgs {
projected_schema: _,
projection: _,
row_index: None,
pre_slice: None,
predicate: None,
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-stream/src/nodes/io_sources/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use crate::async_executor::{AbortOnDropHandle, spawn};
use crate::async_primitives::distributor_channel::{self, distributor_channel};
use crate::morsel::SourceToken;
use crate::nodes::compute_node_prelude::*;
use crate::nodes::io_sources::multi_file_reader::reader_interface::Projection;
use crate::nodes::io_sources::multi_file_reader::reader_interface::output::FileReaderOutputSend;
use crate::nodes::{MorselSeq, TaskPriority};

Expand Down Expand Up @@ -132,7 +133,7 @@ impl FileReader for CsvFileReader {
let memslice = self.get_bytes_maybe_decompress()?;

let BeginReadArgs {
projected_schema,
projection: Projection::Plain(projected_schema),
// Because we currently only support PRE_SLICE we don't need to handle row index here.
row_index,
pre_slice,
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-stream/src/nodes/io_sources/ipc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use crate::async_primitives::linearizer::Linearizer;
use crate::morsel::{Morsel, MorselSeq, SourceToken, get_ideal_morsel_size};
use crate::nodes::io_sources::multi_file_reader::reader_interface::output::FileReaderOutputSend;
use crate::nodes::io_sources::multi_file_reader::reader_interface::{
FileReader, FileReaderCallbacks,
FileReader, FileReaderCallbacks, Projection,
};
use crate::{DEFAULT_DISTRIBUTOR_BUFFER_SIZE, DEFAULT_LINEARIZER_BUFFER_SIZE};

Expand Down Expand Up @@ -196,7 +196,7 @@ impl FileReader for IpcFileReader {
} = self.init_data.clone().unwrap();

let BeginReadArgs {
projected_schema,
projection: Projection::Plain(projected_schema),
row_index,
pre_slice: pre_slice_arg,
predicate: None,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,17 @@ use polars_core::schema::SchemaRef;
use polars_error::PolarsResult;
use polars_io::RowIndex;
use polars_io::predicates::ScanIOPredicate;
use polars_plan::dsl::ScanSource;
use polars_plan::dsl::{MissingColumnsPolicy, ScanSource};
use polars_plan::plans::hive::HivePartitionsDf;
use polars_utils::slice_enum::Slice;

use super::ExtraOperations;
use crate::nodes::io_sources::multi_file_reader::extra_ops::column_selector::{
ColumnSelector, ColumnSelectorBuilder,
};
use crate::nodes::io_sources::multi_file_reader::extra_ops::missing_column_err;
use crate::nodes::io_sources::multi_file_reader::initialization::deletion_files::ExternalFilterMask;
use crate::nodes::io_sources::multi_file_reader::reader_interface::Projection;
use crate::nodes::io_sources::multi_file_reader::row_counter::RowCounter;

/// Apply extra operations onto morsels originating from a reader. This should be initialized
Expand All @@ -28,7 +30,7 @@ pub enum ApplyExtraOps {
/// in Arc.
Uninitialized {
final_output_schema: SchemaRef,
projected_file_schema: SchemaRef,
projection: Projection,
extra_ops: ExtraOperations,
/// This is here so that we can get the include file path name if needed.
scan_source: ScanSource,
Expand All @@ -43,7 +45,8 @@ pub enum ApplyExtraOps {
/// Physical - i.e. applied before `external_filter_mask`. This is calculated in `initialize()` if needed.
physical_pre_slice: Option<Slice>,
external_filter_mask: Option<ExternalFilterMask>,
row_index: Option<RowIndex>,
/// `(_, insertion_position)`
row_index: Option<(RowIndex, usize)>,
/// This will have include_file_paths, hive columns, missing columns.
column_selectors: Option<Vec<ColumnSelector>>,
predicate: Option<ScanIOPredicate>,
Expand Down Expand Up @@ -74,15 +77,16 @@ impl ApplyExtraOps {

Uninitialized {
final_output_schema,
#[expect(unused)]
projected_file_schema, // TODO: This can maybe be removed
projection,
extra_ops:
ExtraOperations {
row_index,
row_index_col_idx,
pre_slice,
cast_columns_policy,
missing_columns_policy,
include_file_paths,
file_path_col_idx,
predicate,
},
scan_source,
Expand All @@ -95,45 +99,13 @@ impl ApplyExtraOps {
panic!("impl error: negative pre_slice at post")
}

let mut slf = Self::Initialized {
physical_pre_slice: pre_slice,
external_filter_mask,
row_index,

// Initialized below
column_selectors: None,
predicate: None,
};

let schema_before_selection = if incoming_schema.len() == final_output_schema.len()
Copy link
Collaborator Author

@nameexhaustion nameexhaustion Jul 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently the column selectors are applied after attaching the row_index column. This is prone to name collisions, as the underlying file can contain a column called "row_index" that gets renamed to a different column name.

This PR changes the ordering of operations to always apply the column selectors first before any other operation that adds/removes columns (e.g. row_index) to resolve this problem. As a result this code here is no longer needed.

{
// Incoming schema already has all of the columns, either because no extra columns were needed, or
// the extra columns were attached by the reader (which is just Parquet when it has a predicate).
incoming_schema.clone()
} else {
// We use a trick to determine our schema state before reordering by applying onto an empty DataFrame.
// This is much less error prone compared to determining it separately.
//
// This schema may contain an additional row index column.
let mut df = DataFrame::empty_with_schema(incoming_schema);
slf.apply_to_df(&mut df, RowCounter::MAX)?;
df.schema().clone()
};

let mut column_selectors = Vec::with_capacity(final_output_schema.len());
let selector_builder = ColumnSelectorBuilder {
cast_columns_policy,
missing_columns_policy,
};
// Tracks if the input already has all columns in the right order and type.
let mut is_input_passthrough =
schema_before_selection.len() == final_output_schema.len();

let file_path_col_idx = include_file_paths.as_ref().map_or(
// Default usize::MAX as it is not a valid index
usize::MAX,
|name| final_output_schema.index_of(name).unwrap(),
);
let mut is_input_passthrough = incoming_schema.len() == final_output_schema.len();

for (output_index, (output_name, output_dtype)) in
final_output_schema.iter().enumerate()
Expand All @@ -151,6 +123,21 @@ impl ApplyExtraOps {
),
),
)))
} else if output_index == row_index_col_idx {
if let Some(ri) = &row_index {
// Row index is done by us (ApplyExtraOps). Insert a placeholder column.
ColumnSelector::Constant(Box::new((
ri.name.clone(),
Scalar::null(DataType::Null),
)))
} else {
debug_assert_eq!(
incoming_schema.get(output_name),
Some(&DataType::IDX_DTYPE)
);

ColumnSelector::Position(incoming_schema.index_of(output_name).unwrap())
}
} else if let Some(hive_parts) = &hive_parts
&& let Ok(hive_column) = hive_parts.df().column(output_name)
{
Expand All @@ -161,12 +148,40 @@ impl ApplyExtraOps {
hive_column.get(scan_source_idx)?.into_static(),
),
)))
} else if let Some((mapped_projection, incoming_idx, incoming_dtype)) = (|| {
let mapped_projection =
projection.get_mapped_projection_ref_by_output_name(output_name)?;

let (incoming_idx, _, incoming_dtype) =
incoming_schema.get_full(mapped_projection.source_name)?;

Some((mapped_projection, incoming_idx, incoming_dtype))
})(
) {
debug_assert_eq!(mapped_projection.output_dtype, output_dtype);

if let Some(resolved_transform) = mapped_projection.resolved_transform {
debug_assert_eq!(resolved_transform.source_dtype, incoming_dtype);

resolved_transform
.attach_transforms(ColumnSelector::Position(incoming_idx))
} else {
selector_builder.build_column_selector(
incoming_schema,
output_name,
output_dtype,
)?
}
} else {
selector_builder.build_column_selector(
&schema_before_selection,
output_name,
output_dtype,
)?
match &selector_builder.missing_columns_policy {
MissingColumnsPolicy::Insert => ColumnSelector::Constant(Box::new((
output_name.clone(),
Scalar::null(output_dtype.clone()),
))),
MissingColumnsPolicy::Raise => {
return Err(missing_column_err(output_name));
},
}
};

is_input_passthrough &= match &selector {
Expand All @@ -183,21 +198,17 @@ impl ApplyExtraOps {
Some(column_selectors)
};

let Self::Initialized {
column_selectors: slf_column_selectors,
predicate: slf_predicate,
..
} = &mut slf
else {
unreachable!()
let out = Self::Initialized {
physical_pre_slice: pre_slice,
external_filter_mask,
row_index: row_index.map(|ri| (ri, row_index_col_idx)),
column_selectors,
predicate,
};

*slf_column_selectors = column_selectors;
*slf_predicate = predicate;

// Return a `Noop` if our initialized state does not have any operations. Downstream
// can see the `Noop` and avoid running through an extra distributor pipeline.
let slf = match slf {
let out = match out {
Initialized {
physical_pre_slice: None,
external_filter_mask: None,
Expand All @@ -206,12 +217,12 @@ impl ApplyExtraOps {
predicate: None,
} => Self::Noop,

Initialized { .. } => slf,
Initialized { .. } => out,

_ => unreachable!(),
};

Ok(slf)
Ok(out)
},
}
}
Expand Down Expand Up @@ -276,9 +287,16 @@ impl ApplyExtraOps {
local_filter_mask.filter_df(df)?;
};

if let Some(column_selectors) = column_selectors.as_deref() {
*df = column_selectors
.iter()
.map(|x| x.select_from_columns(df.get_columns(), df.height()))
.collect::<PolarsResult<DataFrame>>()?;
}

// Note: This branch is hit if we have negative slice or predicate + row index and the reader
// does not support them.
if let Some(ri) = row_index {
if let Some((ri, col_idx)) = row_index {
// Adjustment needed for `current_row_position`.
let local_offset_adjustment = RowCounter::new(
// Number of physical rows skipped in the current function
Expand All @@ -300,20 +318,11 @@ impl ApplyExtraOps {
.num_rows_idxsize_saturating()?,
);

unsafe {
df.with_column_unchecked(Column::new_row_index(
ri.name.clone(),
offset,
df.height(),
)?)
};
}
let row_index_col = Column::new_row_index(ri.name.clone(), offset, df.height())?;

if let Some(column_selectors) = column_selectors.as_deref() {
*df = column_selectors
.iter()
.map(|x| x.select_from_columns(df.get_columns(), df.height()))
.collect::<PolarsResult<DataFrame>>()?;
debug_assert_eq!(df.get_columns()[*col_idx].name(), &ri.name);

unsafe { *df.get_columns_mut().get_mut(*col_idx).unwrap() = row_index_col }
}

if let Some(predicate) = predicate {
Expand Down
Loading
Loading