Skip to content

Commit 883bebd

Browse files
nameexhaustionWashiil
authored and committed
fix: Fix errors on native scan_iceberg (pola-rs#23811)
1 parent 1a6b850 commit 883bebd

File tree

3 files changed

+99
-93
lines changed

3 files changed

+99
-93
lines changed

crates/polars-parquet/src/arrow/read/schema/convert.rs

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
//! This module has entry points, [`parquet_to_arrow_schema`] and the more configurable [`parquet_to_arrow_schema_with_options`].
2-
use arrow::datatypes::{ArrowDataType, ArrowSchema, Field, IntervalUnit, TimeUnit};
2+
use std::sync::Arc;
3+
4+
use arrow::datatypes::{ArrowDataType, ArrowSchema, Field, IntervalUnit, Metadata, TimeUnit};
5+
use polars_utils::format_pl_smallstr;
36
use polars_utils::pl_str::PlSmallStr;
47

58
use crate::arrow::read::schema::SchemaInferenceOptions;
@@ -309,11 +312,27 @@ pub(crate) fn is_nullable(field_info: &FieldInfo) -> bool {
309312
/// Returns `None` iff the parquet type has no associated primitive types,
310313
/// i.e. if it is a column-less group type.
311314
fn to_field(type_: &ParquetType, options: &SchemaInferenceOptions) -> Option<Field> {
312-
Some(Field::new(
313-
type_.get_field_info().name.clone(),
315+
let field_info = type_.get_field_info();
316+
317+
let metadata: Option<Arc<Metadata>> = field_info.id.map(|x: i32| {
318+
Arc::new(
319+
[(
320+
PlSmallStr::from_static("PARQUET:field_id"),
321+
format_pl_smallstr!("{x}"),
322+
)]
323+
.into(),
324+
)
325+
});
326+
327+
let mut arrow_field = Field::new(
328+
field_info.name.clone(),
314329
to_dtype(type_, options)?,
315330
is_nullable(type_.get_field_info()),
316-
))
331+
);
332+
333+
arrow_field.metadata = metadata;
334+
335+
Some(arrow_field)
317336
}
318337

319338
/// Converts a parquet list to arrow list.

crates/polars-plan/src/plans/optimizer/expand_datasets.rs

Lines changed: 70 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -29,66 +29,82 @@ impl OptimizationRule for ExpandDatasets {
2929
_expr_arena: &mut Arena<crate::prelude::AExpr>,
3030
node: Node,
3131
) -> PolarsResult<Option<IR>> {
32-
let ir = lp_arena.get(node);
33-
32+
// # Note
33+
// This function mutates the IR node in-place rather than returning the new IR - the
34+
// StackOptimizer will re-call this function otherwise.
3435
if let IR::Scan {
36+
sources,
3537
scan_type,
3638
unified_scan_args,
37-
..
38-
} = ir
39+
40+
file_info: _,
41+
hive_parts: _,
42+
predicate: _,
43+
output_schema: _,
44+
} = lp_arena.get_mut(node)
3945
{
4046
let projection = unified_scan_args.projection.clone();
4147
let limit = match unified_scan_args.pre_slice.clone() {
4248
Some(v @ Slice::Positive { .. }) => Some(v.end_position()),
4349
_ => None,
4450
};
4551

46-
match scan_type.as_ref() {
52+
match scan_type.as_mut() {
4753
#[cfg(feature = "python")]
4854
FileScanIR::PythonDataset {
4955
dataset_object,
5056
cached_ir,
5157
} => {
5258
let cached_ir = cached_ir.clone();
53-
5459
let mut guard = cached_ir.lock().unwrap();
5560

56-
// Note: We always get called twice in succession from the stack optimizer,
57-
// as it was designed to optimize until fixed point. Ensure we return
58-
// Ok(None) if the mutex contains the initialized state.
59-
if match guard.as_ref() {
60-
// Reject cached if limit or projection does not match. This can happen if a scan is reused.
61+
if config::verbose() {
62+
eprintln!(
63+
"expand_datasets(): python[{}]: limit: {:?}, project: {}",
64+
dataset_object.name(),
65+
limit,
66+
projection.as_ref().map_or(
67+
PlSmallStr::from_static("all"),
68+
|x| format_pl_smallstr!("{}", x.len())
69+
)
70+
)
71+
}
72+
73+
let can_use_existing = match guard.as_ref() {
6174
Some(resolved) => {
6275
let ExpandedDataset {
6376
limit: cached_limit,
6477
projection: cached_projection,
65-
resolved_ir: _,
78+
expanded_dsl: _,
6679
python_scan: _,
6780
} = resolved;
6881

6982
cached_limit == &limit && cached_projection == &projection
7083
},
7184

7285
None => false,
73-
} {
74-
return Ok(None);
75-
}
86+
};
7687

77-
if config::verbose() {
78-
eprintln!(
79-
"expand_datasets(): python[{}]: limit: {:?}, project: {}",
80-
dataset_object.name(),
88+
if !can_use_existing {
89+
let expanded_dsl =
90+
dataset_object.to_dataset_scan(limit, projection.as_deref())?;
91+
92+
*guard = Some(ExpandedDataset {
8193
limit,
82-
projection.as_ref().map_or(
83-
PlSmallStr::from_static("all"),
84-
|x| format_pl_smallstr!("{}", x.len())
85-
)
86-
)
94+
projection,
95+
expanded_dsl,
96+
python_scan: None,
97+
})
8798
}
8899

89-
let plan = dataset_object.to_dataset_scan(limit, projection.as_deref())?;
100+
let ExpandedDataset {
101+
limit: _,
102+
projection: _,
103+
expanded_dsl,
104+
python_scan,
105+
} = guard.as_mut().unwrap();
90106

91-
let (resolved_ir, python_scan) = match plan {
107+
match expanded_dsl {
92108
DslPlan::Scan {
93109
sources: resolved_sources,
94110
unified_scan_args: resolved_unified_scan_args,
@@ -97,22 +113,6 @@ impl OptimizationRule for ExpandDatasets {
97113
} => {
98114
use crate::dsl::FileScanDsl;
99115

100-
let mut ir = ir.clone();
101-
102-
let IR::Scan {
103-
sources,
104-
scan_type,
105-
unified_scan_args,
106-
107-
file_info: _,
108-
hive_parts: _,
109-
predicate: _,
110-
output_schema: _,
111-
} = &mut ir
112-
else {
113-
unreachable!()
114-
};
115-
116116
// We only want a few configuration flags from here (e.g. column casting config).
117117
// The rest we either expect to be None (e.g. projection / row_index), or ignore.
118118
let UnifiedScanArgs {
@@ -131,25 +131,26 @@ impl OptimizationRule for ExpandDatasets {
131131
include_file_paths: _include_file_paths @ None,
132132
deletion_files,
133133
column_mapping,
134-
} = *resolved_unified_scan_args
134+
} = resolved_unified_scan_args.as_ref()
135135
else {
136136
panic!(
137137
"invalid scan args from python dataset resolve: {:?}",
138138
&resolved_unified_scan_args
139139
)
140140
};
141141

142-
unified_scan_args.cloud_options = cloud_options;
143-
unified_scan_args.rechunk = rechunk;
144-
unified_scan_args.cache = cache;
145-
unified_scan_args.cast_columns_policy = cast_columns_policy;
146-
unified_scan_args.missing_columns_policy = missing_columns_policy;
147-
unified_scan_args.extra_columns_policy = extra_columns_policy;
148-
unified_scan_args.deletion_files = deletion_files;
149-
unified_scan_args.column_mapping = column_mapping;
150-
151-
*sources = resolved_sources;
152-
*scan_type = Box::new(match *resolved_scan_type {
142+
unified_scan_args.cloud_options = cloud_options.clone();
143+
unified_scan_args.rechunk = *rechunk;
144+
unified_scan_args.cache = *cache;
145+
unified_scan_args.cast_columns_policy = cast_columns_policy.clone();
146+
unified_scan_args.missing_columns_policy = *missing_columns_policy;
147+
unified_scan_args.extra_columns_policy = *extra_columns_policy;
148+
unified_scan_args.deletion_files = deletion_files.clone();
149+
unified_scan_args.column_mapping = column_mapping.clone();
150+
151+
*sources = resolved_sources.clone();
152+
153+
*scan_type = Box::new(match *resolved_scan_type.clone() {
153154
#[cfg(feature = "csv")]
154155
FileScanDsl::Csv { options } => FileScanIR::Csv { options },
155156

@@ -182,18 +183,15 @@ impl OptimizationRule for ExpandDatasets {
182183
file_info: _,
183184
} => FileScanIR::Anonymous { options, function },
184185
});
185-
186-
(ir, None)
187186
},
188187

189-
DslPlan::PythonScan { options } => (
190-
ir.clone(),
191-
Some(ExpandedPythonScan {
188+
DslPlan::PythonScan { options } => {
189+
*python_scan = Some(ExpandedPythonScan {
192190
name: dataset_object.name(),
193-
scan_fn: options.scan_fn.unwrap(),
194-
variant: options.python_source,
195-
}),
196-
),
191+
scan_fn: options.scan_fn.clone().unwrap(),
192+
variant: options.python_source.clone(),
193+
})
194+
},
197195

198196
dsl => {
199197
polars_bail!(
@@ -203,24 +201,12 @@ impl OptimizationRule for ExpandDatasets {
203201
)
204202
},
205203
};
206-
207-
let resolved = ExpandedDataset {
208-
limit,
209-
projection,
210-
resolved_ir,
211-
python_scan,
212-
};
213-
214-
*guard = Some(resolved);
215-
216-
let resolved_ir = guard.as_ref().map(|x| x.resolved_ir.clone()).unwrap();
217-
218-
return Ok(Some(resolved_ir));
219204
},
220205

221206
_ => {},
222207
}
223208
}
209+
224210
Ok(None)
225211
}
226212
}
@@ -229,7 +215,7 @@ impl OptimizationRule for ExpandDatasets {
229215
pub struct ExpandedDataset {
230216
limit: Option<usize>,
231217
projection: Option<Arc<[PlSmallStr]>>,
232-
resolved_ir: IR,
218+
expanded_dsl: DslPlan,
233219

234220
/// Fallback python scan
235221
#[cfg(feature = "python")]
@@ -256,7 +242,7 @@ impl Debug for ExpandedDataset {
256242
let ExpandedDataset {
257243
limit,
258244
projection,
259-
resolved_ir,
245+
expanded_dsl,
260246

261247
#[cfg(feature = "python")]
262248
python_scan,
@@ -265,8 +251,10 @@ impl Debug for ExpandedDataset {
265251
return display::ExpandedDataset {
266252
limit,
267253
projection,
268-
resolved_ir,
269-
254+
expanded_dsl: &match expanded_dsl.display() {
255+
Ok(v) => v.to_string(),
256+
Err(e) => e.to_string(),
257+
},
270258
#[cfg(feature = "python")]
271259
python_scan: python_scan.as_ref().map(
272260
|ExpandedPythonScan {
@@ -281,18 +269,17 @@ impl Debug for ExpandedDataset {
281269
.fmt(f);
282270

283271
mod display {
272+
use std::fmt::Debug;
284273
use std::sync::Arc;
285274

286275
use polars_utils::pl_str::PlSmallStr;
287276

288-
use crate::prelude::IR;
289-
290277
#[derive(Debug)]
291278
#[expect(unused)]
292279
pub struct ExpandedDataset<'a> {
293280
pub limit: &'a Option<usize>,
294281
pub projection: &'a Option<Arc<[PlSmallStr]>>,
295-
pub resolved_ir: &'a IR,
282+
pub expanded_dsl: &'a str,
296283

297284
#[cfg(feature = "python")]
298285
pub python_scan: Option<PlSmallStr>,

py-polars/tests/unit/io/test_iceberg.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,18 +53,18 @@ class TestIcebergScanIO:
5353
"""Test coverage for `iceberg` scan ops."""
5454

5555
def test_scan_iceberg_plain(self, iceberg_path: str) -> None:
56-
df = pl.scan_iceberg(iceberg_path)
57-
assert len(df.collect()) == 3
58-
assert df.collect_schema() == {
56+
q = pl.scan_iceberg(iceberg_path)
57+
assert len(q.collect()) == 3
58+
assert q.collect_schema() == {
5959
"id": pl.Int32,
6060
"str": pl.String,
6161
"ts": pl.Datetime(time_unit="us", time_zone=None),
6262
}
6363

6464
def test_scan_iceberg_snapshot_id(self, iceberg_path: str) -> None:
65-
df = pl.scan_iceberg(iceberg_path, snapshot_id=7051579356916758811)
66-
assert len(df.collect()) == 3
67-
assert df.collect_schema() == {
65+
q = pl.scan_iceberg(iceberg_path, snapshot_id=7051579356916758811)
66+
assert len(q.collect()) == 3
67+
assert q.collect_schema() == {
6868
"id": pl.Int32,
6969
"str": pl.String,
7070
"ts": pl.Datetime(time_unit="us", time_zone=None),

0 commit comments

Comments (0)