 from __future__ import annotations

 from abc import ABC
-from typing import List, Optional, Tuple, Union, TYPE_CHECKING
+import threading
+from typing import Dict, List, Optional, Tuple, Union, TYPE_CHECKING

 from databricks.sql.cloudfetch.download_manager import ResultFileDownloadManager
@@ -121,6 +122,179 @@ def close(self):
         return


+class LinkFetcher:
+    """
+    Background helper that incrementally retrieves *external links* for a
+    result set produced by the SEA backend and feeds them to a
+    :class:`databricks.sql.cloudfetch.download_manager.ResultFileDownloadManager`.
+
+    The SEA backend splits large result sets into *chunks*. Each chunk is
+    stored remotely (e.g., in object storage) and exposed via a signed URL
+    encapsulated by an :class:`ExternalLink`. Only the first batch of links is
+    returned with the initial query response; the remaining links must be
+    pulled on demand using the next-chunk index carried by each link in
+    :attr:`ExternalLink.next_chunk_index`.
+
+    LinkFetcher takes care of this choreography so callers (primarily
+    ``SeaCloudFetchQueue``) can simply ask for the link of a specific
+    ``chunk_index`` and block until it becomes available.
+
+    Key responsibilities:
+
+    • Maintain an in-memory mapping from ``chunk_index`` → ``ExternalLink``.
+    • Launch a background worker thread that continuously requests the next
+      batch of links from the backend until all chunks have been discovered or
+      an unrecoverable error occurs.
+    • Bridge SEA link objects to the Thrift representation expected by the
+      existing download manager.
+    • Provide a synchronous API (``get_chunk_link``) that blocks until the
+      desired link is present in the cache.
+    """
+
+    def __init__(
+        self,
+        download_manager: ResultFileDownloadManager,
+        backend: SeaDatabricksClient,
+        statement_id: str,
+        initial_links: List[ExternalLink],
+        total_chunk_count: int,
+    ):
+        self.download_manager = download_manager
+        self.backend = backend
+        self._statement_id = statement_id
+
+        self._shutdown_event = threading.Event()
+
+        self._link_data_update = threading.Condition()
+        self._error: Optional[Exception] = None
+        self.chunk_index_to_link: Dict[int, ExternalLink] = {}
+
+        self._add_links(initial_links)
+        self.total_chunk_count = total_chunk_count
+
+        # DEBUG: capture initial state for observability
+        logger.debug(
+            "LinkFetcher[%s]: initialized with %d initial link(s); expecting %d total chunk(s)",
+            statement_id,
+            len(initial_links),
+            total_chunk_count,
+        )
+
+    def _add_links(self, links: List[ExternalLink]):
+        """Cache *links* locally and enqueue them with the download manager."""
+        logger.debug(
+            "LinkFetcher[%s]: caching %d link(s) – chunks %s",
+            self._statement_id,
+            len(links),
+            ", ".join(str(l.chunk_index) for l in links) if links else "<none>",
+        )
+        for link in links:
+            self.chunk_index_to_link[link.chunk_index] = link
+            self.download_manager.add_link(LinkFetcher._convert_to_thrift_link(link))
+
+    def _get_next_chunk_index(self) -> Optional[int]:
+        """Return the next *chunk_index* that should be requested from the backend, or ``None`` if we have them all."""
+        with self._link_data_update:
+            max_chunk_index = max(self.chunk_index_to_link.keys(), default=None)
+            if max_chunk_index is None:
+                return 0
+            max_link = self.chunk_index_to_link[max_chunk_index]
+            return max_link.next_chunk_index
+
+    def _trigger_next_batch_download(self) -> bool:
+        """Fetch the next batch of links from the backend and return *True* on success."""
+        logger.debug(
+            "LinkFetcher[%s]: requesting next batch of links", self._statement_id
+        )
+        next_chunk_index = self._get_next_chunk_index()
+        if next_chunk_index is None:
+            return False
+
+        try:
+            links = self.backend.get_chunk_links(self._statement_id, next_chunk_index)
+            with self._link_data_update:
+                self._add_links(links)
+                self._link_data_update.notify_all()
+        except Exception as e:
+            logger.error(
+                "LinkFetcher: Error fetching links for chunk %s: %s",
+                next_chunk_index,
+                e,
+            )
+            with self._link_data_update:
+                self._error = e
+                self._link_data_update.notify_all()
+            return False
+
+        logger.debug(
+            "LinkFetcher[%s]: received %d new link(s)",
+            self._statement_id,
+            len(links),
+        )
+        return True
+
+    def get_chunk_link(self, chunk_index: int) -> Optional[ExternalLink]:
+        """Return the :class:`ExternalLink` for *chunk_index*, blocking until it becomes available."""
+        logger.debug(
+            "LinkFetcher[%s]: waiting for link of chunk %d",
+            self._statement_id,
+            chunk_index,
+        )
+        if chunk_index >= self.total_chunk_count:
+            return None
+
+        with self._link_data_update:
+            while chunk_index not in self.chunk_index_to_link:
+                if self._error:
+                    raise self._error
+                if self._shutdown_event.is_set():
+                    raise ProgrammingError(
+                        "LinkFetcher is shutting down without providing link for chunk index {}".format(
+                            chunk_index
+                        )
+                    )
+                self._link_data_update.wait()
+
+            return self.chunk_index_to_link[chunk_index]
+
+    @staticmethod
+    def _convert_to_thrift_link(link: ExternalLink) -> TSparkArrowResultLink:
+        """Convert SEA external links to Thrift format for compatibility with existing download manager."""
+        # Parse the ISO format expiration time
+        expiry_time = int(dateutil.parser.parse(link.expiration).timestamp())
+        return TSparkArrowResultLink(
+            fileLink=link.external_link,
+            expiryTime=expiry_time,
+            rowCount=link.row_count,
+            bytesNum=link.byte_count,
+            startRowOffset=link.row_offset,
+            httpHeaders=link.http_headers or {},
+        )
+
+    def _worker_loop(self):
+        """Entry point for the background thread."""
+        logger.debug("LinkFetcher[%s]: worker thread started", self._statement_id)
+        while not self._shutdown_event.is_set():
+            links_downloaded = self._trigger_next_batch_download()
+            if not links_downloaded:
+                self._shutdown_event.set()
+        logger.debug("LinkFetcher[%s]: worker thread exiting", self._statement_id)
+        # notify_all() must be called with the condition's lock held (it raises
+        # RuntimeError otherwise); wake any waiters still blocked in get_chunk_link()
+        with self._link_data_update:
+            self._link_data_update.notify_all()
+
+    def start(self):
+        """Spawn the worker thread."""
+        logger.debug("LinkFetcher[%s]: starting worker thread", self._statement_id)
+        self._worker_thread = threading.Thread(
+            target=self._worker_loop, name=f"LinkFetcher-{self._statement_id}"
+        )
+        self._worker_thread.start()
+
+    def stop(self):
+        """Signal the worker thread to stop and wait for its termination."""
+        logger.debug("LinkFetcher[%s]: stopping worker thread", self._statement_id)
+        self._shutdown_event.set()
+        self._worker_thread.join()
+        logger.debug("LinkFetcher[%s]: worker thread stopped", self._statement_id)
+
+
 class SeaCloudFetchQueue(CloudFetchQueue):
     """Queue implementation for EXTERNAL_LINKS disposition with ARROW format for SEA backend."""

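Taken together, the new class gives `SeaCloudFetchQueue` a simple blocking façade over the background fetch: construct, `start()`, call `get_chunk_link()` per chunk, `stop()`. A minimal usage sketch (illustrative only; it assumes `download_manager`, `sea_client`, `statement_id`, `initial_links`, and `total_chunk_count` have already been obtained from the initial query response, exactly as in `__init__` below):

    fetcher = LinkFetcher(
        download_manager=download_manager,
        backend=sea_client,
        statement_id=statement_id,
        initial_links=initial_links,
        total_chunk_count=total_chunk_count,
    )
    fetcher.start()  # spawn the background worker thread

    try:
        chunk_index = 0
        while True:
            link = fetcher.get_chunk_link(chunk_index)  # blocks until cached
            if link is None:  # chunk_index is past the last chunk
                break
            # ...consume the downloaded chunk starting at link.row_offset...
            chunk_index += 1
    finally:
        fetcher.stop()  # signal shutdown and join the worker

The worker keeps calling `get_chunk_links` on the backend until a link's `next_chunk_index` comes back `None`, so consumers never issue backend calls themselves.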
@@ -158,80 +332,49 @@ def __init__(
             description=description,
         )

-        self._sea_client = sea_client
-        self._statement_id = statement_id
-        self._total_chunk_count = total_chunk_count
-
         logger.debug(
             "SeaCloudFetchQueue: Initialize CloudFetch loader for statement {}, total chunks: {}".format(
                 statement_id, total_chunk_count
             )
         )

         initial_links = result_data.external_links or []
-        self._chunk_index_to_link = {link.chunk_index: link for link in initial_links}

         # Track the current chunk we're processing
         self._current_chunk_index = 0
-        first_link = self._chunk_index_to_link.get(self._current_chunk_index, None)
-        if not first_link:
-            # possibly an empty response
-            return None

-        # Track the current chunk we're processing
-        self._current_chunk_index = 0
-        # Initialize table and position
-        self.table = self._create_table_from_link(first_link)
+        # for empty responses, we do not need a link fetcher
+        self.link_fetcher: Optional[LinkFetcher] = None
+        if total_chunk_count > 0:
+            self.link_fetcher = LinkFetcher(
+                download_manager=self.download_manager,
+                backend=sea_client,
+                statement_id=statement_id,
+                initial_links=initial_links,
+                total_chunk_count=total_chunk_count,
+            )
+            self.link_fetcher.start()

-    def _convert_to_thrift_link(self, link: ExternalLink) -> TSparkArrowResultLink:
-        """Convert SEA external links to Thrift format for compatibility with existing download manager."""
-        # Parse the ISO format expiration time
-        expiry_time = int(dateutil.parser.parse(link.expiration).timestamp())
-        return TSparkArrowResultLink(
-            fileLink=link.external_link,
-            expiryTime=expiry_time,
-            rowCount=link.row_count,
-            bytesNum=link.byte_count,
-            startRowOffset=link.row_offset,
-            httpHeaders=link.http_headers or {},
-        )
+        # Initialize table and position
+        self.table = self._create_next_table()

-    def _get_chunk_link(self, chunk_index: int) -> Optional["ExternalLink"]:
-        if chunk_index >= self._total_chunk_count:
+    def _create_next_table(self) -> Union["pyarrow.Table", None]:
+        """Create next table by retrieving the logical next downloaded file."""
+        if self.link_fetcher is None:
             return None

-        if chunk_index not in self._chunk_index_to_link:
-            links = self._sea_client.get_chunk_links(self._statement_id, chunk_index)
-            self._chunk_index_to_link.update({l.chunk_index: l for l in links})
-
-        link = self._chunk_index_to_link.get(chunk_index, None)
-        if not link:
-            raise ServerOperationError(
-                f"Error fetching link for chunk {chunk_index}",
-                {
-                    "operation-id": self._statement_id,
-                    "diagnostic-info": None,
-                },
-            )
-        return link
-
-    def _create_table_from_link(
-        self, link: ExternalLink
-    ) -> Union["pyarrow.Table", None]:
-        """Create a table from a link."""
-
-        thrift_link = self._convert_to_thrift_link(link)
-        self.download_manager.add_link(thrift_link)
+        chunk_link = self.link_fetcher.get_chunk_link(self._current_chunk_index)
+        if chunk_link is None:
+            return None

-        row_offset = link.row_offset
+        row_offset = chunk_link.row_offset
+        # NOTE: link has already been submitted to download manager at this point
         arrow_table = self._create_table_at_offset(row_offset)

+        self._current_chunk_index += 1
+
         return arrow_table

-    def _create_next_table(self) -> Union["pyarrow.Table", None]:
-        """Create next table by retrieving the logical next downloaded file."""
-        self._current_chunk_index += 1
-        next_chunk_link = self._get_chunk_link(self._current_chunk_index)
-        if not next_chunk_link:
-            return None
-        return self._create_table_from_link(next_chunk_link)
+    def close(self):
+        super().close()
+        if self.link_fetcher:
+            self.link_fetcher.stop()
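Design note: `get_chunk_link`, `_add_links`, and `_worker_loop` together form a classic condition-variable handoff. The producer mutates `chunk_index_to_link` and calls `notify_all()` while holding `_link_data_update`; consumers `wait()` in a loop until their key appears or an error/shutdown is flagged. A self-contained sketch of the same pattern, with toy names that are not part of this PR:

    import threading

    class ChunkCache:
        """Toy reduction of LinkFetcher's wait/notify protocol."""

        def __init__(self):
            self._cond = threading.Condition()
            self._items = {}  # chunk_index -> payload
            self._done = threading.Event()

        def put(self, index, payload):
            with self._cond:  # mutate shared state under the lock
                self._items[index] = payload
                self._cond.notify_all()

        def shutdown(self):
            self._done.set()
            with self._cond:  # notify_all() requires the lock to be held
                self._cond.notify_all()

        def get(self, index):
            with self._cond:
                while index not in self._items:
                    if self._done.is_set():
                        raise RuntimeError(f"shut down before chunk {index} arrived")
                    self._cond.wait()  # releases the lock while sleeping
                return self._items[index]

    cache = ChunkCache()
    threading.Thread(target=lambda: cache.put(0, "link-0")).start()
    print(cache.get(0))  # blocks briefly, then prints "link-0"

This is also why `_worker_loop` wraps its final `notify_all()` in `with self._link_data_update:`: calling `Condition.notify_all()` without holding the lock raises `RuntimeError`, and a waiter blocked in `get_chunk_link` would otherwise sleep through shutdown.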