19 changes: 19 additions & 0 deletions llama-index-core/llama_index/core/base/llms/types.py
@@ -425,6 +425,24 @@ def validate_cited_content(cls, v: Any) -> Any:
        return v


class ThinkingBlock(BaseModel):
    """A representation of the content streamed from an LLM's reasoning/thinking process."""

    block_type: Literal["thinking"] = "thinking"
    content: Optional[str] = Field(
        description="Content of the reasoning/thinking process, if available",
        default=None,
    )
    num_tokens: Optional[int] = Field(
        description="Number of tokens used for reasoning/thinking, if available",
        default=None,
    )
    additional_information: Dict[str, Any] = Field(
        description="Additional information related to the thinking/reasoning process, if available",
        default_factory=dict,
    )


ContentBlock = Annotated[
    Union[
        TextBlock,
@@ -435,6 +453,7 @@ def validate_cited_content(cls, v: Any) -> Any:
        CachePoint,
        CitableBlock,
        CitationBlock,
        ThinkingBlock,
    ],
    Field(discriminator="block_type"),
]
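For orientation (not part of the diff): a minimal sketch of how the new block composes with ChatMessage, assuming the usual Pydantic-bridge round-trip behavior of llama-index core.

from llama_index.core.base.llms.types import ChatMessage, TextBlock, ThinkingBlock

# Interleave a thinking trace with visible assistant text.
msg = ChatMessage(
    role="assistant",
    blocks=[
        ThinkingBlock(content="Factor the quadratic first.", num_tokens=12),
        TextBlock(text="The roots follow from the quadratic formula."),
    ],
)

# block_type is the discriminator, so serialization round-trips cleanly.
restored = ChatMessage.model_validate(msg.model_dump())
assert isinstance(restored.blocks[0], ThinkingBlock)
assert restored.blocks[0].num_tokens == 12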
2 changes: 2 additions & 0 deletions llama-index-core/llama_index/core/memory/memory.py
@@ -28,6 +28,7 @@
    CachePoint,
    CitableBlock,
    CitationBlock,
    ThinkingBlock,
)
from llama_index.core.bridge.pydantic import (
    BaseModel,
@@ -343,6 +344,7 @@ def _estimate_token_count(
                DocumentBlock,
                CitableBlock,
                CitationBlock,
                ThinkingBlock,
            ]
        ] = []
18 changes: 18 additions & 0 deletions llama-index-core/tests/base/llms/test_types.py
@@ -17,6 +17,7 @@
    AudioBlock,
    CachePoint,
    CacheControl,
    ThinkingBlock,
)
from llama_index.core.bridge.pydantic import BaseModel
from llama_index.core.bridge.pydantic import ValidationError
@@ -455,3 +456,20 @@ def test_video_block_store_as_base64(mp4_bytes: bytes, mp4_base64: bytes):
    assert VideoBlock(video=mp4_bytes).video == mp4_base64
    # Store already encoded data
    assert VideoBlock(video=mp4_base64).video == mp4_base64


def test_thinking_block():
    block = ThinkingBlock()
    assert block.block_type == "thinking"
    assert block.additional_information == {}
    assert block.content is None
    assert block.num_tokens is None
    block = ThinkingBlock(
        content="hello world",
        num_tokens=100,
        additional_information={"total_thinking_tokens": 1000},
    )
    assert block.block_type == "thinking"
    assert block.additional_information == {"total_thinking_tokens": 1000}
    assert block.content == "hello world"
    assert block.num_tokens == 100
llama-index-llms-anthropic: base module (file header not captured in this view)
@@ -25,6 +25,7 @@
)
from llama_index.core.base.llms.types import TextBlock as LITextBlock
from llama_index.core.base.llms.types import CitationBlock as LICitationBlock
from llama_index.core.base.llms.types import ThinkingBlock as LIThinkingBlock
from llama_index.core.bridge.pydantic import Field, PrivateAttr
from llama_index.core.callbacks import CallbackManager
from llama_index.core.constants import DEFAULT_TEMPERATURE
@@ -204,6 +205,9 @@ def __init__(
    ) -> None:
        additional_kwargs = additional_kwargs or {}
        callback_manager = callback_manager or CallbackManager([])
        # set the temperature to 1 when thinking is enabled, as per: https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking
        if thinking_dict and thinking_dict.get("type") == "enabled":
            temperature = 1

        super().__init__(
            temperature=temperature,
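A hedged sketch of the constructor behavior this hunk introduces (the model name and argument values are borrowed from the test further down; the override itself is what the hunk adds):

from llama_index.llms.anthropic import Anthropic

# Extended thinking requires temperature == 1 per the Anthropic docs linked
# above, so the constructor overrides whatever the caller passed.
llm = Anthropic(
    model="claude-sonnet-4-0",
    temperature=0.2,  # silently replaced with 1 because thinking is enabled
    max_tokens=64000,
    thinking_dict={"type": "enabled", "budget_tokens": 1600},
)
assert llm.temperature == 1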
@@ -340,11 +344,8 @@ def _completion_response_from_chat_response(

    def _get_blocks_and_tool_calls_and_thinking(
        self, response: Any
-    ) -> Tuple[
-        List[ContentBlock], List[Dict[str, Any]], Dict[str, Any], List[Dict[str, Any]]
-    ]:
+    ) -> Tuple[List[ContentBlock], List[Dict[str, Any]], List[Dict[str, Any]]]:
        tool_calls = []
-        thinking = None
        blocks: List[ContentBlock] = []
        citations: List[TextCitation] = []
        tracked_citations: Set[str] = set()
@@ -375,11 +376,18 @@ def _get_blocks_and_tool_calls_and_thinking(
                    citations.extend(content_block.citations)
            # this assumes a single thinking block, which as of 2025-03-06, is always true
            elif isinstance(content_block, ThinkingBlock):
-                thinking = content_block.model_dump()
+                blocks.append(
+                    LIThinkingBlock(
+                        content=content_block.thinking,
+                        additional_information=content_block.model_dump(
+                            exclude={"thinking"}
+                        ),
+                    )
+                )
            elif isinstance(content_block, ToolUseBlock):
                tool_calls.append(content_block.model_dump())

-        return blocks, tool_calls, thinking, [x.model_dump() for x in citations]
+        return blocks, tool_calls, [x.model_dump() for x in citations]

    @llm_chat_callback()
    def chat(
@@ -397,8 +405,8 @@ def chat(
            **all_kwargs,
        )

-        blocks, tool_calls, thinking, citations = (
-            self._get_blocks_and_tool_calls_and_thinking(response)
+        blocks, tool_calls, citations = self._get_blocks_and_tool_calls_and_thinking(
+            response
        )

        return AnthropicChatResponse(
@@ -407,7 +415,6 @@
                blocks=blocks,
                additional_kwargs={
                    "tool_calls": tool_calls,
-                    "thinking": thinking,
                },
            ),
            citations=citations,
Expand Down Expand Up @@ -570,8 +577,8 @@ async def achat(
**all_kwargs,
)

blocks, tool_calls, thinking, citations = (
self._get_blocks_and_tool_calls_and_thinking(response)
blocks, tool_calls, citations = self._get_blocks_and_tool_calls_and_thinking(
response
)

return AnthropicChatResponse(
@@ -580,7 +587,6 @@
                blocks=blocks,
                additional_kwargs={
                    "tool_calls": tool_calls,
-                    "thinking": thinking,
                },
            ),
            citations=citations,
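Since thinking now travels in message.blocks rather than additional_kwargs["thinking"], downstream code reads it roughly like this (a sketch mirroring the test below; resp stands for any AnthropicChatResponse):

from llama_index.core.base.llms.types import ThinkingBlock

def thinking_text(resp) -> str:
    # Concatenate the content of every ThinkingBlock in the response message.
    return "".join(
        block.content or ""
        for block in resp.message.blocks
        if isinstance(block, ThinkingBlock)
    )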
llama-index-llms-anthropic: utils module (file header not captured in this view)
@@ -14,6 +14,7 @@
    CachePoint,
    CitableBlock,
    CitationBlock,
    ThinkingBlock,
    ContentBlock,
)

@@ -26,7 +27,7 @@
    CacheControlEphemeralParam,
    Base64PDFSourceParam,
)
-from anthropic.types import ContentBlock as AnthropicContentBlock
+from anthropic.types import ContentBlockParam as AnthropicContentBlock
from anthropic.types.beta import (
    BetaSearchResultBlockParam,
    BetaTextBlockParam,
@@ -201,9 +202,6 @@ def blocks_to_anthropic_blocks(
    if kwargs.get("cache_control"):
        global_cache_control = CacheControlEphemeralParam(**kwargs["cache_control"])

-    if kwargs.get("thinking"):
-        anthropic_blocks.append(ThinkingBlockParam(**kwargs["thinking"]))
-
    for block in blocks:
        if isinstance(block, TextBlock):
            if block.text:
@@ -251,6 +249,17 @@
                if global_cache_control:
                    anthropic_blocks[-1]["cache_control"] = global_cache_control

        elif isinstance(block, ThinkingBlock):
            if block.content:
                signature = block.additional_information.get("signature", "")
                anthropic_blocks.append(
                    ThinkingBlockParam(
                        signature=signature, thinking=block.content, type="thinking"
                    )
                )
                if global_cache_control:
                    anthropic_blocks[-1]["cache_control"] = global_cache_control

        elif isinstance(block, CachePoint):
            if len(anthropic_blocks) > 0:
                anthropic_blocks[-1]["cache_control"] = CacheControlEphemeralParam(
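A sketch of the round trip this hunk enables, assuming messages_to_anthropic_messages lives in llama_index.llms.anthropic.utils (the tests below call it) and that a provider signature, when present, rides along in additional_information; the "sig-abc" value is purely illustrative:

from llama_index.core.base.llms.types import ChatMessage, ThinkingBlock
from llama_index.llms.anthropic.utils import messages_to_anthropic_messages

msg = ChatMessage(
    role="assistant",
    blocks=[
        ThinkingBlock(
            content="Consider the discriminant first.",
            additional_information={"signature": "sig-abc"},  # hypothetical
        )
    ],
)
ant_messages, _ = messages_to_anthropic_messages([msg])
thinking_param = ant_messages[0]["content"][0]
assert thinking_param["type"] == "thinking"
assert thinking_param["thinking"] == "Consider the discriminant first."
assert thinking_param["signature"] == "sig-abc"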
llama-index-llms-anthropic: pyproject.toml (file header not captured in this view)
@@ -27,7 +27,7 @@ dev = [

[project]
name = "llama-index-llms-anthropic"
-version = "0.8.6"
+version = "0.9.0"
description = "llama-index llms anthropic integration"
authors = [{name = "Your Name", email = "[email protected]"}]
requires-python = ">=3.9,<4.0"
llama-index-llms-anthropic: tests (file header not captured in this view)
@@ -17,6 +17,7 @@
    CachePoint,
    CacheControl,
)
from llama_index.core.base.llms.types import ThinkingBlock
from llama_index.core.tools import FunctionTool
from llama_index.llms.anthropic import Anthropic
from llama_index.llms.anthropic.base import AnthropicChatResponse
@@ -384,6 +385,59 @@ def test_cache_point_to_cache_control() -> None:
    assert ant_messages[0]["content"][-1]["cache_control"]["ttl"] == "5m"


def test_thinking_input():
    messages = [
        ChatMessage(
            role="assistant",
            blocks=[
                ThinkingBlock(content="Hello"),
                TextBlock(text="World"),
            ],
        ),
    ]
    ant_messages, _ = messages_to_anthropic_messages(messages)
    assert ant_messages[0]["role"] == "assistant"
    assert ant_messages[0]["content"][0]["type"] == "thinking"
    assert ant_messages[0]["content"][0]["thinking"] == "Hello"
    assert ant_messages[0]["content"][1]["type"] == "text"
    assert ant_messages[0]["content"][1]["text"] == "World"


@pytest.mark.skipif(
    os.getenv("ANTHROPIC_API_KEY") is None,
    reason="Anthropic API key not available to test Anthropic thinking",
)
def test_thinking():
    llm = Anthropic(
        model="claude-sonnet-4-0",
        # max_tokens must be greater than budget_tokens
        max_tokens=64000,
        # temperature must be 1.0 for thinking to work
        temperature=1.0,
        thinking_dict={"type": "enabled", "budget_tokens": 1600},
    )
    res = llm.chat(
        messages=[
            ChatMessage(
                content="Please solve the following equation for x: x^2+12x+7=0. Please think before providing a response."
            )
        ]
    )
    assert any(isinstance(block, ThinkingBlock) for block in res.message.blocks)
    assert (
        len(
            "".join(
                [
                    block.content or ""
                    for block in res.message.blocks
                    if isinstance(block, ThinkingBlock)
                ]
            )
        )
        > 0
    )


@pytest.mark.skipif(
    os.getenv("ANTHROPIC_API_KEY") is None,
    reason="Anthropic API key not available to test Anthropic document uploading",
)
(another LLM integration's streaming base module; file header not captured in this view)
@@ -36,6 +36,7 @@
    CompletionResponseGen,
    LLMMetadata,
    MessageRole,
    ThinkingBlock,
)
from llama_index.core.bridge.pydantic import BaseModel, Field, PrivateAttr
from llama_index.core.callbacks import CallbackManager
@@ -393,7 +394,7 @@ def gen() -> ChatResponseGen:
                    )
                    llama_resp.delta = content_delta
                    llama_resp.message.content = content
-                    llama_resp.message.additional_kwargs["thoughts"] = thoughts
+                    llama_resp.message.blocks.append(ThinkingBlock(content=thoughts))
                    llama_resp.message.additional_kwargs["tool_calls"] = existing_tool_calls
                    yield llama_resp

@@ -453,7 +454,9 @@ async def gen() -> ChatResponseAsyncGen:
                    )
                    llama_resp.delta = content_delta
                    llama_resp.message.content = content
-                    llama_resp.message.additional_kwargs["thoughts"] = thoughts
+                    llama_resp.message.blocks.append(
+                        ThinkingBlock(content=thoughts)
+                    )
                    llama_resp.message.additional_kwargs["tool_calls"] = (
                        existing_tool_calls
                    )
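A sketch of consuming the new streaming behavior, assuming llm is an instance of an integration that appends the accumulated ThinkingBlock to the streamed message's blocks:

from llama_index.core.base.llms.types import ChatMessage, ThinkingBlock

resp_gen = llm.stream_chat([ChatMessage(content="Why is the sky blue?")])
final = None
for resp in resp_gen:
    print(resp.delta or "", end="", flush=True)  # visible text as it arrives
    final = resp

if final is not None:
    thoughts = [b for b in final.message.blocks if isinstance(b, ThinkingBlock)]
    if thoughts:
        print("\n--- model thinking ---")
        print(thoughts[0].content)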