Skip to content

Commit 3c04620

Browse files
authored
TensorRT-LLM v0.18 release (#3231)
1 parent 258c754 commit 3c04620

File tree

113 files changed

+1002
-708
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

113 files changed

+1002
-708
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ repos:
44
hooks:
55
- id: isort
66
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
7-
rev: v1.1.13
7+
rev: v1.5.5
88
hooks:
99
- id: remove-crlf
1010
- repo: https://github.com/google/yapf
@@ -24,7 +24,7 @@ repos:
2424
- id: check-yaml
2525
- id: trailing-whitespace
2626
- repo: https://github.com/PyCQA/autoflake
27-
rev: v1.6.1
27+
rev: v2.3.1
2828
hooks:
2929
- id: autoflake
3030
args: ['--in-place', '--remove-all-unused-imports', '--remove-unused-variables']

README.md

Lines changed: 3 additions & 3 deletions

cpp/include/tensorrt_llm/batch_manager/GptManager.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ class [[deprecated("Use the executor API instead.")]] GptManager
9494
[[nodiscard]] SizeType32 getMaxSequenceLen() const;
9595
[[nodiscard]] SizeType32 getMaxNumSequences() const;
9696
[[nodiscard]] SizeType32 getMaxDraftLen() const;
97+
[[nodiscard]] SizeType32 getVocabSizePadded() const;
9798

9899
void validateLlmRequest(
99100
LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;

cpp/include/tensorrt_llm/batch_manager/allocateKvCache.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@
2626
namespace tensorrt_llm::batch_manager
2727
{
2828

29-
namespace tle = tensorrt_llm::executor;
30-
3129
class AllocateKvCache : Algorithm
3230
{
3331
using BaseKVCacheManager = tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager;

cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ class BaseEvictionPolicy
5656

5757
struct ExpiringBlockComparator
5858
{
59-
inline bool operator()(BlockPtr const& a, BlockPtr const& b) const
59+
bool operator()(BlockPtr const& a, BlockPtr const& b) const
6060
{
6161
// If two blocks expire in the same millisecond, their expiration times will be equal. As a fallback, check the
6262
// raw pointer values.

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,8 @@ class KVCacheBlock
166166
public:
167167
using IdType = std::int32_t;
168168

169+
static constexpr IdType kCachedBlocksRootId = -1;
170+
169171
explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx);
170172

171173
void startScheduling();
@@ -379,6 +381,16 @@ class GenerationRequest
379381
return mKvCacheRetentionConfig.getDecodeDurationMs();
380382
}
381383

384+
[[nodiscard]] bool getContextRequiresCyclicKvCache() const
385+
{
386+
return mContextRequiresCyclicKvCache;
387+
}
388+
389+
void setContextRequiresCyclicKvCache(bool contextRequiresCyclicKvCache)
390+
{
391+
mContextRequiresCyclicKvCache = contextRequiresCyclicKvCache;
392+
}
393+
382394
private:
383395
// Request id of the sequence
384396
LlmRequest::RequestIdType mRequestId;
@@ -392,6 +404,9 @@ class GenerationRequest
392404
runtime::ITensor::SharedPtr mCacheBlockIndices;
393405
// The retention priority to assign to decode blocks
394406
executor::KvCacheRetentionConfig mKvCacheRetentionConfig;
407+
408+
// A value indicating whether or not the context is long enough to warrant the use of cyclic kv-cache.
409+
bool mContextRequiresCyclicKvCache{false};
395410
};
396411

397412
// attach metadata to a pool pointer
@@ -443,7 +458,7 @@ class BlockManager
443458
SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks,
444459
CacheType cacheType = CacheType::kSELF,
445460
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
446-
std::shared_ptr<KVCacheEventManager> eventManager = nullptr);
461+
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false);
447462

448463
~BlockManager();
449464

@@ -712,6 +727,9 @@ class BlockManager
712727
SizeType32 mMissedBlocks;
713728
std::set<KVCacheBlock::IdType> reusedBlockIds;
714729

730+
// Whether or not to maintain a hashmap of blocks.
731+
bool mEnableHashKey;
732+
715733
private:
716734
friend class KVCacheManager;
717735
};
@@ -818,16 +836,18 @@ class BaseKVCacheManager
818836
//! \details These blocks become reusable from next step.
819837
virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0;
820838

821-
virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0;
839+
[[nodiscard]] virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0;
822840

823-
virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(LlmRequest::RequestIdType requestId) const = 0;
841+
[[nodiscard]] virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(
842+
LlmRequest::RequestIdType requestId) const
843+
= 0;
824844

825-
virtual std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
845+
[[nodiscard]] virtual std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
826846
std::vector<LlmRequest::RequestIdType> const& requestIds) const
827847
= 0;
828848

829-
virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
830-
virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
849+
[[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
850+
[[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
831851

832852
virtual void refreshBlocks() = 0;
833853
virtual void flushIterationEvents() = 0;
@@ -846,7 +866,7 @@ class BaseKVCacheManager
846866
* 2 * modelConfig.getSizePerHead();
847867
}
848868

849-
[[nodiscard]] static std::tuple<SizeType32, SizeType32> const calculateMaxNumBlocks(KvCacheConfig const& config,
869+
[[nodiscard]] static std::tuple<SizeType32, SizeType32> calculateMaxNumBlocks(KvCacheConfig const& config,
850870
nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
851871
tensorrt_llm::runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);
852872

@@ -924,7 +944,7 @@ class KVCacheManager : public BaseKVCacheManager
924944
return mBlockManager.getNumFreeBlocks();
925945
}
926946

927-
[[nodiscard]] virtual SizeType32 getNumPools() const override
947+
[[nodiscard]] SizeType32 getNumPools() const override
928948
{
929949
return mBlockManager.getNumPools();
930950
}
@@ -994,8 +1014,6 @@ class KVCacheManager : public BaseKVCacheManager
9941014
/// @return The number of blocks
9951015
[[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const override;
9961016

997-
void addContextTokens(LlmRequest::RequestIdType requestId, SizeType32 numTokens);
998-
9991017
/// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed.
10001018
void addToken(LlmRequest::RequestIdType requestId) override;
10011019

cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ class BlockIterator
9191

9292
runtime::ITensor::SharedPtr mPool;
9393
runtime::ITensor::SharedPtr mCurrent;
94-
const std::vector<SizeType32> mBlockIds;
94+
std::vector<SizeType32> const mBlockIds;
9595
size_t mIdx;
9696
};
9797

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -490,9 +490,14 @@ class GenericLlmRequest
490490
initialize(req.getInputTokenIds(), req.getOutputConfig().returnLogProbs);
491491
}
492492

493-
void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen,
493+
void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded,
494494
std::optional<SizeType32> maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false)
495495
{
496+
if (mEndId.has_value())
497+
{
498+
TLLM_CHECK_WITH_INFO(*mEndId >= -1 && *mEndId < vocabSizePadded,
499+
"EndId (%d) is not within acceptable range [-1, %d).", *mEndId, vocabSizePadded);
500+
}
496501
TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderInputLen() > maxEncoderInputLen.value()),
497502
"Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderInputLen(),
498503
maxEncoderInputLen.value());

cpp/include/tensorrt_llm/runtime/samplingConfig.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "tensorrt_llm/layers/defaultDecodingParams.h"
2222
#include "tensorrt_llm/runtime/common.h"
2323

24+
#include <algorithm>
2425
#include <functional>
2526
#include <optional>
2627
#include <vector>
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:54aeaec28cc8cd7e5f62829fecf5af5be192e906333b108028af951fc6b6346d
3-
size 9125406
2+
oid sha256:2d361766d0a13d5d88071e546f5d7ca51fef92300fcc7b261337c638746cbff1
3+
size 9123884

0 commit comments

Comments (0)