
Commit fbee279

fix: remove duplicate layer multiplication in KV cache size calculation (#6481)
Signed-off-by: Jaedeok Kim <[email protected]>
1 parent 7bb0a78 commit fbee279

3 files changed: +27 -19 lines changed


cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 2 additions & 2 deletions

@@ -477,9 +477,9 @@ std::map<SizeType32, float> BlockManager::calculateWindowSizeToShare(
         windowSizeToContribution[windowSize] = cacheSizeWeight;
     }

-    for (auto const& [windowSize, layers] : windowSizeToLayers)
+    for (auto const& [windowSize, _] : windowSizeToLayers)
     {
-        windowSizeToContribution.at(windowSize) *= windowSize * layers.size();
+        windowSizeToContribution.at(windowSize) *= windowSize;
     }
     auto const windowSizesTotalSum = std::accumulate(windowSizeToContribution.begin(), windowSizeToContribution.end(),
         0.0, [](auto sum, auto const& windowSize) { return sum + windowSize.second; });
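
The change is small but meaningful: the per-window weight computed just above this loop already accumulates the cache size per token across all layers of the window (as the new test comment below notes), so multiplying by layers.size() again double-counted the layer dimension. As a rough illustration only (not the library code; the free function name computeWindowShares and its bare std::map signature are assumptions for the example), the corrected weighting amounts to:

// Minimal sketch of the corrected share calculation, assuming cacheSizePerTokenPerWindow
// already aggregates the per-token cache size across all layers of each window.
#include <cstdint>
#include <iostream>
#include <map>
#include <numeric>

using SizeType32 = std::int32_t;

std::map<SizeType32, float> computeWindowShares(std::map<SizeType32, SizeType32> const& cacheSizePerTokenPerWindow)
{
    std::map<SizeType32, float> contributions;
    for (auto const& [windowSize, cacheSizePerToken] : cacheSizePerTokenPerWindow)
    {
        // Weight by windowSize * cacheSizePerToken only; the per-layer factor is
        // already folded into cacheSizePerToken, so no extra layers.size() term.
        contributions[windowSize] = static_cast<float>(windowSize) * static_cast<float>(cacheSizePerToken);
    }
    float const total = std::accumulate(contributions.begin(), contributions.end(), 0.0f,
        [](float sum, auto const& kv) { return sum + kv.second; });
    for (auto& entry : contributions)
    {
        entry.second /= total; // Normalize contributions into shares that sum to 1.
    }
    return contributions;
}

int main()
{
    // Same inputs as the first test case below: identical cache size per token per window.
    for (auto const& [windowSize, share] : computeWindowShares({{1024, 1}, {4096, 1}, {8192, 1}}))
    {
        std::cout << windowSize << " -> " << share << '\n'; // ~0.0769, ~0.3077, ~0.6154
    }
    return 0;
}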

cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp

Lines changed: 18 additions & 17 deletions

@@ -414,8 +414,8 @@ TEST_F(KVCacheManagerTest, BlockManagerTestWindowSizeToShare)
     {
         std::map<SizeType32, std::vector<SizeType32>> windowSizeToLayers{
             {1024, {1}},       // contribution = 1024*1 = 1024
-            {4096, {0, 4, 5}}, // contribution = 4096*3 = 12288
-            {8192, {2, 3}},    // contribution = 8192*2 = 16384
+            {4096, {0, 4, 5}}, // contribution = 4096*1 = 4096
+            {8192, {2, 3}},    // contribution = 8192*1 = 8192
         };
         // Use identical cache size per token across window sizes for simplicity.
         std::map<SizeType32, SizeType32> cacheSizePerTokenPerWindow{{1024, 1}, {4096, 1}, {8192, 1}};

@@ -431,9 +431,9 @@ TEST_F(KVCacheManagerTest, BlockManagerTestWindowSizeToShare)
         // Calculate expected shares based on contributions.
         std::map<SizeType32, float> expectedShares;
         std::map<SizeType32, SizeType32> contributions;
-        for (auto const& [windowSize, layers] : windowSizeToLayers)
+        for (auto const& [windowSize, _] : windowSizeToLayers)
         {
-            contributions[windowSize] = windowSize * static_cast<SizeType32>(layers.size());
+            contributions[windowSize] = windowSize * 1.0f;
         }
         auto const totalContribution = std::accumulate(contributions.begin(), contributions.end(), 0.0f,
             [](float sum, auto const& kv) { return sum + kv.second; });

@@ -445,27 +445,28 @@ TEST_F(KVCacheManagerTest, BlockManagerTestWindowSizeToShare)
         }

         // Verify the exact hard-coded values mentioned in the comment
-        EXPECT_NEAR(result.at(1024), 0.0345f, 1e-4f);
-        EXPECT_NEAR(result.at(4096), 0.4138f, 1e-4f);
-        EXPECT_NEAR(result.at(8192), 0.5517f, 1e-4f);
+        EXPECT_NEAR(result.at(1024), 0.0769f, 1e-4f);
+        EXPECT_NEAR(result.at(4096), 0.3077f, 1e-4f);
+        EXPECT_NEAR(result.at(8192), 0.6154f, 1e-4f);

         // Verify that when shares are converted to actual block counts, they match expected values.
         auto getRoundedBlocks
             = [&](float share) { return static_cast<SizeType32>(std::round(share * numPrimaryBlocks)); };
-        EXPECT_EQ(getRoundedBlocks(result.at(1024)), 565);
-        EXPECT_EQ(getRoundedBlocks(result.at(4096)), 6780);
-        EXPECT_EQ(getRoundedBlocks(result.at(8192)), 9039);
+        EXPECT_EQ(getRoundedBlocks(result.at(1024)), 1260);
+        EXPECT_EQ(getRoundedBlocks(result.at(4096)), 5041);
+        EXPECT_EQ(getRoundedBlocks(result.at(8192)), 10082);
     }

     // Variable window size with different cache sizes per token per window
     {
         std::map<SizeType32, std::vector<SizeType32>> windowSizeToLayers{
-            {1024, {1}},       // contribution = 1024*1*2 = 2048 (cache size per token = 2)
-            {4096, {0, 4, 5}}, // contribution = 4096*3*4 = 49152 (cache size per token = 4)
-            {8192, {2, 3}},    // contribution = 8192*2*1 = 16384 (cache size per token = 1)
+            {1024, {1}},       // contribution = 1024*(1*2) = 2048 (cache size per token per layer = 2)
+            {4096, {0, 4, 5}}, // contribution = 4096*(3*4) = 49152 (cache size per token per layer = 4)
+            {8192, {2, 3}},    // contribution = 8192*(2*1) = 16384 (cache size per token per layer = 1)
         };
-        // Different cache sizes per token per window
-        std::map<SizeType32, SizeType32> cacheSizePerTokenPerWindow{{1024, 2}, {4096, 4}, {8192, 1}};
+        // Different cache sizes per token per window.
+        // cacheSizePerTokenPerWindow is accumulated across the layers of given window size.
+        std::map<SizeType32, SizeType32> cacheSizePerTokenPerWindow{{1024, 2}, {4096, 12}, {8192, 2}};

         auto result = BlockManager::calculateWindowSizeToShare(windowSizeToLayers, cacheSizePerTokenPerWindow);
         EXPECT_EQ(result.size(), 3);

@@ -478,10 +479,10 @@ TEST_F(KVCacheManagerTest, BlockManagerTestWindowSizeToShare)
         // Calculate expected shares based on contributions with different cache sizes per token.
         std::map<SizeType32, float> expectedShares;
         std::map<SizeType32, SizeType32> contributions;
-        for (auto const& [windowSize, layers] : windowSizeToLayers)
+        for (auto const& [windowSize, _] : windowSizeToLayers)
         {
             auto const cacheSizePerToken = cacheSizePerTokenPerWindow.at(windowSize);
-            contributions[windowSize] = windowSize * static_cast<SizeType32>(layers.size()) * cacheSizePerToken;
+            contributions[windowSize] = windowSize * cacheSizePerToken;
         }
         auto const totalContribution = std::accumulate(contributions.begin(), contributions.end(), 0.0f,
             [](float sum, auto const& kv) { return sum + kv.second; });
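
The updated expectations follow directly from the corrected weighting. In the first test case every window has cache size per token 1, so the contributions are simply 1024, 4096, and 8192, totalling 13312; the shares are 1024/13312 ≈ 0.0769, 4096/13312 ≈ 0.3077, and 8192/13312 ≈ 0.6154, and scaling by the test's numPrimaryBlocks (16384, judging by the rounded counts) gives 1260, 5041, and 10082 blocks. In the second case the contributions are 1024*2 = 2048, 4096*12 = 49152, and 8192*2 = 16384, for a total of 67584, matching the per-window comments in the test.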

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 7 additions & 0 deletions

@@ -206,6 +206,13 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
         self.max_attention_window_vec = kv_cache_config.max_attention_window.copy(
         )  # Make a copy to avoid modifying original

+        # Clamp all window sizes to max_seq_len before calculating the
+        # number of KV cache blocks. This prevents the KV cache pool from
+        # being skewed by the largest window values.
+        self.max_attention_window_vec = [
+            min(max_seq_len, w) for w in self.max_attention_window_vec
+        ]
+
         sink_token_length = (kv_cache_config.sink_token_length
                              if kv_cache_config.sink_token_length is not None
                              else 0)
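
The Python-side change caps each configured attention window at max_seq_len before the block counts are derived, so a single oversized window can no longer skew the pool split; for example, with max_seq_len = 2048, a window vector of [1024, 4096, 8192] would become [1024, 2048, 2048].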
