Skip to content

Commit 3c04620

Browse files
authored
TensorRT-LLM v0.18 release (#3231)
1 parent 258c754 commit 3c04620

File tree

113 files changed

+1002
-708
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

113 files changed

+1002
-708
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ repos:
44
hooks:
55
- id: isort
66
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
7-
rev: v1.1.13
7+
rev: v1.5.5
88
hooks:
99
- id: remove-crlf
1010
- repo: https://github.com/google/yapf
@@ -24,7 +24,7 @@ repos:
2424
- id: check-yaml
2525
- id: trailing-whitespace
2626
- repo: https://github.com/PyCQA/autoflake
27-
rev: v1.6.1
27+
rev: v2.3.1
2828
hooks:
2929
- id: autoflake
3030
args: ['--in-place', '--remove-all-unused-imports', '--remove-unused-variables']

README.md

Lines changed: 3 additions & 3 deletions

cpp/include/tensorrt_llm/batch_manager/GptManager.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ class [[deprecated("Use the executor API instead.")]] GptManager
9494
[[nodiscard]] SizeType32 getMaxSequenceLen() const;
9595
[[nodiscard]] SizeType32 getMaxNumSequences() const;
9696
[[nodiscard]] SizeType32 getMaxDraftLen() const;
97+
[[nodiscard]] SizeType32 getVocabSizePadded() const;
9798

9899
void validateLlmRequest(
99100
LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;

cpp/include/tensorrt_llm/batch_manager/allocateKvCache.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@
2626
namespace tensorrt_llm::batch_manager
2727
{
2828

29-
namespace tle = tensorrt_llm::executor;
30-
3129
class AllocateKvCache : Algorithm
3230
{
3331
using BaseKVCacheManager = tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager;

cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ class BaseEvictionPolicy
5656

5757
struct ExpiringBlockComparator
5858
{
59-
inline bool operator()(BlockPtr const& a, BlockPtr const& b) const
59+
bool operator()(BlockPtr const& a, BlockPtr const& b) const
6060
{
6161
// If two blocks expire in the same millisecond, their expiration times will be equal. As a fallback, check the
6262
// raw pointer values.

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,8 @@ class KVCacheBlock
166166
public:
167167
using IdType = std::int32_t;
168168

169+
static constexpr IdType kCachedBlocksRootId = -1;
170+
169171
explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx);
170172

171173
void startScheduling();
@@ -379,6 +381,16 @@ class GenerationRequest
379381
return mKvCacheRetentionConfig.getDecodeDurationMs();
380382
}
381383

384+
[[nodiscard]] bool getContextRequiresCyclicKvCache() const
385+
{
386+
return mContextRequiresCyclicKvCache;
387+
}
388+
389+
void setContextRequiresCyclicKvCache(bool contextRequiresCyclicKvCache)
390+
{
391+
mContextRequiresCyclicKvCache = contextRequiresCyclicKvCache;
392+
}
393+
382394
private:
383395
// Request id of the sequence
384396
LlmRequest::RequestIdType mRequestId;
@@ -392,6 +404,9 @@ class GenerationRequest
392404
runtime::ITensor::SharedPtr mCacheBlockIndices;
393405
// The retention priority to assign to decode blocks
394406
executor::KvCacheRetentionConfig mKvCacheRetentionConfig;
407+
408+
// A value indicating whether or not the context is long enough to warrant the use of cyclic kv-cache.
409+
bool mContextRequiresCyclicKvCache{false};
395410
};
396411

397412
// attach metadata to a pool pointer
@@ -443,7 +458,7 @@ class BlockManager
443458
SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks,
444459
CacheType cacheType = CacheType::kSELF,
445460
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
446-
std::shared_ptr<KVCacheEventManager> eventManager = nullptr);
461+
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false);
447462

448463
~BlockManager();
449464

@@ -712,6 +727,9 @@ class BlockManager
712727
SizeType32 mMissedBlocks;
713728
std::set<KVCacheBlock::IdType> reusedBlockIds;
714729

730+
// Whether or not to maintain a hashmap of blocks.
731+
bool mEnableHashKey;
732+
715733
private:
716734
friend class KVCacheManager;
717735
};
@@ -818,16 +836,18 @@ class BaseKVCacheManager
818836
//! \details These blocks become reusable from next step.
819837
virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0;
820838

821-
virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0;
839+
[[nodiscard]] virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0;
822840

823-
virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(LlmRequest::RequestIdType requestId) const = 0;
841+
[[nodiscard]] virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(
842+
LlmRequest::RequestIdType requestId) const
843+
= 0;
824844

825-
virtual std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
845+
[[nodiscard]] virtual std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
826846
std::vector<LlmRequest::RequestIdType> const& requestIds) const
827847
= 0;
828848

829-
virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
830-
virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
849+
[[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
850+
[[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
831851

832852
virtual void refreshBlocks() = 0;
833853
virtual void flushIterationEvents() = 0;
@@ -846,7 +866,7 @@ class BaseKVCacheManager
846866
* 2 * modelConfig.getSizePerHead();
847867
}
848868

849-
[[nodiscard]] static std::tuple<SizeType32, SizeType32> const calculateMaxNumBlocks(KvCacheConfig const& config,
869+
[[nodiscard]] static std::tuple<SizeType32, SizeType32> calculateMaxNumBlocks(KvCacheConfig const& config,
850870
nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
851871
tensorrt_llm::runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);
852872

@@ -924,7 +944,7 @@ class KVCacheManager : public BaseKVCacheManager
924944
return mBlockManager.getNumFreeBlocks();
925945
}
926946

927-
[[nodiscard]] virtual SizeType32 getNumPools() const override
947+
[[nodiscard]] SizeType32 getNumPools() const override
928948
{
929949
return mBlockManager.getNumPools();
930950
}
@@ -994,8 +1014,6 @@ class KVCacheManager : public BaseKVCacheManager
9941014
/// @return The number of blocks
9951015
[[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const override;
9961016

997-
void addContextTokens(LlmRequest::RequestIdType requestId, SizeType32 numTokens);
998-
9991017
/// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed.
10001018
void addToken(LlmRequest::RequestIdType requestId) override;
10011019

cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ class BlockIterator
9191

9292
runtime::ITensor::SharedPtr mPool;
9393
runtime::ITensor::SharedPtr mCurrent;
94-
const std::vector<SizeType32> mBlockIds;
94+
std::vector<SizeType32> const mBlockIds;
9595
size_t mIdx;
9696
};
9797

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -490,9 +490,14 @@ class GenericLlmRequest
490490
initialize(req.getInputTokenIds(), req.getOutputConfig().returnLogProbs);
491491
}
492492

493-
void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen,
493+
void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded,
494494
std::optional<SizeType32> maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false)
495495
{
496+
if (mEndId.has_value())
497+
{
498+
TLLM_CHECK_WITH_INFO(*mEndId >= -1 && *mEndId < vocabSizePadded,
499+
"EndId (%d) is not within acceptable range [-1, %d).", *mEndId, vocabSizePadded);
500+
}
496501
TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderInputLen() > maxEncoderInputLen.value()),
497502
"Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderInputLen(),
498503
maxEncoderInputLen.value());

cpp/include/tensorrt_llm/runtime/samplingConfig.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "tensorrt_llm/layers/defaultDecodingParams.h"
2222
#include "tensorrt_llm/runtime/common.h"
2323

24+
#include <algorithm>
2425
#include <functional>
2526
#include <optional>
2627
#include <vector>
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:54aeaec28cc8cd7e5f62829fecf5af5be192e906333b108028af951fc6b6346d
3-
size 9125406
2+
oid sha256:2d361766d0a13d5d88071e546f5d7ca51fef92300fcc7b261337c638746cbff1
3+
size 9123884

0 commit comments

Comments (0)