@@ -1755,18 +1755,6 @@ def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
    ...


-# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
-# "use llama_kv_self_n_tokens instead");
-@ctypes_function(
-    "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32
-)
-def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int:
-    """Returns the number of tokens in the KV cache (slow, use only for debug)
-    If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    """
-    ...
-
-
# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
# LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
@ctypes_function(
@@ -1777,16 +1765,6 @@ def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int:
    ...


-# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
-# "use llama_kv_self_used_cells instead");
-@ctypes_function(
-    "llama_get_kv_cache_used_cells", [llama_context_p_ctypes], ctypes.c_int32
-)
-def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
-    """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
-    ...
-
-
# // Clear the KV cache - both cell info is erased and KV data is zeroed
# LLAMA_API void llama_kv_self_clear(
# struct llama_context * ctx);
@@ -1797,49 +1775,6 @@ def llama_kv_self_clear(ctx: llama_context_p, /):
17971775 """Clear the KV cache - both cell info is erased and KV data is zeroed"""
17981776 ...
17991777
1800- # NOTE: Deprecated
1801- @ctypes_function ("llama_kv_self_clear" , [llama_context_p_ctypes ], None )
1802- def llama_kv_cache_clear (ctx : llama_context_p , / ):
1803- """Clear the KV cache"""
1804- ...
1805-
1806-
1807- # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
1808- # // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
1809- # // seq_id < 0 : match any sequence
1810- # // p0 < 0 : [0, p1]
1811- # // p1 < 0 : [p0, inf)
1812- # LLAMA_API bool llama_kv_cache_seq_rm(
1813- # struct llama_context * ctx,
1814- # llama_seq_id seq_id,
1815- # llama_pos p0,
1816- # llama_pos p1);
1817- @ctypes_function (
1818- "llama_kv_cache_seq_rm" ,
1819- [
1820- llama_context_p_ctypes ,
1821- llama_seq_id ,
1822- llama_pos ,
1823- llama_pos ,
1824- ],
1825- ctypes .c_bool ,
1826- )
1827- def llama_kv_cache_seq_rm (
1828- ctx : llama_context_p ,
1829- seq_id : Union [llama_seq_id , int ],
1830- p0 : Union [llama_pos , int ],
1831- p1 : Union [llama_pos , int ],
1832- / ,
1833- ) -> bool :
1834- """Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
1835-
1836- Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
1837-
1838- seq_id < 0 : match any sequence
1839- p0 < 0 : [0, p1]
1840- p1 < 0 : [p0, inf)"""
1841- ...
1842-
18431778
18441779# // Copy all tokens that belong to the specified sequence to another sequence
18451780# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
@@ -1877,33 +1812,6 @@ def llama_kv_self_seq_cp(
    ...


-# NOTE: Deprecated
-@ctypes_function(
-    "llama_kv_self_seq_cp",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-    ],
-    None,
-)
-def llama_kv_cache_seq_cp(
-    ctx: llama_context_p,
-    seq_id_src: Union[llama_seq_id, int],
-    seq_id_dst: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    /,
-):
-    """Copy all tokens that belong to the specified sequence to another sequence
-    Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-    p0 < 0 : [0, p1]
-    p1 < 0 : [p0, inf)"""
-    ...
-
-
# // Removes all tokens that do not belong to the specified sequence
# LLAMA_API void llama_kv_self_seq_keep(
# struct llama_context * ctx,
@@ -1916,13 +1824,6 @@ def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int
    ...


-# NOTE: Deprecated
-@ctypes_function(
-    "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
-)
-def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
-    """Removes all tokens that do not belong to the specified sequence"""
-    ...



@@ -1964,49 +1865,6 @@ def llama_kv_self_seq_add(
    p0 < 0 : [0, p1]
    p1 < 0 : [p0, inf)"""
    ...
-
-
-# // NOTE: Deprecated
-# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-# // If the KV cache is RoPEd, the KV data is updated accordingly:
-# // - lazily on next llama_decode()
-# // - explicitly with llama_kv_cache_update()
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_add(
-# struct llama_context * ctx,
-# llama_seq_id seq_id,
-# llama_pos p0,
-# llama_pos p1,
-# llama_pos delta);
-@ctypes_function(
-    "llama_kv_self_seq_add",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-        llama_pos,
-    ],
-    None,
-)
-def llama_kv_cache_seq_add(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    delta: Union[llama_pos, int],
-    /,
-):
-    """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    If the KV cache is RoPEd, the KV data is updated accordingly:
-    - lazily on next llama_decode()
-    - explicitly with llama_kv_cache_update()
-    p0 < 0 : [0, p1]
-    p1 < 0 : [p0, inf)"""
-    ...
-
-
# // Integer division of the positions by factor of `d > 1`
# // If the KV cache is RoPEd, the KV data is updated accordingly
# // p0 < 0 : [0, p1]
@@ -2043,43 +1901,6 @@ def llama_kv_self_seq_div(
    ...


-# // NOTE: Deprecated
-# // Integer division of the positions by factor of `d > 1`
-# // If the KV cache is RoPEd, the KV data is updated accordingly
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_div(
-# struct llama_context * ctx,
-# llama_seq_id seq_id,
-# llama_pos p0,
-# llama_pos p1,
-# int d);
-@ctypes_function(
-    "llama_kv_self_seq_div",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-        ctypes.c_int,
-    ],
-    None,
-)
-def llama_kv_cache_seq_div(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    d: Union[ctypes.c_int, int],
-    /,
-):
-    """Integer division of the positions by factor of `d > 1`
-    If the KV cache is RoPEd, the KV data is updated accordingly
-    p0 < 0 : [0, p1]
-    p1 < 0 : [p0, inf)"""
-    ...
-
-
# // Returns the largest position present in the KV cache for the specified sequence
# LLAMA_API llama_pos llama_kv_self_seq_pos_max(
# struct llama_context * ctx,
@@ -2108,21 +1929,6 @@ def llama_kv_self_defrag(ctx: llama_context_p, /):
    ...


-# NOTE: Deprecated
-# // Defragment the KV cache
-# // This will be applied:
-# // - lazily on next llama_decode()
-# // - explicitly with llama_kv_self_update()
-# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
-@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None)
-def llama_kv_cache_defrag(ctx: llama_context_p, /):
-    """Defragment the KV cache
-    This will be applied:
-    - lazily on next llama_decode()
-    - explicitly with llama_kv_cache_update()"""
-    ...
-
-
# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
# LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None)
@@ -2147,15 +1953,6 @@ def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
    ...


-# // NOTE: Deprecated
-# // Check if the context supports KV cache shifting
-# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
-@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
-def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool:
-    """Check if the context supports KV cache shifting"""
-    ...
-
-
# //
# // State / sessions
# //
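
A minimal migration sketch (not part of the diff itself): each wrapper removed above has a llama_kv_self_* counterpart that the diff keeps, as the deprecation comments and the surviving context lines show. The sketch assumes the bindings are importable as the llama_cpp module and that `ctx` is a llama_context created elsewhere; both are assumptions, not something this commit demonstrates.

import llama_cpp

# Hypothetical migration: each left-hand call used a wrapper deleted in this
# commit; the llama_kv_self_* binding it maps to is kept by the diff above.
# `ctx` is assumed to be an already-initialized llama_context (not created here).
n_tokens = llama_cpp.llama_kv_self_n_tokens(ctx)     # was llama_get_kv_cache_token_count(ctx)
n_cells = llama_cpp.llama_kv_self_used_cells(ctx)    # was llama_get_kv_cache_used_cells(ctx)
llama_cpp.llama_kv_self_seq_keep(ctx, 0)             # was llama_kv_cache_seq_keep(ctx, 0)
llama_cpp.llama_kv_self_clear(ctx)                   # was llama_kv_cache_clear(ctx)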