|
4 | 4 |
|
5 | 5 | namespace PhpLlm\LlmChain\Store; |
6 | 6 |
|
7 | | -use PhpLlm\LlmChain\Platform\Capability; |
8 | | -use PhpLlm\LlmChain\Platform\Model; |
9 | | -use PhpLlm\LlmChain\Platform\PlatformInterface; |
10 | 7 | use PhpLlm\LlmChain\Store\Document\TextDocument; |
11 | | -use PhpLlm\LlmChain\Store\Document\VectorDocument; |
| 8 | +use PhpLlm\LlmChain\Store\Document\Vectorizer; |
12 | 9 | use Psr\Log\LoggerInterface; |
13 | 10 | use Psr\Log\NullLogger; |
14 | | -use Symfony\Component\Clock\Clock; |
15 | | -use Symfony\Component\Clock\ClockInterface; |
16 | 11 |
|
17 | 12 | /** |
| 13 | + * Converts a collection of TextDocuments into VectorDocuments and pushes them to a store implementation. |
| 14 | + * |
18 | 15 | * @author Christopher Hertel <[email protected]> |
19 | 16 | */ |
final readonly class Indexer
{
    /**
     * @param Vectorizer      $vectorizer used to vectorize each batch of TextDocuments
     * @param StoreInterface  $store      receives the vectorized documents of each batch
     * @param LoggerInterface $logger     receives a debug summary at the end of index()
     */
    public function __construct(
        private Vectorizer $vectorizer,
        private StoreInterface $store,
        private LoggerInterface $logger = new NullLogger(),
    ) {
    }

    /**
     * Vectorizes the given documents and adds them to the store batch by batch.
     *
     * The input is consumed lazily, so generators and other non-array iterables are
     * supported; at most one batch of documents is buffered at a time.
     *
     * @param TextDocument|iterable<TextDocument> $documents
     * @param int                                 $chunkSize number of documents to vectorize and store in one batch;
     *                                                       a value lower than 1 never triggers an intermediate flush,
     *                                                       so all documents are handled as one single batch
     */
    public function index(TextDocument|iterable $documents, int $chunkSize = 50): void
    {
        if ($documents instanceof TextDocument) {
            $documents = [$documents];
        }

        $counter = 0;
        $chunk = [];
        foreach ($documents as $document) {
            $chunk[] = $document;
            ++$counter;

            if ($chunkSize === \count($chunk)) {
                $this->store->add(...$this->vectorizer->vectorizeDocuments($chunk));
                $chunk = [];
            }
        }

        // Flush the trailing partial batch: without this, every document after the last
        // full chunk (or all of them, when fewer than $chunkSize were given) would be
        // counted in the log message below but never reach the store.
        if ([] !== $chunk) {
            $this->store->add(...$this->vectorizer->vectorizeDocuments($chunk));
        }

        $this->logger->debug(0 === $counter ? 'No documents to index' : \sprintf('Indexed %d documents', $counter));
    }
}
0 commit comments