Skip to content
This repository was archived by the owner on Jul 16, 2025. It is now read-only.

Commit fb0570c

Browse files
authored
fix: youtube transcript auth issue by using ready-made lib (#353)
1 parent c69f561 commit fb0570c

File tree

2 files changed

+14
-43
lines changed

2 files changed

+14
-43
lines changed

composer.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
"async-aws/bedrock-runtime": "^0.1.0",
4444
"doctrine/dbal": "^3.0 || ^4.0",
4545
"mongodb/mongodb": "^1.21 || ^2.0",
46+
"mrmysql/youtube-transcript": "^v0.0.5",
4647
"php-cs-fixer/shim": "^3.70",
4748
"phpstan/phpstan": "^2.0",
4849
"phpstan/phpstan-symfony": "^2.0",
@@ -66,9 +67,9 @@
6667
"codewithkyrian/transformers": "For using the TransformersPHP with FFI to run models in PHP.",
6768
"doctrine/dbal": "For using MariaDB via Doctrine as retrieval vector store",
6869
"mongodb/mongodb": "For using MongoDB Atlas as retrieval vector store.",
70+
"mrmysql/youtube-transcript": "For using the YouTube transcription tool.",
6971
"probots-io/pinecone-php": "For using the Pinecone as retrieval vector store.",
70-
"symfony/css-selector": "For using the YouTube transcription tool.",
71-
"symfony/dom-crawler": "For using the YouTube transcription tool."
72+
"symfony/dom-crawler": "For using the Crawler tool."
7273
},
7374
"config": {
7475
"allow-plugins": {

src/Chain/Toolbox/Tool/YouTubeTranscriber.php

Lines changed: 11 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,10 @@
44

55
namespace PhpLlm\LlmChain\Chain\Toolbox\Tool;
66

7+
use MrMySQL\YoutubeTranscript\TranscriptListFetcher;
78
use PhpLlm\LlmChain\Chain\Exception\LogicException;
8-
use PhpLlm\LlmChain\Chain\Exception\RuntimeException;
99
use PhpLlm\LlmChain\Chain\Toolbox\Attribute\AsTool;
10-
use Symfony\Component\CssSelector\CssSelectorConverter;
11-
use Symfony\Component\DomCrawler\Crawler;
10+
use Symfony\Component\HttpClient\Psr18Client;
1211
use Symfony\Contracts\HttpClient\HttpClientInterface;
1312

1413
/**
@@ -20,11 +19,8 @@
2019
public function __construct(
2120
private HttpClientInterface $client,
2221
) {
23-
if (!class_exists(Crawler::class)) {
24-
throw new LogicException('The Symfony DomCrawler component is required to use this tool. Try running "composer require symfony/dom-crawler".');
25-
}
26-
if (!class_exists(CssSelectorConverter::class)) {
27-
throw new LogicException('The Symfony CSS Selector component is required to use this tool. Try running "composer require symfony/css-selector".');
22+
if (!class_exists(TranscriptListFetcher::class)) {
23+
throw new LogicException('The package `mrmysql/youtube-transcript` is required to use this tool. Try running "composer require mrmysql/youtube-transcript".');
2824
}
2925
}
3026

@@ -33,40 +29,14 @@ public function __construct(
3329
*/
3430
public function __invoke(string $videoId): string
3531
{
36-
// Fetch the HTML content of the YouTube video page
37-
$htmlResponse = $this->client->request('GET', 'https://youtube.com/watch?v='.$videoId);
38-
$html = $htmlResponse->getContent();
39-
40-
// Use DomCrawler to parse the HTML
41-
$crawler = new Crawler($html);
42-
43-
// Extract the script containing the ytInitialPlayerResponse
44-
$scriptContent = $crawler->filter('script')->reduce(function (Crawler $node) {
45-
return str_contains($node->text(), 'var ytInitialPlayerResponse = {');
46-
})->text();
47-
48-
// Extract and parse the JSON data from the script
49-
$start = strpos($scriptContent, 'var ytInitialPlayerResponse = ') + \strlen('var ytInitialPlayerResponse = ');
50-
$dataString = substr($scriptContent, $start);
51-
$dataString = substr($dataString, 0, strrpos($dataString, ';') ?: null);
52-
$data = json_decode(trim($dataString), true);
53-
54-
// Extract the URL for the captions
55-
if (!isset($data['captions']['playerCaptionsTracklistRenderer']['captionTracks'][0]['baseUrl'])) {
56-
throw new RuntimeException('Captions are not available for this video.');
57-
}
58-
$captionsUrl = $data['captions']['playerCaptionsTracklistRenderer']['captionTracks'][0]['baseUrl'];
59-
60-
// Fetch and parse the captions XML
61-
$xmlResponse = $this->client->request('GET', $captionsUrl);
62-
$xmlContent = $xmlResponse->getContent();
63-
$xmlCrawler = new Crawler($xmlContent);
32+
$psr18Client = new Psr18Client($this->client);
33+
$fetcher = new TranscriptListFetcher($psr18Client, $psr18Client, $psr18Client);
6434

65-
// Collect all text elements from the captions
66-
$transcript = $xmlCrawler->filter('text')->each(function (Crawler $node) {
67-
return $node->text().' ';
68-
});
35+
$list = $fetcher->fetch($videoId);
36+
$transcript = $list->findTranscript($list->getAvailableLanguageCodes());
6937

70-
return implode(\PHP_EOL, $transcript);
38+
return array_reduce($transcript->fetch(), function (string $carry, array $item): string {
39+
return $carry.\PHP_EOL.$item['text'];
40+
}, '');
7141
}
7242
}

0 commit comments

Comments
 (0)