44
55namespace App \YouTube ;
66
7- use Symfony \Component \DomCrawler \Crawler ;
7+ use MrMySQL \YoutubeTranscript \TranscriptListFetcher ;
8+ use Symfony \Component \HttpClient \Psr18Client ;
89use Symfony \Contracts \HttpClient \HttpClientInterface ;
910
1011final class TranscriptFetcher
@@ -16,41 +17,14 @@ public function __construct(
1617
/**
 * Fetches the transcript of a YouTube video as newline-separated text.
 *
 * Wraps the injected HTTP client in a PSR-18 adapter, asks the transcript
 * library for the transcripts available for the video, picks one among all
 * language codes the list reports, and joins the 'text' entry of each
 * fetched segment with PHP_EOL.
 *
 * @param string $videoId video identifier, passed unchanged to the transcript list fetcher
 *
 * @return string transcript segments joined by PHP_EOL (no leading or trailing
 *                separator); an empty string when no segments are returned
 */
public function fetchTranscript(string $videoId): string
{
    // The same adapter instance is supplied for all three constructor arguments.
    // NOTE(review): presumably these are the PSR-18 client and the PSR-17
    // request/stream factories - confirm against the library's constructor.
    $psr18Client = new Psr18Client($this->client);
    $fetcher = new TranscriptListFetcher($psr18Client, $psr18Client, $psr18Client);

    $list = $fetcher->fetch($videoId);

    // No language preference is expressed: every available language code is offered.
    // NOTE(review): which transcript wins (and what happens when none exist) is
    // decided by the library - verify if a specific language is required.
    $transcript = $list->findTranscript($list->getAvailableLanguageCodes());

    // Extract the text of each segment, then join with implode() so the
    // separator only appears BETWEEN segments. The previous array_reduce
    // prepended PHP_EOL to every segment, yielding a spurious leading newline.
    $lines = array_map(
        static fn (array $item): string => (string) $item['text'],
        $transcript->fetch()
    );

    return implode(\PHP_EOL, $lines);
}
5630}
0 commit comments