Skip to content

Commit

Permalink
Extract captions base_url using different method when missing
Browse files Browse the repository at this point in the history
The captions base URL is sometimes randomly missing.

Take one of the listed caption URLs, which already
has the &lang and automatic-caption specifiers, then remove
these specifiers.
  • Loading branch information
user234683 committed Mar 26, 2022
1 parent b1050e2 commit 21fda2d
Showing 1 changed file with 19 additions and 0 deletions.
19 changes: 19 additions & 0 deletions youtube/yt_data_extract/watch_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,25 @@ def extract_watch_info(polymer_json):
info['translation_languages'] = []
captions_info = player_response.get('captions', {})
info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
# Sometimes the above playerCaptionsRenderer is randomly missing
# Extract base_url from one of the captions by removing lang specifiers
if not info['_captions_base_url']:
base_url = normalize_url(deep_get(
captions_info,
'playerCaptionsTracklistRenderer',
'captionTracks',
0,
'baseUrl'
))
if base_url:
url_parts = urllib.parse.urlparse(base_url)
qs = urllib.parse.parse_qs(url_parts.query)
for key in ('tlang', 'lang', 'name', 'kind', 'fmt'):
if key in qs:
del qs[key]
base_url = urllib.parse.urlunparse(url_parts._replace(
query=urllib.parse.urlencode(qs, doseq=True)))
info['_captions_base_url'] = base_url
for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
lang_code = caption_track.get('languageCode')
if not lang_code:
Expand Down

0 comments on commit 21fda2d

Please sign in to comment.