Skip to content

Commit

Permalink
Merge short and video parsing even further
Browse files (browse the repository at this point in the history)
Use multi_get and multi_deep_get for tag differences
Replace the duration check with conservative_update
  • Loading branch information
David-JonesDVN committed May 19, 2023
1 parent 07c9bcf commit 77956a5
Showing 1 changed file with 24 additions and 29 deletions.
53 changes: 24 additions & 29 deletions youtube/yt_data_extract/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,10 +298,11 @@ def extract_item_info(item, additional_info={}):
info['time_published'] = timestamp.group(1)

if primary_type == 'video':
info['id'] = item.get('videoId')
if not info['id']:
info['id'] = deep_get(item,'navigationEndpoint', 'watchEndpoint',
'videoId')
info['id'] = multi_deep_get(item,
['videoId'],
['navigationEndpoint', 'watchEndpoint', 'videoId'],
['navigationEndpoint', 'reelWatchEndpoint', 'videoId'], # shorts
)
info['view_count'] = extract_int(item.get('viewCountText'))

# dig into accessibility data to get view_count for videos marked as recommended, and to get time_published
Expand All @@ -319,40 +320,34 @@ def extract_item_info(item, additional_info={}):
if info['view_count']:
info['approx_view_count'] = '{:,}'.format(info['view_count'])
else:
info['approx_view_count'] = extract_approx_int(item.get('shortViewCountText'))
info['approx_view_count'] = extract_approx_int(multi_get(item,
'shortViewCountText',
'viewCountText') # shorts
)

# handle case where it is "No views"
if not info['approx_view_count']:
if ('No views' in item.get('shortViewCountText', '')
or 'no views' in accessibility_label.lower()):
or 'no views' in accessibility_label.lower()
or 'No views' in extract_str(item.get('viewCountText', '')) # shorts
):
info['view_count'] = 0
info['approx_view_count'] = '0'

info['duration'] = extract_str(item.get('lengthText'))

if info['duration'] is None: # shorts
if not info['id']:
info['id'] = deep_get(item,'navigationEndpoint',
'reelWatchEndpoint', 'videoId')
info['approx_view_count'] = extract_approx_int(item.get('viewCountText'))

# handle case where it is "No views"
if not info['approx_view_count']:
if ('No views' in extract_str(item.get('viewCountText', ''))):
info['view_count'] = 0
info['approx_view_count'] = '0'

# dig into accessibility data to get duration for shorts
accessibility_label = multi_deep_get(item,
['accessibility', 'accessibilityData', 'label'],
default='')
duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
accessibility_label)
if duration:
if duration.group(2) == 'minute':
info['duration'] = '1:00'
else:
info['duration'] = '0:' + duration.group(1).zfill(2)
# dig into accessibility data to get duration for shorts
accessibility_label = deep_get(item,
'accessibility', 'accessibilityData', 'label',
default='')
duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
accessibility_label)
if duration:
if duration.group(2) == 'minute':
conservative_update(info, 'duration', '1:00')
else:
conservative_update(info,
'duration', '0:' + duration.group(1).zfill(2))

# if it's an item in a playlist, get its index
if 'index' in item: # url has wrong index on playlist page
Expand Down

0 comments on commit 77956a5

Please sign in to comment.