From 35c1da749c0a9f4a17094afbf0c850e09934b21d Mon Sep 17 00:00:00 2001
From: David_JonesDVN <gmdavidjones@gmail.com>
Date: Thu, 11 May 2023 20:18:32 +0330
Subject: [PATCH 1/5] Add functional but preliminary channel tab support

Add Shorts and Streams tabs to the channel template and channel.py
Parameterize the v5 continuation token by tab (videos, shorts, streams)
Add support for the 'reelItemRenderer' format required to extract shorts
---
 youtube/channel.py                         | 30 +++++++++-----
 youtube/templates/channel.html             |  8 ++--
 youtube/yt_data_extract/common.py          | 47 ++++++++++++++++++++++
 youtube/yt_data_extract/everything_else.py |  2 +-
 4 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/youtube/channel.py b/youtube/channel.py
index 63f65df6..015a792b 100644
--- a/youtube/channel.py
+++ b/youtube/channel.py
@@ -32,16 +32,23 @@
 generic_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=ST1Ti53r4fU'),)
 
 # added an extra nesting under the 2nd base64 compared to v4
+# added tab support
 def channel_ctoken_v5(channel_id, page, sort, tab, view=1):
     new_sort = (2 if int(sort) == 1 else 1)
     offset = str(30*(int(page) - 1))
+    if tab == 'videos':
+        tab = 15
+    elif tab == 'shorts':
+        tab = 10
+    elif tab == 'streams':
+        tab = 14
     pointless_nest = proto.string(80226972,
         proto.string(2, channel_id)
         + proto.string(3,
             proto.percent_b64encode(
                 proto.string(110,
                     proto.string(3,
-                        proto.string(15,
+                        proto.string(tab,
                             proto.string(1,
                                 proto.string(1,
                                     proto.unpadded_b64encode(
@@ -198,7 +205,7 @@ def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1,
     message = 'Got channel tab' if print_status else None
 
     if not ctoken:
-        if tab == 'videos':
+        if tab in ('videos', 'shorts', 'streams'):
             ctoken = channel_ctoken_v5(channel_id, page, sort, tab, view)
         else:
             ctoken = channel_ctoken_v3(channel_id, page, sort, tab, view)
@@ -338,11 +345,11 @@ def post_process_channel_info(info):
                 info['links'][i] = (text, util.prefix_url(url))
 
 
-def get_channel_first_page(base_url=None, channel_id=None):
+def get_channel_first_page(base_url=None, channel_id=None, tab='videos'):
     if channel_id:
         base_url = 'https://www.youtube.com/channel/' + channel_id
-    return util.fetch_url(base_url + '/videos?pbj=1&view=0', headers_desktop,
-                          debug_name='gen_channel_videos')
+    return util.fetch_url(base_url + '/' + tab + '?pbj=1&view=0',
+                          headers_desktop, debug_name='gen_channel_' + tab)
 
 
 playlist_sort_codes = {'2': "da", '3': "dd", '4': "lad"}
@@ -361,24 +368,25 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
     default_params = (page_number == 1 and sort == '3' and view == '1')
     continuation = bool(ctoken) # whether or not we're using a continuation
 
-    if tab == 'videos' and channel_id and not default_params:
+    if (tab in ('videos', 'shorts', 'streams') and channel_id and
+        not default_params):
         tasks = (
             gevent.spawn(get_number_of_videos_channel, channel_id),
             gevent.spawn(get_channel_tab, channel_id, page_number, sort,
-                         'videos', view, ctoken)
+                         tab, view, ctoken)
         )
         gevent.joinall(tasks)
         util.check_gevent_exceptions(*tasks)
         number_of_videos, polymer_json = tasks[0].value, tasks[1].value
         continuation = True
-    elif tab == 'videos':
+    elif tab in ('videos', 'shorts', 'streams'):
         if channel_id:
             num_videos_call = (get_number_of_videos_channel, channel_id)
         else:
             num_videos_call = (get_number_of_videos_general, base_url)
         tasks = (
             gevent.spawn(*num_videos_call),
-            gevent.spawn(get_channel_first_page, base_url=base_url),
+            gevent.spawn(get_channel_first_page, base_url=base_url, tab=tab),
         )
         gevent.joinall(tasks)
         util.check_gevent_exceptions(*tasks)
@@ -429,11 +437,11 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
     if info['error'] is not None:
         return flask.render_template('error.html', error_message = info['error'])
 
-    if tab == 'videos':
+    if tab in ('videos', 'shorts', 'streams'):
         info['number_of_videos'] = number_of_videos
         info['number_of_pages'] = math.ceil(number_of_videos/30)
         info['header_playlist_names'] = local_playlist.get_playlist_names()
-    if tab in ('videos', 'playlists'):
+    if tab in ('videos', 'shorts', 'streams', 'playlists'):
         info['current_sort'] = sort
     elif tab == 'search':
         info['search_box_value'] = query
diff --git a/youtube/templates/channel.html b/youtube/templates/channel.html
index 48887500..fc35245d 100644
--- a/youtube/templates/channel.html
+++ b/youtube/templates/channel.html
@@ -120,7 +120,7 @@ <h2 class="title">{{ channel_name }}</h2>
         </div>
     </div>
     <nav class="channel-tabs">
-        {% for tab_name in ('Videos', 'Playlists', 'About') %}
+        {% for tab_name in ('Videos', 'Shorts', 'Streams', 'Playlists', 'About') %}
             {% if tab_name.lower() == current_tab %}
                 <a class="tab page-button">{{ tab_name }}</a>
             {% else %}
@@ -159,7 +159,7 @@ <h3>Description</h3>
     {% else %}
         <div class="content {{ current_tab + '-content'}}">
             <div id="links-metadata">
-                {% if current_tab == 'videos' %}
+                {% if current_tab in ('videos', 'shorts', 'streams') %}
                     {% set sorts = [('1', 'views'), ('2', 'oldest'), ('3', 'newest')] %}
                     <div id="number-of-results">{{ number_of_videos }} videos</div>
                 {% elif current_tab == 'playlists' %}
@@ -194,11 +194,11 @@ <h2 class="page-number">No results</h2>
                 {% endfor %}
             </nav>
 
-            {% if current_tab == 'videos' and current_sort.__str__() == '2' %}
+            {% if (current_tab in ('videos', 'shorts', 'streams')) and current_sort.__str__() == '2' %}
                 <nav class="next-previous-button-row">
                     {{ common_elements.next_previous_ctoken_buttons(None, ctoken, channel_url + '/' + current_tab, parameters_dictionary) }}
                 </nav>
-            {% elif current_tab == 'videos' %}
+            {% elif current_tab in ('videos', 'shorts', 'streams') %}
                 <nav class="page-button-row">
                     {{ common_elements.page_buttons(number_of_pages, channel_url + '/' + current_tab, parameters_dictionary, include_ends=(current_sort.__str__() == '3')) }}
                 </nav>
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 2f8396c0..874122ae 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -249,6 +249,9 @@ def extract_item_info(item, additional_info={}):
     primary_type = type_parts[-2]
     if primary_type == 'video':
         info['type'] = 'video'
+    elif type_parts[0] == 'reel': # shorts
+        info['type'] = 'video'
+        primary_type = 'short'
     elif primary_type in ('playlist', 'radio', 'show'):
         info['type'] = 'playlist'
         info['playlist_type'] = primary_type
@@ -343,6 +346,48 @@ def extract_item_info(item, additional_info={}):
         else:
             info['index'] = None
 
+    elif primary_type == 'short':
+        info['id'] = item.get('videoId')
+        if not info['id']:
+            info['id'] = deep_get(item,'navigationEndpoint',
+                                  'reelWatchEndpoint', 'videoId')
+        info['approx_view_count'] = extract_approx_int(item.get('viewCountText'))
+
+        # handle case where it is "No views"
+        if not info['approx_view_count']:
+            if ('No views' in item.get('shortViewCountText', '')
+                    or 'no views' in accessibility_label.lower()):
+                info['view_count'] = 0
+                info['approx_view_count'] = '0'
+
+        # dig into accessibility data to get duration for shorts
+        accessibility_label = multi_deep_get(item,
+            ['accessibility', 'accessibilityData', 'label'],
+            default='')
+
+        duration = re.search(r'(\d+) (second|seconds|minute) - play video',
+                             accessibility_label)
+        if duration.group(2) == 'minute':
+            info['duration'] = "1:00"
+        else:
+            info['duration'] = "0:" + duration.group(1).zfill(2)
+
+        # if it's an item in a playlist, get its index
+        if 'index' in item: # url has wrong index on playlist page
+            info['index'] = extract_int(item.get('index'))
+        elif 'indexText' in item:
+            # Current item in playlist has ▶ instead of the actual index, must
+            # dig into url
+            match = re.search(r'index=(\d+)', deep_get(item,
+                'navigationEndpoint', 'commandMetadata', 'webCommandMetadata',
+                'url', default=''))
+            if match is None:   # worth a try then
+                info['index'] = extract_int(item.get('indexText'))
+            else:
+                info['index'] = int(match.group(1))
+        else:
+            info['index'] = None
+
     elif primary_type in ('playlist', 'radio'):
         info['id'] = item.get('playlistId')
         info['video_count'] = extract_int(item.get('videoCount'))
@@ -398,6 +443,8 @@ def extract_response(polymer_json):
     'gridVideoRenderer',
     'playlistVideoRenderer',
 
+    'reelItemRenderer',
+
     'playlistRenderer',
     'compactPlaylistRenderer',
     'gridPlaylistRenderer',
diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py
index 9a6e31a8..745d08f0 100644
--- a/youtube/yt_data_extract/everything_else.py
+++ b/youtube/yt_data_extract/everything_else.py
@@ -73,7 +73,7 @@ def extract_channel_info(polymer_json, tab, continuation=False):
     #if 'contents' not in response and 'continuationContents' not in response:
     #    return info
 
-    if tab in ('videos', 'playlists', 'search'):
+    if tab in ('videos', 'shorts', 'streams', 'playlists', 'search'):
         items, ctoken = extract_items(response)
         additional_info = {
             'author': info['channel_name'],

From af499458b89bfc4c55dd2307f9305299d68e54f7 Mon Sep 17 00:00:00 2001
From: David_JonesDVN <gmdavidjones@gmail.com>
Date: Fri, 19 May 2023 19:53:09 +0330
Subject: [PATCH 2/5] Fix parsing shorts

Check that the duration regex matched before extracting a short's duration
Make short duration extraction stricter (anchor the regex to the label's end)
Fix handling of shorts with no views
---
 youtube/yt_data_extract/common.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 874122ae..934de5c6 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -355,8 +355,7 @@ def extract_item_info(item, additional_info={}):
 
         # handle case where it is "No views"
         if not info['approx_view_count']:
-            if ('No views' in item.get('shortViewCountText', '')
-                    or 'no views' in accessibility_label.lower()):
+            if ('No views' in extract_str(item.get('viewCountText', ''))):
                 info['view_count'] = 0
                 info['approx_view_count'] = '0'
 
@@ -364,13 +363,13 @@ def extract_item_info(item, additional_info={}):
         accessibility_label = multi_deep_get(item,
             ['accessibility', 'accessibilityData', 'label'],
             default='')
-
-        duration = re.search(r'(\d+) (second|seconds|minute) - play video',
+        duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
                              accessibility_label)
-        if duration.group(2) == 'minute':
-            info['duration'] = "1:00"
-        else:
-            info['duration'] = "0:" + duration.group(1).zfill(2)
+        if duration:
+            if duration.group(2) == 'minute':
+                info['duration'] = '1:00'
+            else:
+                info['duration'] = '0:' + duration.group(1).zfill(2)
 
         # if it's an item in a playlist, get its index
         if 'index' in item: # url has wrong index on playlist page

From 07c9bcfebb5e29efce25137eb29136a9028185b6 Mon Sep 17 00:00:00 2001
From: David_JonesDVN <gmdavidjones@gmail.com>
Date: Fri, 19 May 2023 20:14:22 +0330
Subject: [PATCH 3/5] Merge short and video parsing

---
 youtube/yt_data_extract/common.py | 65 ++++++++++++-------------------
 1 file changed, 24 insertions(+), 41 deletions(-)

diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 934de5c6..aa761f6e 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -251,7 +251,7 @@ def extract_item_info(item, additional_info={}):
         info['type'] = 'video'
     elif type_parts[0] == 'reel': # shorts
         info['type'] = 'video'
-        primary_type = 'short'
+        primary_type = 'video'
     elif primary_type in ('playlist', 'radio', 'show'):
         info['type'] = 'playlist'
         info['playlist_type'] = primary_type
@@ -330,46 +330,29 @@ def extract_item_info(item, additional_info={}):
 
         info['duration'] = extract_str(item.get('lengthText'))
 
-        # if it's an item in a playlist, get its index
-        if 'index' in item: # url has wrong index on playlist page
-            info['index'] = extract_int(item.get('index'))
-        elif 'indexText' in item:
-            # Current item in playlist has ▶ instead of the actual index, must
-            # dig into url
-            match = re.search(r'index=(\d+)', deep_get(item,
-                'navigationEndpoint', 'commandMetadata', 'webCommandMetadata',
-                'url', default=''))
-            if match is None:   # worth a try then
-                info['index'] = extract_int(item.get('indexText'))
-            else:
-                info['index'] = int(match.group(1))
-        else:
-            info['index'] = None
-
-    elif primary_type == 'short':
-        info['id'] = item.get('videoId')
-        if not info['id']:
-            info['id'] = deep_get(item,'navigationEndpoint',
-                                  'reelWatchEndpoint', 'videoId')
-        info['approx_view_count'] = extract_approx_int(item.get('viewCountText'))
-
-        # handle case where it is "No views"
-        if not info['approx_view_count']:
-            if ('No views' in extract_str(item.get('viewCountText', ''))):
-                info['view_count'] = 0
-                info['approx_view_count'] = '0'
-
-        # dig into accessibility data to get duration for shorts
-        accessibility_label = multi_deep_get(item,
-            ['accessibility', 'accessibilityData', 'label'],
-            default='')
-        duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
-                             accessibility_label)
-        if duration:
-            if duration.group(2) == 'minute':
-                info['duration'] = '1:00'
-            else:
-                info['duration'] = '0:' + duration.group(1).zfill(2)
+        if info['duration'] is None: # shorts
+            if not info['id']:
+                info['id'] = deep_get(item,'navigationEndpoint',
+                                    'reelWatchEndpoint', 'videoId')
+            info['approx_view_count'] = extract_approx_int(item.get('viewCountText'))
+
+            # handle case where it is "No views"
+            if not info['approx_view_count']:
+                if ('No views' in extract_str(item.get('viewCountText', ''))):
+                    info['view_count'] = 0
+                    info['approx_view_count'] = '0'
+
+            # dig into accessibility data to get duration for shorts
+            accessibility_label = multi_deep_get(item,
+                ['accessibility', 'accessibilityData', 'label'],
+                default='')
+            duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
+                                accessibility_label)
+            if duration:
+                if duration.group(2) == 'minute':
+                    info['duration'] = '1:00'
+                else:
+                    info['duration'] = '0:' + duration.group(1).zfill(2)
 
         # if it's an item in a playlist, get its index
         if 'index' in item: # url has wrong index on playlist page

From 77956a5d045b7fdfa74dc243cf8210d1e590dc81 Mon Sep 17 00:00:00 2001
From: David_JonesDVN <gmdavidjones@gmail.com>
Date: Fri, 19 May 2023 23:30:43 +0330
Subject: [PATCH 4/5] Merge short and video parsing even further

Use multi_get and multi_deep_get to handle key differences between video
and short renderers
Replace the duration check with conservative_update
---
 youtube/yt_data_extract/common.py | 53 ++++++++++++++-----------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index aa761f6e..4845406d 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -298,10 +298,11 @@ def extract_item_info(item, additional_info={}):
             info['time_published'] = timestamp.group(1)
 
     if primary_type == 'video':
-        info['id'] = item.get('videoId')
-        if not info['id']:
-            info['id'] = deep_get(item,'navigationEndpoint', 'watchEndpoint',
-                                  'videoId')
+        info['id'] = multi_deep_get(item,
+            ['videoId'],
+            ['navigationEndpoint', 'watchEndpoint', 'videoId'],
+            ['navigationEndpoint', 'reelWatchEndpoint', 'videoId'], # shorts
+            )
         info['view_count'] = extract_int(item.get('viewCountText'))
 
         # dig into accessibility data to get view_count for videos marked as recommended, and to get time_published
@@ -319,40 +320,34 @@ def extract_item_info(item, additional_info={}):
         if info['view_count']:
             info['approx_view_count'] = '{:,}'.format(info['view_count'])
         else:
-            info['approx_view_count'] = extract_approx_int(item.get('shortViewCountText'))
+            info['approx_view_count'] = extract_approx_int(multi_get(item,
+                'shortViewCountText',
+                'viewCountText') # shorts
+                )
 
         # handle case where it is "No views"
         if not info['approx_view_count']:
             if ('No views' in item.get('shortViewCountText', '')
-                    or 'no views' in accessibility_label.lower()):
+                    or 'no views' in accessibility_label.lower()
+                    or 'No views' in extract_str(item.get('viewCountText', '')) # shorts
+                    ):
                 info['view_count'] = 0
                 info['approx_view_count'] = '0'
 
         info['duration'] = extract_str(item.get('lengthText'))
 
-        if info['duration'] is None: # shorts
-            if not info['id']:
-                info['id'] = deep_get(item,'navigationEndpoint',
-                                    'reelWatchEndpoint', 'videoId')
-            info['approx_view_count'] = extract_approx_int(item.get('viewCountText'))
-
-            # handle case where it is "No views"
-            if not info['approx_view_count']:
-                if ('No views' in extract_str(item.get('viewCountText', ''))):
-                    info['view_count'] = 0
-                    info['approx_view_count'] = '0'
-
-            # dig into accessibility data to get duration for shorts
-            accessibility_label = multi_deep_get(item,
-                ['accessibility', 'accessibilityData', 'label'],
-                default='')
-            duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
-                                accessibility_label)
-            if duration:
-                if duration.group(2) == 'minute':
-                    info['duration'] = '1:00'
-                else:
-                    info['duration'] = '0:' + duration.group(1).zfill(2)
+        # dig into accessibility data to get duration for shorts
+        accessibility_label = deep_get(item,
+            'accessibility', 'accessibilityData', 'label',
+            default='')
+        duration = re.search(r'(\d+) (second|seconds|minute) - play video$',
+                            accessibility_label)
+        if duration:
+            if duration.group(2) == 'minute':
+                conservative_update(info, 'duration', '1:00')
+            else:
+                conservative_update(info,
+                    'duration', '0:' + duration.group(1).zfill(2))
 
         # if it's an item in a playlist, get its index
         if 'index' in item: # url has wrong index on playlist page

From 20868b4cc2cc655565ef0dd114a8cc948076cddd Mon Sep 17 00:00:00 2001
From: David_JonesDVN <gmdavidjones@gmail.com>
Date: Sat, 20 May 2023 15:24:17 +0330
Subject: [PATCH 5/5] Fix minor formatting issues

---
 youtube/yt_data_extract/common.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 4845406d..9bc8eee5 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -301,8 +301,8 @@ def extract_item_info(item, additional_info={}):
         info['id'] = multi_deep_get(item,
             ['videoId'],
             ['navigationEndpoint', 'watchEndpoint', 'videoId'],
-            ['navigationEndpoint', 'reelWatchEndpoint', 'videoId'], # shorts
-            )
+            ['navigationEndpoint', 'reelWatchEndpoint', 'videoId'] # shorts
+        )
         info['view_count'] = extract_int(item.get('viewCountText'))
 
         # dig into accessibility data to get view_count for videos marked as recommended, and to get time_published
@@ -322,15 +322,15 @@ def extract_item_info(item, additional_info={}):
         else:
             info['approx_view_count'] = extract_approx_int(multi_get(item,
                 'shortViewCountText',
-                'viewCountText') # shorts
-                )
+                'viewCountText' # shorts
+            ))
 
         # handle case where it is "No views"
         if not info['approx_view_count']:
             if ('No views' in item.get('shortViewCountText', '')
                     or 'no views' in accessibility_label.lower()
                     or 'No views' in extract_str(item.get('viewCountText', '')) # shorts
-                    ):
+            ):
                 info['view_count'] = 0
                 info['approx_view_count'] = '0'