From 8b7a0fa375130d2cc89d1d513593c201d1d91a98 Mon Sep 17 00:00:00 2001 From: cheskel Date: Fri, 14 Jul 2023 02:51:22 +0300 Subject: [PATCH 1/3] fix sort by --- scrapetube/scrapetube.py | 41 ++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/scrapetube/scrapetube.py b/scrapetube/scrapetube.py index f911ddb..3f5b5f9 100644 --- a/scrapetube/scrapetube.py +++ b/scrapetube/scrapetube.py @@ -58,8 +58,6 @@ def get_channel( ``"streams"``: Streams """ - sort_by_map = {"newest": "dd", "oldest": "da", "popular": "p"} - base_url = "" if channel_url: base_url = channel_url @@ -68,13 +66,12 @@ def get_channel( elif channel_username: base_url = f"https://www.youtube.com/@{channel_username}" - url = "{base_url}/{content_type}?view=0&sort={sort_by}&flow=grid".format( + url = "{base_url}/{content_type}?view=0&flow=grid".format( base_url=base_url, content_type=content_type, - sort_by=sort_by_map[sort_by], ) api_endpoint = "https://www.youtube.com/youtubei/v1/browse" - videos = get_videos(url, api_endpoint, type_property_map[content_type], limit, sleep) + videos = get_videos(url, api_endpoint, type_property_map[content_type], limit, sleep, sort_by) for video in videos: yield video @@ -163,14 +160,14 @@ def get_search( def get_videos( - url: str, api_endpoint: str, selector: str, limit: int, sleep: int + url: str, api_endpoint: str, selector: str, limit: int, sleep: int, sort_by: str = None ) -> Generator[dict, None, None]: session = requests.Session() session.headers[ "User-Agent" ] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36" is_first = True - quit = False + quit_it = False count = 0 while True: if is_first: @@ -184,8 +181,10 @@ def get_videos( data = json.loads( get_json_from_html(html, "var ytInitialData = ", 0, "};") + "}" ) - next_data = get_next_data(data) + next_data = get_next_data(data, sort_by) is_first = False + if sort_by and sort_by != "newest": + continue else: data = get_ajax_data(session, api_endpoint, api_key, next_data, client) next_data = get_next_data(data) @@ -194,13 +193,13 @@ def get_videos( count += 1 yield result if count == limit: - quit = True + quit_it = True break except GeneratorExit: - quit = True + quit_it = True break - if not next_data or quit: + if not next_data or quit_it: break time.sleep(sleep) @@ -237,13 +236,23 @@ def get_json_from_html(html: str, key: str, num_chars: int = 2, stop: str = '"') return html[pos_begin:pos_end] -def get_next_data(data: dict) -> dict: - raw_next_data = next(search_dict(data, "continuationEndpoint"), None) - if not raw_next_data: +def get_next_data(data: dict, sort_by: str = None) -> dict: + # Youtube, please don't change the order of these + sort_by_map = { + "newest": 0, + "popular": 1, + "oldest": 2, + } + if sort_by and sort_by != "newest": + endpoint = next( + search_dict(data, "feedFilterChipBarRenderer"), None)["contents"][sort_by_map[sort_by]]["chipCloudChipRenderer"]["navigationEndpoint"] + else: + endpoint = next(search_dict(data, "continuationEndpoint"), None) + if not endpoint: return None next_data = { - "token": raw_next_data["continuationCommand"]["token"], - "click_params": {"clickTrackingParams": raw_next_data["clickTrackingParams"]}, + "token": endpoint["continuationCommand"]["token"], + "click_params": {"clickTrackingParams": endpoint["clickTrackingParams"]}, } return next_data From 170249883601165bda1cf9a2b3004421dd31d329 Mon Sep 17 00:00:00 2001 From: cheskel Date: Fri, 14 Jul 2023 02:55:01 +0300 Subject: [PATCH 2/3] fix for #10 --- scrapetube/scrapetube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrapetube/scrapetube.py b/scrapetube/scrapetube.py index 3f5b5f9..c2cb06f 100644 --- a/scrapetube/scrapetube.py +++ b/scrapetube/scrapetube.py @@ -165,7 +165,8 @@ def get_videos( session = requests.Session() session.headers[ "User-Agent" - ] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36" + ] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" + session.headers["Accept-Language"] = "en" is_first = True quit_it = False count = 0 From 8dc8967c28c946846d0fcf59e6302ff5c35f4c7a Mon Sep 17 00:00:00 2001 From: cheskel Date: Fri, 14 Jul 2023 03:11:44 +0300 Subject: [PATCH 3/3] add get_video func, closes #15 --- scrapetube/scrapetube.py | 41 +++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/scrapetube/scrapetube.py b/scrapetube/scrapetube.py index c2cb06f..0101c6d 100644 --- a/scrapetube/scrapetube.py +++ b/scrapetube/scrapetube.py @@ -159,14 +159,37 @@ def get_search( yield video + +def get_video( + id: str, +) -> dict: + + """Get a single video. + + Parameters: + id (``str``): + The video id from the video you want to get. + """ + + session = get_session() + url = f"https://www.youtube.com/watch?v={id}" + html = get_initial_data(session, url) + client = json.loads( + get_json_from_html(html, "INNERTUBE_CONTEXT", 2, '"}},') + '"}}' + )["client"] + session.headers["X-YouTube-Client-Name"] = "1" + session.headers["X-YouTube-Client-Version"] = client["clientVersion"] + data = json.loads( + get_json_from_html(html, "var ytInitialData = ", 0, "};") + "}" + ) + return next(search_dict(data, "videoPrimaryInfoRenderer")) + + + def get_videos( url: str, api_endpoint: str, selector: str, limit: int, sleep: int, sort_by: str = None ) -> Generator[dict, None, None]: - session = requests.Session() - session.headers[ - "User-Agent" - ] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" - session.headers["Accept-Language"] = "en" + session = get_session() is_first = True quit_it = False count = 0 @@ -208,6 +231,14 @@ def get_videos( session.close() +def get_session() -> requests.Session: + session = requests.Session() + session.headers[ + "User-Agent" + ] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" + session.headers["Accept-Language"] = "en" + return session + def get_initial_data(session: requests.Session, url: str) -> str: session.cookies.set("CONSENT", "YES+cb", domain=".youtube.com") response = session.get(url)