From d9f73622a5b928473a9a5d6013bb8f45309fcdf7 Mon Sep 17 00:00:00 2001 From: simon Date: Fri, 6 Jan 2023 15:37:55 +0700 Subject: [PATCH] refactor UrlParser into own module, rename enum fields --- tubearchivist/api/views.py | 4 +- tubearchivist/home/src/download/queue.py | 11 +- tubearchivist/home/src/frontend/api_calls.py | 4 +- tubearchivist/home/src/frontend/watched.py | 4 +- tubearchivist/home/src/ta/helper.py | 102 -------------- tubearchivist/home/src/ta/urlparser.py | 133 +++++++++++++++++++ tubearchivist/home/tasks.py | 11 +- tubearchivist/home/views.py | 5 +- 8 files changed, 157 insertions(+), 117 deletions(-) create mode 100644 tubearchivist/home/src/ta/urlparser.py diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py index 12b2e196..7146de72 100644 --- a/tubearchivist/api/views.py +++ b/tubearchivist/api/views.py @@ -13,8 +13,8 @@ from home.src.index.generic import Pagination from home.src.index.reindex import ReindexProgress from home.src.index.video import SponsorBlock, YoutubeVideo from home.src.ta.config import AppConfig -from home.src.ta.helper import UrlListParser from home.src.ta.ta_redis import RedisArchivist, RedisQueue +from home.src.ta.urlparser import Parser from home.tasks import check_reindex, download_single, extrac_dl, subscribe_to from rest_framework.authentication import ( SessionAuthentication, @@ -484,7 +484,7 @@ class DownloadApiListView(ApiBaseView): pending = [i["youtube_id"] for i in to_add if i["status"] == "pending"] url_str = " ".join(pending) try: - youtube_ids = UrlListParser(url_str).process_list() + youtube_ids = Parser(url_str).parse() except ValueError: message = f"failed to parse: {url_str}" print(message) diff --git a/tubearchivist/home/src/download/queue.py b/tubearchivist/home/src/download/queue.py index b571053d..c3571623 100644 --- a/tubearchivist/home/src/download/queue.py +++ b/tubearchivist/home/src/download/queue.py @@ -163,7 +163,7 @@ class PendingList(PendingIndex): def _process_entry(self, entry): """process single entry from url list""" if entry["type"] == "video": - vid_type = entry.get("vid_type", VideoTypeEnum.VIDEOS) + vid_type = self._get_vid_type(entry) self._add_video(entry["url"], vid_type) elif entry["type"] == "channel": self._parse_channel(entry["url"]) @@ -173,6 +173,15 @@ class PendingList(PendingIndex): else: raise ValueError(f"invalid url_type: {entry}") + @staticmethod + def _get_vid_type(entry): + """add vid type enum if available""" + vid_type_str = entry.get("vid_type") + if not vid_type_str: + return VideoTypeEnum.VIDEOS + + return VideoTypeEnum(vid_type_str) + def _add_video(self, url, vid_type=VideoTypeEnum.VIDEOS): """add video to list""" if url not in self.missing_videos and url not in self.to_skip: diff --git a/tubearchivist/home/src/frontend/api_calls.py b/tubearchivist/home/src/frontend/api_calls.py index 524f55d6..a7d38907 100644 --- a/tubearchivist/home/src/frontend/api_calls.py +++ b/tubearchivist/home/src/frontend/api_calls.py @@ -9,8 +9,8 @@ from home.src.download.subscriptions import ( PlaylistSubscription, ) from home.src.index.playlist import YoutubePlaylist -from home.src.ta.helper import UrlListParser from home.src.ta.ta_redis import RedisArchivist, RedisQueue +from home.src.ta.urlparser import Parser from home.tasks import ( download_pending, index_channel_playlists, @@ -123,7 +123,7 @@ class PostData: """unsubscribe from channels or playlists""" id_unsub = self.exec_val print(f"{id_unsub}: unsubscribe") - to_unsub_list = UrlListParser(id_unsub).process_list() + to_unsub_list = Parser(id_unsub).parse() for to_unsub in to_unsub_list: unsub_type = to_unsub["type"] unsub_id = to_unsub["url"] diff --git a/tubearchivist/home/src/frontend/watched.py b/tubearchivist/home/src/frontend/watched.py index 8978b961..2ce80180 100644 --- a/tubearchivist/home/src/frontend/watched.py +++ b/tubearchivist/home/src/frontend/watched.py @@ -6,7 +6,7 @@ functionality: from datetime import datetime from home.src.es.connect import ElasticWrap -from home.src.ta.helper import UrlListParser +from home.src.ta.urlparser import Parser class WatchState: @@ -34,7 +34,7 @@ class WatchState: def _dedect_type(self): """find youtube id type""" print(self.youtube_id) - url_process = UrlListParser(self.youtube_id).process_list() + url_process = Parser(self.youtube_id).parse() url_type = url_process[0]["type"] return url_type diff --git a/tubearchivist/home/src/ta/helper.py b/tubearchivist/home/src/ta/helper.py index 45e4190f..51bbf428 100644 --- a/tubearchivist/home/src/ta/helper.py +++ b/tubearchivist/home/src/ta/helper.py @@ -11,10 +11,8 @@ import string import subprocess import unicodedata from datetime import datetime -from urllib.parse import parse_qs, urlparse import requests -from home.src.download.yt_dlp_base import YtWrap def clean_string(file_name): @@ -147,106 +145,6 @@ def is_short(youtube_id): return response.status_code == 200 -class UrlListParser: - """take a multi line string and detect valid youtube ids""" - - def __init__(self, url_str): - self.url_list = [i.strip() for i in url_str.split()] - - def process_list(self): - """loop through the list""" - youtube_ids = [] - for url in self.url_list: - parsed = urlparse(url) - print(f"processing: {url}") - print(parsed) - if not parsed.netloc: - # is not a url - id_type = self.find_valid_id(url) - youtube_id = url - elif "youtube.com" not in url and "youtu.be" not in url: - raise ValueError(f"{url} is not a youtube link") - elif parsed.path: - # is a url - youtube_id, id_type = self.detect_from_url(parsed) - else: - # not detected - raise ValueError(f"failed to detect {url}") - - youtube_ids.append({"url": youtube_id, "type": id_type}) - - return youtube_ids - - def detect_from_url(self, parsed): - """detect from parsed url""" - if parsed.netloc == "youtu.be": - # shortened - youtube_id = parsed.path.strip("/") - _ = self.find_valid_id(youtube_id) - return youtube_id, "video" - - if parsed.query: - # detect from query string - query_parsed = parse_qs(parsed.query) - if "v" in query_parsed: - youtube_id = query_parsed["v"][0] - _ = self.find_valid_id(youtube_id) - return youtube_id, "video" - - if "list" in query_parsed: - youtube_id = query_parsed["list"][0] - return youtube_id, "playlist" - - if parsed.path.startswith("/channel/"): - # channel id in url - youtube_id = parsed.path.split("/")[2] - _ = self.find_valid_id(youtube_id) - return youtube_id, "channel" - - # detect channel with yt_dlp - youtube_id = self.extract_channel_name(parsed.geturl()) - return youtube_id, "channel" - - @staticmethod - def find_valid_id(id_str): - """detect valid id from length of string""" - str_len = len(id_str) - if str_len == 11: - id_type = "video" - elif str_len == 24: - id_type = "channel" - elif str_len in [34, 18] or id_str in ["LL", "WL"]: - id_type = "playlist" - else: - # unable to parse - raise ValueError("not a valid id_str: " + id_str) - - return id_type - - @staticmethod - def extract_channel_name(url): - """find channel id from channel name with yt-dlp help""" - obs_request = { - "skip_download": True, - "extract_flat": True, - "playlistend": 0, - } - url_info = YtWrap(obs_request).extract(url) - channel_id = url_info.get("channel_id", False) - if channel_id: - return channel_id - - url = url_info.get("url", False) - if url: - # handle old channel name redirect with url path split - channel_id = urlparse(url).path.strip("/").split("/")[1] - - return channel_id - - print(f"failed to extract channel id from {url}") - raise ValueError - - class DurationConverter: """ using ffmpeg to get and parse duration from filepath diff --git a/tubearchivist/home/src/ta/urlparser.py b/tubearchivist/home/src/ta/urlparser.py new file mode 100644 index 00000000..32c6030a --- /dev/null +++ b/tubearchivist/home/src/ta/urlparser.py @@ -0,0 +1,133 @@ +""" +Functionality: +- detect valid youtube ids and links from multi line string +- identify vid_type if possible +""" + +from urllib.parse import parse_qs, urlparse + +from home.src.download.yt_dlp_base import YtWrap +from home.src.index.video_constants import VideoTypeEnum + + +class Parser: + """take a multi line string and detect valid youtube ids""" + + def __init__(self, url_str): + self.url_list = [i.strip() for i in url_str.split()] + + def parse(self): + """parse the list""" + ids = [] + for url in self.url_list: + parsed = urlparse(url) + if parsed.netloc: + # is url + identified = self.process_url(parsed) + else: + # is not url + identified = self._find_valid_id(url) + + if "vid_type" not in identified: + identified.update(self._detect_vid_type(parsed.path)) + + ids.append(identified) + + return ids + + def process_url(self, parsed): + """process as url""" + if parsed.netloc == "youtu.be": + # shortened + youtube_id = parsed.path.strip("/") + return self._validate_expected(youtube_id, "video") + + query_parsed = parse_qs(parsed.query) + if "v" in query_parsed: + # video from v query str + youtube_id = query_parsed["v"][0] + return self._validate_expected(youtube_id, "video") + + if "list" in query_parsed: + # playlist from list query str + youtube_id = query_parsed["list"][0] + return self._validate_expected(youtube_id, "playlist") + + all_paths = parsed.path.strip("/").split("/") + if all_paths[0] == "shorts": + # is shorts video + item = self._validate_expected(all_paths[1], "video") + item.update({"vid_type": VideoTypeEnum.SHORTS.value}) + return item + + if all_paths[0] == "channel": + return self._validate_expected(all_paths[1], "channel") + + # detect channel + channel_id = self._extract_channel_name(parsed.geturl()) + return {"type": "channel", "url": channel_id} + + def _validate_expected(self, youtube_id, expected_type): + """raise value error if not matching""" + matched = self._find_valid_id(youtube_id) + if matched["type"] != expected_type: + raise ValueError( + f"{youtube_id} not of expected type {expected_type}" + ) + + return {"type": expected_type, "url": youtube_id} + + def _find_valid_id(self, id_str): + """detect valid id from length of string""" + if id_str in ("LL", "WL"): + return {"type": "playlist", "url": id_str} + + if id_str.startswith("@"): + url = f"https://www.youtube.com/{id_str}" + channel_id = self._extract_channel_name(url) + return {"type": "channel", "url": channel_id} + + len_id_str = len(id_str) + if len_id_str == 11: + item_type = "video" + elif len_id_str == 24: + item_type = "channel" + elif len_id_str in (34, 18): + item_type = "playlist" + else: + raise ValueError(f"not a valid id_str: {id_str}") + + return {"type": item_type, "url": id_str} + + @staticmethod + def _extract_channel_name(url): + """find channel id from channel name with yt-dlp help""" + obs_request = { + "skip_download": True, + "extract_flat": True, + "playlistend": 0, + } + url_info = YtWrap(obs_request).extract(url) + channel_id = url_info.get("channel_id", False) + if channel_id: + return channel_id + + url = url_info.get("url", False) + if url: + # handle old channel name redirect with url path split + channel_id = urlparse(url).path.strip("/").split("/")[1] + + return channel_id + + print(f"failed to extract channel id from {url}") + raise ValueError + + def _detect_vid_type(self, path): + """try to match enum from path, needs to be serializable""" + last = path.strip("/").split("/")[-1] + try: + vid_type = VideoTypeEnum(last).value + except ValueError: + vid_type = VideoTypeEnum.UNKNOWN.value + + return {"vid_type": vid_type} diff --git a/tubearchivist/home/tasks.py b/tubearchivist/home/tasks.py index dd940e0a..d5138368 100644 --- a/tubearchivist/home/tasks.py +++ b/tubearchivist/home/tasks.py @@ -25,8 +25,9 @@ from home.src.index.filesystem import ImportFolderScanner, scan_filesystem from home.src.index.reindex import Reindex, ReindexManual, ReindexOutdated from home.src.index.video_constants import VideoTypeEnum from home.src.ta.config import AppConfig, ReleaseVersion, ScheduleBuilder -from home.src.ta.helper import UrlListParser, clear_dl_cache +from home.src.ta.helper import clear_dl_cache from home.src.ta.ta_redis import RedisArchivist, RedisQueue +from home.src.ta.urlparser import Parser CONFIG = AppConfig().config REDIS_HOST = os.environ.get("REDIS_HOST") @@ -261,9 +262,8 @@ def re_sync_thumbs(): @shared_task def subscribe_to(url_str): """take a list of urls to subscribe to""" - to_subscribe_list = UrlListParser(url_str).process_list() - counter = 1 - for item in to_subscribe_list: + to_subscribe_list = Parser(url_str).parse() + for idx, item in enumerate(to_subscribe_list): to_sub_id = item["url"] if item["type"] == "playlist": PlaylistSubscription().process_url_str([item]) @@ -286,10 +286,9 @@ def subscribe_to(url_str): "status": key, "level": "info", "title": "Subscribing to Channels", - "message": f"Processing {counter} of {len(to_subscribe_list)}", + "message": f"Processing {idx + 1} of {len(to_subscribe_list)}", } RedisArchivist().set_message(key, message=message, expire=True) - counter = counter + 1 @shared_task diff --git a/tubearchivist/home/views.py b/tubearchivist/home/views.py index 20647aa3..a4fc8fdc 100644 --- a/tubearchivist/home/views.py +++ b/tubearchivist/home/views.py @@ -38,8 +38,9 @@ from home.src.index.playlist import YoutubePlaylist from home.src.index.reindex import ReindexProgress from home.src.index.video_constants import VideoTypeEnum from home.src.ta.config import AppConfig, ReleaseVersion, ScheduleBuilder -from home.src.ta.helper import UrlListParser, time_parser +from home.src.ta.helper import time_parser from home.src.ta.ta_redis import RedisArchivist +from home.src.ta.urlparser import Parser from home.tasks import extrac_dl, index_channel_playlists, subscribe_to from rest_framework.authtoken.models import Token @@ -456,7 +457,7 @@ class DownloadView(ArchivistResultsView): url_str = request.POST.get("vid_url") print(url_str) try: - youtube_ids = UrlListParser(url_str).process_list() + youtube_ids = Parser(url_str).parse() except ValueError: # failed to process key = "message:add"