From 474ab22792e0db75458ab6a990463292c0510bdf Mon Sep 17 00:00:00 2001 From: simon Date: Fri, 12 Aug 2022 11:53:31 +0700 Subject: [PATCH 01/15] handle channel name redirect in UrlListParser, #276 --- tubearchivist/home/src/ta/helper.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tubearchivist/home/src/ta/helper.py b/tubearchivist/home/src/ta/helper.py index bc943ecc..314ecd79 100644 --- a/tubearchivist/home/src/ta/helper.py +++ b/tubearchivist/home/src/ta/helper.py @@ -158,12 +158,12 @@ class UrlListParser: if parsed.query: # detect from query string query_parsed = parse_qs(parsed.query) - if "v" in query_parsed.keys(): + if "v" in query_parsed: youtube_id = query_parsed["v"][0] _ = self.find_valid_id(youtube_id) return youtube_id, "video" - if "list" in query_parsed.keys(): + if "list" in query_parsed: youtube_id = query_parsed["list"][0] return youtube_id, "playlist" @@ -202,13 +202,19 @@ class UrlListParser: "playlistend": 0, } url_info = YtWrap(obs_request).extract(url) - try: - channel_id = url_info["channel_id"] - except KeyError as error: - print(f"failed to extract channel id from {url}") - raise ValueError from error + channel_id = url_info.get("channel_id", False) + if channel_id: + return channel_id - return channel_id + url = url_info.get("url", False) + if url: + # handle old channel name redirect with url path split + channel_id = urlparse(url).path.strip("/").split("/")[1] + + return channel_id + + print(f"failed to extract channel id from {url}") + raise ValueError class DurationConverter: From 25f15398163a500af6306e4991c07aade6788f6a Mon Sep 17 00:00:00 2001 From: simon Date: Fri, 12 Aug 2022 12:03:09 +0700 Subject: [PATCH 02/15] implement backup task lock, #278 --- tubearchivist/home/apps.py | 1 + tubearchivist/home/tasks.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/apps.py b/tubearchivist/home/apps.py index 5f02ba0f..1c44bbcc 100644 --- a/tubearchivist/home/apps.py +++ b/tubearchivist/home/apps.py @@ -77,6 +77,7 @@ class StartupCheck: "dl_queue", "dl_queue_id", "rescan", + "run_backup", ] for lock in all_locks: response = self.redis_con.del_message(lock) diff --git a/tubearchivist/home/tasks.py b/tubearchivist/home/tasks.py index b3da7621..ae8e43e9 100644 --- a/tubearchivist/home/tasks.py +++ b/tubearchivist/home/tasks.py @@ -162,8 +162,19 @@ def run_manual_import(): @shared_task(name="run_backup") def run_backup(reason="auto"): """called from settings page, dump backup to zip file""" - backup_all_indexes(reason) - print("backup finished") + have_lock = False + my_lock = RedisArchivist().get_lock("run_backup") + + try: + have_lock = my_lock.acquire(blocking=False) + if have_lock: + backup_all_indexes(reason) + else: + print("Did not acquire lock for backup task.") + finally: + if have_lock: + my_lock.release() + print("backup finished") @shared_task From 7c3bfa94f39a591f2f2d8b66ee831a58a56d4bb5 Mon Sep 17 00:00:00 2001 From: simon Date: Fri, 12 Aug 2022 12:58:39 +0700 Subject: [PATCH 03/15] handle thumbnail download base exception, #281 --- tubearchivist/home/src/download/thumbnails.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/src/download/thumbnails.py b/tubearchivist/home/src/download/thumbnails.py index 43cd981e..4e77453f 100644 --- a/tubearchivist/home/src/download/thumbnails.py +++ b/tubearchivist/home/src/download/thumbnails.py @@ -40,13 +40,13 @@ class ThumbManagerBase: for i in range(3): try: - response = 
requests.get(url, stream=True) + response = requests.get(url, stream=True, timeout=5) if response.ok: return Image.open(response.raw) if response.status_code == 404: return self.get_fallback() - except ConnectionError: + except requests.exceptions.RequestException: print(f"{self.item_id}: retry thumbnail download {url}") sleep((i + 1) ** i) From cc40f4632538f86bd176cb4232f43bab51bbf78f Mon Sep 17 00:00:00 2001 From: simon Date: Fri, 12 Aug 2022 14:39:22 +0700 Subject: [PATCH 04/15] improved ryd error handeling, #283 --- tubearchivist/home/src/index/video.py | 4 ++-- tubearchivist/requirements.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index d7d8b983..bded42c6 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -631,8 +631,8 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): return False dislikes = { - "dislike_count": result["dislikes"], - "average_rating": result["rating"], + "dislike_count": result.get("dislikes", 0), + "average_rating": result.get("rating", 0), } self.json_data["stats"].update(dislikes) diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index f9729822..28b55f32 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -7,7 +7,7 @@ djangorestframework==3.13.1 Pillow==9.2.0 redis==4.3.4 requests==2.28.1 -ryd-client==0.0.3 +ryd-client==0.0.5 uWSGI==2.0.20 whitenoise==6.2.0 yt_dlp==2022.8.8 From 6afe25a4afa65d40a6395e417c52dec3fe1010ea Mon Sep 17 00:00:00 2001 From: simon Date: Fri, 12 Aug 2022 15:53:20 +0700 Subject: [PATCH 05/15] fix empty subtitle download and index, #288 --- tubearchivist/home/src/index/video.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index bded42c6..d02eae03 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -119,6 +119,7 @@ class YoutubeSubtitle: def download_subtitles(self, relevant_subtitles): """download subtitle files to archive""" videos_base = self.video.config["application"]["videos"] + indexed = [] for subtitle in relevant_subtitles: dest_path = os.path.join(videos_base, subtitle["media_url"]) source = subtitle["source"] @@ -133,12 +134,19 @@ class YoutubeSubtitle: parser = SubtitleParser(response.text, lang, source) parser.process() + if not parser.all_cues: + continue + subtitle_str = parser.get_subtitle_str() self._write_subtitle_file(dest_path, subtitle_str) if self.video.config["downloads"]["subtitle_index"]: query_str = parser.create_bulk_import(self.video, source) self._index_subtitle(query_str) + indexed.append(subtitle) + + return indexed + @staticmethod def _write_subtitle_file(dest_path, subtitle_str): """write subtitle file to disk""" @@ -188,11 +196,15 @@ class SubtitleParser: def process(self): """extract relevant que data""" + self.all_cues = [] all_events = self.subtitle_raw.get("events") + + if not all_events: + return + if self.source == "auto": all_events = self._flat_auto_caption(all_events) - self.all_cues = [] for idx, event in enumerate(all_events): if "dDurationMs" not in event or "segs" not in event: # some events won't have a duration or segs @@ -649,8 +661,8 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): handler = YoutubeSubtitle(self) subtitles = handler.get_subtitles() if subtitles: - self.json_data["subtitles"] = subtitles - 
handler.download_subtitles(relevant_subtitles=subtitles) + indexed = handler.download_subtitles(relevant_subtitles=subtitles) + self.json_data["subtitles"] = indexed def update_media_url(self): """update only media_url in es for reindex channel rename""" From 8a1ae1ef8331407bc3e16a863b002998fc2d861d Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 13 Aug 2022 14:03:17 +0700 Subject: [PATCH 06/15] add protocoll to CSRF_TRUSTED_ORIGINS --- tubearchivist/config/settings.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tubearchivist/config/settings.py b/tubearchivist/config/settings.py index 50bea8d6..b9b2cb2b 100644 --- a/tubearchivist/config/settings.py +++ b/tubearchivist/config/settings.py @@ -33,7 +33,13 @@ SECRET_KEY = PW_HASH.hexdigest() DEBUG = bool(environ.get("DJANGO_DEBUG")) ALLOWED_HOSTS = [i.strip() for i in environ.get("TA_HOST").split()] -CSRF_TRUSTED_ORIGINS = [i.strip() for i in environ.get("TA_HOST").split()] + +CSRF_TRUSTED_ORIGINS = [] +for host in ALLOWED_HOSTS: + if host.startswith("http://") or host.startswith("https://"): + CSRF_TRUSTED_ORIGINS.append(host) + else: + CSRF_TRUSTED_ORIGINS.append(f"http://{host}") # Application definition @@ -87,6 +93,7 @@ TEMPLATES = [ WSGI_APPLICATION = "config.wsgi.application" if bool(environ.get("TA_LDAP")): + # pylint: disable=global-at-module-level global AUTH_LDAP_SERVER_URI AUTH_LDAP_SERVER_URI = environ.get("TA_LDAP_SERVER_URI") @@ -97,6 +104,7 @@ if bool(environ.get("TA_LDAP")): AUTH_LDAP_BIND_PASSWORD = environ.get("TA_LDAP_BIND_PASSWORD") global AUTH_LDAP_USER_SEARCH + # pylint: disable=no-member AUTH_LDAP_USER_SEARCH = LDAPSearch( environ.get("TA_LDAP_USER_BASE"), ldap.SCOPE_SUBTREE, From bd4710ebdcd6306da1d43374e363ac2713d708cb Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 13 Aug 2022 15:43:23 +0700 Subject: [PATCH 07/15] improved wording for download form, subscription size, #300 --- tubearchivist/home/src/frontend/forms.py | 4 +++- tubearchivist/home/templates/home/settings.html | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tubearchivist/home/src/frontend/forms.py b/tubearchivist/home/src/frontend/forms.py index 900df878..4459f9b5 100644 --- a/tubearchivist/home/src/frontend/forms.py +++ b/tubearchivist/home/src/frontend/forms.py @@ -159,12 +159,14 @@ class MultiSearchForm(forms.Form): class AddToQueueForm(forms.Form): """text area form to add to downloads""" + HELP_TEXT = "Enter at least one video, channel or playlist id/URL here..." + vid_url = forms.CharField( label=False, widget=forms.Textarea( attrs={ "rows": 4, - "placeholder": "Enter Video Urls or IDs here...", + "placeholder": HELP_TEXT, } ), ) diff --git a/tubearchivist/home/templates/home/settings.html b/tubearchivist/home/templates/home/settings.html index 6ec29e19..6477013d 100644 --- a/tubearchivist/home/templates/home/settings.html +++ b/tubearchivist/home/templates/home/settings.html @@ -34,8 +34,8 @@
         <h2 id="subscriptions">Subscriptions</h2>
         <div class="settings-item">
-            <p>Current page size: <span class="settings-current">{{ config.subscriptions.channel_size }}</span></p>
-            <i>Recent videos for channels and playlist to check when running <b>Rescan subscriptions</b>, max recommended 50.</i><br>
+            <p>YouTube page size: <span class="settings-current">{{ config.subscriptions.channel_size }}</span></p>
+            <i>Videos to scan to find new items for the <b>Rescan subscriptions</b> task, max recommended 50.</i><br>
             {{ app_form.subscriptions_channel_size }}
         </div>
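A note on the page size changed above: it caps how many of a channel's newest entries the rescan task inspects, which is why a small value keeps the scan cheap. As a minimal sketch of the idea, yt-dlp can do such a capped, flat scan via its playlistend option — the same options patch 01 uses in its obs_request. The function below is illustrative only, not TubeArchivist's actual rescan code:

from yt_dlp import YoutubeDL

def scan_channel(channel_id, page_size=50):
    """sketch: list the newest <page_size> video ids of a channel"""
    obs = {
        "quiet": True,
        "skip_download": True,
        "extract_flat": True,  # only list entries, don't resolve each video
        "playlistend": page_size,  # cap the scan at the configured page size
    }
    url = f"https://www.youtube.com/channel/{channel_id}/videos"
    with YoutubeDL(obs) as ydl:
        info = ydl.extract_info(url, download=False)

    return [entry["id"] for entry in info.get("entries", [])]

IDs found this way could then be diffed against the index to decide what goes into the download queue.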
From 81aa27e802b41ec93a8ae87adf05d64193129e86 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 13 Aug 2022 16:20:38 +0700 Subject: [PATCH 08/15] fix video-item.list vertical positioning --- tubearchivist/static/css/style.css | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tubearchivist/static/css/style.css b/tubearchivist/static/css/style.css index 55c0ac4b..cb54d71e 100644 --- a/tubearchivist/static/css/style.css +++ b/tubearchivist/static/css/style.css @@ -459,6 +459,8 @@ video:-webkit-full-screen { .video-item.list { display: grid; grid-template-columns: 26% auto; + background-color: var(--highlight-bg); + align-items: center; } .video-progress-bar { @@ -508,8 +510,7 @@ video:-webkit-full-screen { .video-desc.list { padding: 10px; - height: unset; - background-color: var(--highlight-bg); + height: 100%; display: flex; flex-wrap: wrap-reverse; align-content: center; From 41c71fde56603dc1d844235216c9684c52264edd Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 15 Aug 2022 13:55:02 +0700 Subject: [PATCH 09/15] bump yt-dlp --- tubearchivist/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index 28b55f32..a14643ef 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -10,4 +10,4 @@ requests==2.28.1 ryd-client==0.0.5 uWSGI==2.0.20 whitenoise==6.2.0 -yt_dlp==2022.8.8 +yt_dlp==2022.8.14 From 7727f533eeac4cab729379c0abaf6d8ada3f3413 Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 15 Aug 2022 13:55:24 +0700 Subject: [PATCH 10/15] move subtitle to separate module --- tubearchivist/home/src/index/subtitle.py | 321 +++++++++++++++++++++++ tubearchivist/home/src/index/video.py | 316 +--------------------- 2 files changed, 323 insertions(+), 314 deletions(-) create mode 100644 tubearchivist/home/src/index/subtitle.py diff --git a/tubearchivist/home/src/index/subtitle.py b/tubearchivist/home/src/index/subtitle.py new file mode 100644 index 00000000..1c8e0acc --- /dev/null +++ b/tubearchivist/home/src/index/subtitle.py @@ -0,0 +1,321 @@ +""" +functionality: +- download subtitles +- parse subtitles into it's cues +- index dubtitles +""" + +import json +import os +from datetime import datetime + +import requests +from home.src.es.connect import ElasticWrap +from home.src.ta.helper import requests_headers + + +class YoutubeSubtitle: + """handle video subtitle functionality""" + + def __init__(self, video): + self.video = video + self.languages = False + + def _sub_conf_parse(self): + """add additional conf values to self""" + languages_raw = self.video.config["downloads"]["subtitle"] + if languages_raw: + self.languages = [i.strip() for i in languages_raw.split(",")] + + def get_subtitles(self): + """check what to do""" + self._sub_conf_parse() + if not self.languages: + # no subtitles + return False + + relevant_subtitles = [] + for lang in self.languages: + user_sub = self._get_user_subtitles(lang) + if user_sub: + relevant_subtitles.append(user_sub) + continue + + if self.video.config["downloads"]["subtitle_source"] == "auto": + auto_cap = self._get_auto_caption(lang) + if auto_cap: + relevant_subtitles.append(auto_cap) + + return relevant_subtitles + + def _get_auto_caption(self, lang): + """get auto_caption subtitles""" + print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles") + all_subtitles = self.video.youtube_meta.get("automatic_captions") + + if not all_subtitles: + return False + + video_media_url = self.video.json_data["media_url"] + 
media_url = video_media_url.replace(".mp4", f".{lang}.vtt") + all_formats = all_subtitles.get(lang) + if not all_formats: + return False + + subtitle = [i for i in all_formats if i["ext"] == "json3"][0] + subtitle.update( + {"lang": lang, "source": "auto", "media_url": media_url} + ) + + return subtitle + + def _normalize_lang(self): + """normalize country specific language keys""" + all_subtitles = self.video.youtube_meta.get("subtitles") + if not all_subtitles: + return False + + all_keys = list(all_subtitles.keys()) + for key in all_keys: + lang = key.split("-")[0] + old = all_subtitles.pop(key) + if lang == "live_chat": + continue + all_subtitles[lang] = old + + return all_subtitles + + def _get_user_subtitles(self, lang): + """get subtitles uploaded from channel owner""" + print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles") + all_subtitles = self._normalize_lang() + if not all_subtitles: + return False + + video_media_url = self.video.json_data["media_url"] + media_url = video_media_url.replace(".mp4", f".{lang}.vtt") + all_formats = all_subtitles.get(lang) + if not all_formats: + # no user subtitles found + return False + + subtitle = [i for i in all_formats if i["ext"] == "json3"][0] + subtitle.update( + {"lang": lang, "source": "user", "media_url": media_url} + ) + + return subtitle + + def download_subtitles(self, relevant_subtitles): + """download subtitle files to archive""" + videos_base = self.video.config["application"]["videos"] + indexed = [] + for subtitle in relevant_subtitles: + dest_path = os.path.join(videos_base, subtitle["media_url"]) + source = subtitle["source"] + lang = subtitle.get("lang") + response = requests.get( + subtitle["url"], headers=requests_headers() + ) + if not response.ok: + print(f"{self.video.youtube_id}: failed to download subtitle") + print(response.text) + continue + + parser = SubtitleParser(response.text, lang, source) + parser.process() + if not parser.all_cues: + continue + + subtitle_str = parser.get_subtitle_str() + self._write_subtitle_file(dest_path, subtitle_str) + if self.video.config["downloads"]["subtitle_index"]: + query_str = parser.create_bulk_import(self.video, source) + self._index_subtitle(query_str) + + indexed.append(subtitle) + + return indexed + + @staticmethod + def _write_subtitle_file(dest_path, subtitle_str): + """write subtitle file to disk""" + # create folder here for first video of channel + os.makedirs(os.path.split(dest_path)[0], exist_ok=True) + with open(dest_path, "w", encoding="utf-8") as subfile: + subfile.write(subtitle_str) + + @staticmethod + def _index_subtitle(query_str): + """send subtitle to es for indexing""" + _, _ = ElasticWrap("_bulk").post(data=query_str, ndjson=True) + + def delete(self, subtitles=False): + """delete subtitles from index and filesystem""" + youtube_id = self.video.youtube_id + videos_base = self.video.config["application"]["videos"] + # delete files + if subtitles: + files = [i["media_url"] for i in subtitles] + else: + if not self.video.json_data.get("subtitles"): + return + + files = [i["media_url"] for i in self.video.json_data["subtitles"]] + + for file_name in files: + file_path = os.path.join(videos_base, file_name) + try: + os.remove(file_path) + except FileNotFoundError: + print(f"{youtube_id}: {file_path} failed to delete") + # delete from index + path = "ta_subtitle/_delete_by_query?refresh=true" + data = {"query": {"term": {"youtube_id": {"value": youtube_id}}}} + _, _ = ElasticWrap(path).post(data=data) + + +class SubtitleParser: + """parse subtitle 
str from youtube""" + + def __init__(self, subtitle_str, lang, source): + self.subtitle_raw = json.loads(subtitle_str) + self.lang = lang + self.source = source + self.all_cues = False + + def process(self): + """extract relevant que data""" + self.all_cues = [] + all_events = self.subtitle_raw.get("events") + + if not all_events: + return + + if self.source == "auto": + all_events = self._flat_auto_caption(all_events) + + for idx, event in enumerate(all_events): + if "dDurationMs" not in event or "segs" not in event: + # some events won't have a duration or segs + print(f"skipping subtitle event without content: {event}") + continue + + cue = { + "start": self._ms_conv(event["tStartMs"]), + "end": self._ms_conv(event["tStartMs"] + event["dDurationMs"]), + "text": "".join([i.get("utf8") for i in event["segs"]]), + "idx": idx + 1, + } + self.all_cues.append(cue) + + @staticmethod + def _flat_auto_caption(all_events): + """flatten autocaption segments""" + flatten = [] + for event in all_events: + if "segs" not in event.keys(): + continue + text = "".join([i.get("utf8") for i in event.get("segs")]) + if not text.strip(): + continue + + if flatten: + # fix overlapping retiming issue + last = flatten[-1] + if "dDurationMs" not in last or "segs" not in last: + # some events won't have a duration or segs + print(f"skipping subtitle event without content: {event}") + continue + + last_end = last["tStartMs"] + last["dDurationMs"] + if event["tStartMs"] < last_end: + joined = last["segs"][0]["utf8"] + "\n" + text + last["segs"][0]["utf8"] = joined + continue + + event.update({"segs": [{"utf8": text}]}) + flatten.append(event) + + return flatten + + @staticmethod + def _ms_conv(ms): + """convert ms to timestamp""" + hours = str((ms // (1000 * 60 * 60)) % 24).zfill(2) + minutes = str((ms // (1000 * 60)) % 60).zfill(2) + secs = str((ms // 1000) % 60).zfill(2) + millis = str(ms % 1000).zfill(3) + + return f"{hours}:{minutes}:{secs}.{millis}" + + def get_subtitle_str(self): + """create vtt text str from cues""" + subtitle_str = f"WEBVTT\nKind: captions\nLanguage: {self.lang}" + + for cue in self.all_cues: + stamp = f"{cue.get('start')} --> {cue.get('end')}" + cue_text = f"\n\n{cue.get('idx')}\n{stamp}\n{cue.get('text')}" + subtitle_str = subtitle_str + cue_text + + return subtitle_str + + def create_bulk_import(self, video, source): + """subtitle lines for es import""" + documents = self._create_documents(video, source) + bulk_list = [] + + for document in documents: + document_id = document.get("subtitle_fragment_id") + action = {"index": {"_index": "ta_subtitle", "_id": document_id}} + bulk_list.append(json.dumps(action)) + bulk_list.append(json.dumps(document)) + + bulk_list.append("\n") + query_str = "\n".join(bulk_list) + + return query_str + + def _create_documents(self, video, source): + """process documents""" + documents = self._chunk_list(video.youtube_id) + channel = video.json_data.get("channel") + meta_dict = { + "youtube_id": video.youtube_id, + "title": video.json_data.get("title"), + "subtitle_channel": channel.get("channel_name"), + "subtitle_channel_id": channel.get("channel_id"), + "subtitle_last_refresh": int(datetime.now().strftime("%s")), + "subtitle_lang": self.lang, + "subtitle_source": source, + } + + _ = [i.update(meta_dict) for i in documents] + + return documents + + def _chunk_list(self, youtube_id): + """join cues for bulk import""" + chunk_list = [] + + chunk = {} + for cue in self.all_cues: + if chunk: + text = f"{chunk.get('subtitle_line')} {cue.get('text')}\n" + 
chunk["subtitle_line"] = text + else: + idx = len(chunk_list) + 1 + chunk = { + "subtitle_index": idx, + "subtitle_line": cue.get("text"), + "subtitle_start": cue.get("start"), + } + + chunk["subtitle_fragment_id"] = f"{youtube_id}-{self.lang}-{idx}" + + if cue["idx"] % 5 == 0: + chunk["subtitle_end"] = cue.get("end") + chunk_list.append(chunk) + chunk = {} + + return chunk_list diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index d02eae03..b9ce08f6 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -4,7 +4,6 @@ functionality: - index and update in es """ -import json import os from datetime import datetime @@ -14,323 +13,12 @@ from home.src.es.connect import ElasticWrap from home.src.index import channel as ta_channel from home.src.index import playlist as ta_playlist from home.src.index.generic import YouTubeItem -from home.src.ta.helper import ( - DurationConverter, - clean_string, - randomizor, - requests_headers, -) +from home.src.index.subtitle import YoutubeSubtitle +from home.src.ta.helper import DurationConverter, clean_string, randomizor from home.src.ta.ta_redis import RedisArchivist from ryd_client import ryd_client -class YoutubeSubtitle: - """handle video subtitle functionality""" - - def __init__(self, video): - self.video = video - self.languages = False - - def _sub_conf_parse(self): - """add additional conf values to self""" - languages_raw = self.video.config["downloads"]["subtitle"] - if languages_raw: - self.languages = [i.strip() for i in languages_raw.split(",")] - - def get_subtitles(self): - """check what to do""" - self._sub_conf_parse() - if not self.languages: - # no subtitles - return False - - relevant_subtitles = [] - for lang in self.languages: - user_sub = self._get_user_subtitles(lang) - if user_sub: - relevant_subtitles.append(user_sub) - continue - - if self.video.config["downloads"]["subtitle_source"] == "auto": - auto_cap = self._get_auto_caption(lang) - if auto_cap: - relevant_subtitles.append(auto_cap) - - return relevant_subtitles - - def _get_auto_caption(self, lang): - """get auto_caption subtitles""" - print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles") - all_subtitles = self.video.youtube_meta.get("automatic_captions") - - if not all_subtitles: - return False - - video_media_url = self.video.json_data["media_url"] - media_url = video_media_url.replace(".mp4", f".{lang}.vtt") - all_formats = all_subtitles.get(lang) - if not all_formats: - return False - - subtitle = [i for i in all_formats if i["ext"] == "json3"][0] - subtitle.update( - {"lang": lang, "source": "auto", "media_url": media_url} - ) - - return subtitle - - def _normalize_lang(self): - """normalize country specific language keys""" - all_subtitles = self.video.youtube_meta.get("subtitles") - if not all_subtitles: - return False - - all_keys = list(all_subtitles.keys()) - for key in all_keys: - lang = key.split("-")[0] - old = all_subtitles.pop(key) - if lang == "live_chat": - continue - all_subtitles[lang] = old - - return all_subtitles - - def _get_user_subtitles(self, lang): - """get subtitles uploaded from channel owner""" - print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles") - all_subtitles = self._normalize_lang() - if not all_subtitles: - return False - - video_media_url = self.video.json_data["media_url"] - media_url = video_media_url.replace(".mp4", f".{lang}.vtt") - all_formats = all_subtitles.get(lang) - if not all_formats: - # no user subtitles 
found - return False - - subtitle = [i for i in all_formats if i["ext"] == "json3"][0] - subtitle.update( - {"lang": lang, "source": "user", "media_url": media_url} - ) - - return subtitle - - def download_subtitles(self, relevant_subtitles): - """download subtitle files to archive""" - videos_base = self.video.config["application"]["videos"] - indexed = [] - for subtitle in relevant_subtitles: - dest_path = os.path.join(videos_base, subtitle["media_url"]) - source = subtitle["source"] - lang = subtitle.get("lang") - response = requests.get( - subtitle["url"], headers=requests_headers() - ) - if not response.ok: - print(f"{self.video.youtube_id}: failed to download subtitle") - print(response.text) - continue - - parser = SubtitleParser(response.text, lang, source) - parser.process() - if not parser.all_cues: - continue - - subtitle_str = parser.get_subtitle_str() - self._write_subtitle_file(dest_path, subtitle_str) - if self.video.config["downloads"]["subtitle_index"]: - query_str = parser.create_bulk_import(self.video, source) - self._index_subtitle(query_str) - - indexed.append(subtitle) - - return indexed - - @staticmethod - def _write_subtitle_file(dest_path, subtitle_str): - """write subtitle file to disk""" - # create folder here for first video of channel - os.makedirs(os.path.split(dest_path)[0], exist_ok=True) - with open(dest_path, "w", encoding="utf-8") as subfile: - subfile.write(subtitle_str) - - @staticmethod - def _index_subtitle(query_str): - """send subtitle to es for indexing""" - _, _ = ElasticWrap("_bulk").post(data=query_str, ndjson=True) - - def delete(self, subtitles=False): - """delete subtitles from index and filesystem""" - youtube_id = self.video.youtube_id - videos_base = self.video.config["application"]["videos"] - # delete files - if subtitles: - files = [i["media_url"] for i in subtitles] - else: - if not self.video.json_data.get("subtitles"): - return - - files = [i["media_url"] for i in self.video.json_data["subtitles"]] - - for file_name in files: - file_path = os.path.join(videos_base, file_name) - try: - os.remove(file_path) - except FileNotFoundError: - print(f"{youtube_id}: {file_path} failed to delete") - # delete from index - path = "ta_subtitle/_delete_by_query?refresh=true" - data = {"query": {"term": {"youtube_id": {"value": youtube_id}}}} - _, _ = ElasticWrap(path).post(data=data) - - -class SubtitleParser: - """parse subtitle str from youtube""" - - def __init__(self, subtitle_str, lang, source): - self.subtitle_raw = json.loads(subtitle_str) - self.lang = lang - self.source = source - self.all_cues = False - - def process(self): - """extract relevant que data""" - self.all_cues = [] - all_events = self.subtitle_raw.get("events") - - if not all_events: - return - - if self.source == "auto": - all_events = self._flat_auto_caption(all_events) - - for idx, event in enumerate(all_events): - if "dDurationMs" not in event or "segs" not in event: - # some events won't have a duration or segs - print(f"skipping subtitle event without content: {event}") - continue - - cue = { - "start": self._ms_conv(event["tStartMs"]), - "end": self._ms_conv(event["tStartMs"] + event["dDurationMs"]), - "text": "".join([i.get("utf8") for i in event["segs"]]), - "idx": idx + 1, - } - self.all_cues.append(cue) - - @staticmethod - def _flat_auto_caption(all_events): - """flatten autocaption segments""" - flatten = [] - for event in all_events: - if "segs" not in event.keys(): - continue - text = "".join([i.get("utf8") for i in event.get("segs")]) - if not text.strip(): - 
continue - - if flatten: - # fix overlapping retiming issue - last = flatten[-1] - if "dDurationMs" not in last or "segs" not in last: - # some events won't have a duration or segs - print(f"skipping subtitle event without content: {event}") - continue - - last_end = last["tStartMs"] + last["dDurationMs"] - if event["tStartMs"] < last_end: - joined = last["segs"][0]["utf8"] + "\n" + text - last["segs"][0]["utf8"] = joined - continue - - event.update({"segs": [{"utf8": text}]}) - flatten.append(event) - - return flatten - - @staticmethod - def _ms_conv(ms): - """convert ms to timestamp""" - hours = str((ms // (1000 * 60 * 60)) % 24).zfill(2) - minutes = str((ms // (1000 * 60)) % 60).zfill(2) - secs = str((ms // 1000) % 60).zfill(2) - millis = str(ms % 1000).zfill(3) - - return f"{hours}:{minutes}:{secs}.{millis}" - - def get_subtitle_str(self): - """create vtt text str from cues""" - subtitle_str = f"WEBVTT\nKind: captions\nLanguage: {self.lang}" - - for cue in self.all_cues: - stamp = f"{cue.get('start')} --> {cue.get('end')}" - cue_text = f"\n\n{cue.get('idx')}\n{stamp}\n{cue.get('text')}" - subtitle_str = subtitle_str + cue_text - - return subtitle_str - - def create_bulk_import(self, video, source): - """subtitle lines for es import""" - documents = self._create_documents(video, source) - bulk_list = [] - - for document in documents: - document_id = document.get("subtitle_fragment_id") - action = {"index": {"_index": "ta_subtitle", "_id": document_id}} - bulk_list.append(json.dumps(action)) - bulk_list.append(json.dumps(document)) - - bulk_list.append("\n") - query_str = "\n".join(bulk_list) - - return query_str - - def _create_documents(self, video, source): - """process documents""" - documents = self._chunk_list(video.youtube_id) - channel = video.json_data.get("channel") - meta_dict = { - "youtube_id": video.youtube_id, - "title": video.json_data.get("title"), - "subtitle_channel": channel.get("channel_name"), - "subtitle_channel_id": channel.get("channel_id"), - "subtitle_last_refresh": int(datetime.now().strftime("%s")), - "subtitle_lang": self.lang, - "subtitle_source": source, - } - - _ = [i.update(meta_dict) for i in documents] - - return documents - - def _chunk_list(self, youtube_id): - """join cues for bulk import""" - chunk_list = [] - - chunk = {} - for cue in self.all_cues: - if chunk: - text = f"{chunk.get('subtitle_line')} {cue.get('text')}\n" - chunk["subtitle_line"] = text - else: - idx = len(chunk_list) + 1 - chunk = { - "subtitle_index": idx, - "subtitle_line": cue.get("text"), - "subtitle_start": cue.get("start"), - } - - chunk["subtitle_fragment_id"] = f"{youtube_id}-{self.lang}-{idx}" - - if cue["idx"] % 5 == 0: - chunk["subtitle_end"] = cue.get("end") - chunk_list.append(chunk) - chunk = {} - - return chunk_list - - class SponsorBlock: """handle sponsor block integration""" From 21f3ee88b9d1a382f32cfb605ad94a0d1b5cc8e5 Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 15 Aug 2022 15:18:38 +0700 Subject: [PATCH 11/15] extract vtt from mkv and mp4 for import --- tubearchivist/home/src/index/filesystem.py | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tubearchivist/home/src/index/filesystem.py b/tubearchivist/home/src/index/filesystem.py index 3689fed8..c307d99f 100644 --- a/tubearchivist/home/src/index/filesystem.py +++ b/tubearchivist/home/src/index/filesystem.py @@ -20,6 +20,7 @@ from home.src.ta.config import AppConfig from home.src.ta.helper import clean_string, ignore_filelist from home.src.ta.ta_redis import RedisArchivist from PIL 
import Image, ImageFile +from yt_dlp.utils import ISO639Utils ImageFile.LOAD_TRUNCATED_IMAGES = True @@ -257,6 +258,7 @@ class ImportFolderScanner: self._detect_youtube_id(current_video) self._dump_thumb(current_video) self._convert_thumb(current_video) + self._get_subtitles(current_video) self._convert_video(current_video) ManualImport(current_video, self.CONFIG).run() @@ -388,6 +390,34 @@ class ImportFolderScanner: os.remove(thumb_path) current_video["thumb"] = new_path + def _get_subtitles(self, current_video): + """find all subtitles in media file""" + if current_video["subtitle"]: + return + + media_path = current_video["media"] + streams = self._get_streams(media_path) + base_path, ext = os.path.splitext(media_path) + + if ext == ".webm": + print(f"{media_path}: subtitle extract from webm not supported") + return + + for idx, stream in enumerate(streams["streams"]): + if stream["codec_type"] == "subtitle": + lang = ISO639Utils.long2short(stream["tags"]["language"]) + sub_path = f"{base_path}.{lang}.vtt" + self._dump_subtitle(idx, media_path, sub_path) + current_video["subtitle"].append(sub_path) + + @staticmethod + def _dump_subtitle(idx, media_path, sub_path): + """extract subtitle from media file""" + subprocess.run( + ["ffmpeg", "-i", media_path, "-map", f"0:{idx}", sub_path], + check=True, + ) + @staticmethod def _get_streams(media_path): """return all streams from media_path""" From 0b41d288db6cf9aace401298f644dc18b0edc427 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 16 Aug 2022 12:19:27 +0700 Subject: [PATCH 12/15] fix mkv cover stream extractor --- tubearchivist/home/src/index/filesystem.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/src/index/filesystem.py b/tubearchivist/home/src/index/filesystem.py index c307d99f..4477eb19 100644 --- a/tubearchivist/home/src/index/filesystem.py +++ b/tubearchivist/home/src/index/filesystem.py @@ -316,7 +316,7 @@ class ImportFolderScanner: new_path = False if ext == ".mkv": idx, thumb_type = self._get_mkv_thumb_stream(media_path) - if idx: + if idx is not None: new_path = self.dump_mpv_thumb(media_path, idx, thumb_type) elif ext == ".mp4": @@ -340,7 +340,7 @@ class ImportFolderScanner: _, ext = os.path.splitext(tags["filename"]) return idx, ext - return False, False + return None, None @staticmethod def dump_mpv_thumb(media_path, idx, thumb_type): From a4932b163b5a2e1abd4f8fd255b0dc45b6338909 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 16 Aug 2022 15:42:26 +0700 Subject: [PATCH 13/15] bump ryd --- tubearchivist/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index a14643ef..0d9a260f 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -7,7 +7,7 @@ djangorestframework==3.13.1 Pillow==9.2.0 redis==4.3.4 requests==2.28.1 -ryd-client==0.0.5 +ryd-client==0.0.6 uWSGI==2.0.20 whitenoise==6.2.0 yt_dlp==2022.8.14 From 0210a97b4883af3e4baeeba4bbeeed875f2c308d Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 16 Aug 2022 15:42:54 +0700 Subject: [PATCH 14/15] archive vtt files for offline import --- tubearchivist/home/src/index/filesystem.py | 8 ++++++- tubearchivist/home/src/index/video.py | 26 +++++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/src/index/filesystem.py b/tubearchivist/home/src/index/filesystem.py index 4477eb19..4f6c88c0 100644 --- a/tubearchivist/home/src/index/filesystem.py +++ 
b/tubearchivist/home/src/index/filesystem.py @@ -511,7 +511,7 @@ class ManualImport: print(f"{video_id}: manual import failed, and no metadata found.") raise ValueError - video.check_subtitles() + video.check_subtitles(subtitle_files=self.current_video["subtitle"]) video.upload_to_es() if video.offline_import and self.current_video["thumb"]: @@ -547,6 +547,12 @@ class ManualImport: new_path = os.path.join(channel_folder, file) shutil.move(old_path, new_path, copy_function=shutil.copyfile) + base_name, _ = os.path.splitext(new_path) + for old_path in self.current_video["subtitle"]: + lang = old_path.split(".")[-2] + new_path = f"{base_name}.{lang}.vtt" + shutil.move(old_path, new_path, copy_function=shutil.copyfile) + def _cleanup(self, json_data): """cleanup leftover files""" if os.path.exists(self.current_video["metadata"]): diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index b9ce08f6..a68cae2c 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -344,14 +344,38 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): if sponsorblock: self.json_data["sponsorblock"] = sponsorblock - def check_subtitles(self): + def check_subtitles(self, subtitle_files=False): """optionally add subtitles""" + if self.offline_import and subtitle_files: + indexed = self._offline_subtitles(subtitle_files) + self.json_data["subtitles"] = indexed + return + handler = YoutubeSubtitle(self) subtitles = handler.get_subtitles() if subtitles: indexed = handler.download_subtitles(relevant_subtitles=subtitles) self.json_data["subtitles"] = indexed + def _offline_subtitles(self, subtitle_files): + """import offline subtitles""" + base_name, _ = os.path.splitext(self.json_data["media_url"]) + subtitles = [] + for subtitle in subtitle_files: + lang = subtitle.split(".")[-2] + subtitle_media_url = f"{base_name}.{lang}.vtt" + to_add = { + "ext": "vtt", + "url": False, + "name": lang, + "lang": lang, + "source": "file", + "media_url": subtitle_media_url, + } + subtitles.append(to_add) + + return subtitles + def update_media_url(self): """update only media_url in es for reindex channel rename""" data = {"doc": {"media_url": self.json_data["media_url"]}} From 1a0421a45a62ef5c2cf42754757f63cedca882a5 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 16 Aug 2022 16:29:17 +0700 Subject: [PATCH 15/15] update Manual Media Files Import docs --- docs/Settings.md | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/docs/Settings.md b/docs/Settings.md index 8dd7ba2a..3bc2e74b 100644 --- a/docs/Settings.md +++ b/docs/Settings.md @@ -116,25 +116,32 @@ Additional database functionality. The button **Delete all queued** will delete all pending videos from the download queue. The button **Delete all ignored** will delete all videos you have previously ignored. ## Manual Media Files Import -So far this depends on the video you are trying to import to be still available on YouTube to get the metadata. Add the files you'd like to import to the */cache/import* folder. Then start the process from the settings page *Manual Media Files Import*. Make sure to follow one of the two methods below. +Add the files you'd like to import to the */cache/import* folder. Only add files, don't add subdirectories. All files you are adding, need to have the same *base name* as the media file. Then start the process from the settings page *Manual Media Files Import*. + +Valid media extensions are *.mp4*, *.mkv* or *.webm*. 
If you have other file extensions or incompatible codecs, convert them first to mp4. **Tube Archivist** can identify the videos with one of the following methods.
 
 ### Method 1:
-Add a matching *.json* file with the media file. Both files need to have the same base name, for example:
-- For the media file: \<base-name\>.mp4
-- For the JSON file: \<base-name\>.info.json
-- Alternate JSON file: \<base-name\>.json
+Add a matching *.info.json* file with the media file. Both files need to have the same base name, for example:
+- For the media file: `<base-name>.mp4`
+- For the JSON file: `<base-name>.info.json`
 
-**Tube Archivist** then looks for the 'id' key within the JSON file to identify the video.
+The import process then looks for the 'id' key within the JSON file to identify the video.
 
 ### Method 2:
 Detect the YouTube ID from filename, this accepts the default yt-dlp naming convention for file names like:
-- \<title\> \[\<video-id\>\].mp4
+- `<title> [<video-id>].mp4`
 - The YouTube ID in square brackets at the end of the filename is the crucial part.
 
+### Offline import:
+If the video you are trying to import is not available on YouTube any more, **Tube Archivist** can import the required metadata:
+- The file `<base-name>.info.json` is required to extract the necessary information.
+- Add the thumbnail as `<base-name>.<ext>`, where valid file extensions are *.jpg*, *.png* or *.webp*. If there is no thumbnail file, **Tube Archivist** will try to extract it from the media file or will fall back to a default thumbnail.
+- Add subtitles as `<base-name>.<lang>.vtt`, where *lang* is the two-letter ISO 639-1 language code. This will archive all subtitle files you add to the import folder, independent of your configuration. Subtitles can be archived and used in the player, but they can't be indexed or made searchable, because their structure is very different from the subtitle format **Tube Archivist** indexes.
+- For videos where the whole channel is not available any more, you can add the `<channel-id>.info.json` file as generated by *youtube-dl/yt-dlp* to get the full metadata. Otherwise **Tube Archivist** will extract as much channel info as possible from the video's info.json file.
+
 ### Some notes:
 - This will **consume** the files you put into the import folder: Files will get converted to mp4 if needed (this might take a long time...) and moved to the archive, *.json* files will get deleted upon completion to avoid having duplicates on the next run.
-- For best file transcoding quality, convert your media files with desired settings first before importing (#138).
-- There should be no subdirectories added to */cache/import*, only video files. If your existing video library has video files inside subdirectories, you can get all the files into one directory by running `find ./ -mindepth 2 -type f -exec mv '{}' . \;` from the top-level directory of your existing video library. You can also delete any remaining empty subdirectories with `find ./ -mindepth 1 -type d -delete`.
+- For best file transcoding quality, convert your media files with desired settings first before importing.
 - Maybe start with a subset of your files to import to make sure everything goes well...
 - Follow the logs to monitor progress and errors: `docker-compose logs -f tubearchivist`.
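To make Method 2 above concrete: the detection boils down to pulling an eleven-character YouTube ID out of the square brackets at the end of the base name, as produced by yt-dlp's default "%(title)s [%(id)s].%(ext)s" output template. A rough sketch under that assumption — the regex and helper name are illustrative, not the exact implementation:

import re
from pathlib import Path

# eleven URL-safe characters in square brackets at the end of the base name
YT_ID_PATTERN = re.compile(r"\[([0-9A-Za-z_-]{11})\]$")

def detect_youtube_id(media_file):
    """sketch: return the YouTube ID embedded in the filename, or None"""
    base_name = Path(media_file).stem  # strip the media extension
    match = YT_ID_PATTERN.search(base_name)
    return match.group(1) if match else None

print(detect_youtube_id("My Video [dQw4w9WgXcQ].mp4"))  # -> dQw4w9WgXcQ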