diff --git a/docs/Settings.md b/docs/Settings.md
index 8dd7ba2a..3bc2e74b 100644
--- a/docs/Settings.md
+++ b/docs/Settings.md
@@ -116,25 +116,32 @@ Additional database functionality.
 The button **Delete all queued** will delete all pending videos from the download queue.
 The button **Delete all ignored** will delete all videos you have previously ignored.
 
 ## Manual Media Files Import
-So far this depends on the video you are trying to import to be still available on YouTube to get the metadata. Add the files you'd like to import to the */cache/import* folder. Then start the process from the settings page *Manual Media Files Import*. Make sure to follow one of the two methods below.
+Add the files you'd like to import to the */cache/import* folder. Only add files, don't add subdirectories. All files you add need to have the same *base name* as the media file. Then start the process from the settings page *Manual Media Files Import*.
+
+Valid media extensions are *.mp4*, *.mkv* or *.webm*. If you have other file extensions or incompatible codecs, convert them to mp4 first. **Tube Archivist** can identify the videos with one of the following methods.
 
 ### Method 1:
-Add a matching *.json* file with the media file. Both files need to have the same base name, for example:
-- For the media file: \<base-name\>.mp4
-- For the JSON file: \<base-name\>.info.json
-- Alternate JSON file: \<base-name\>.json
+Add a matching *.info.json* file with the media file. Both files need to have the same base name, for example:
+- For the media file: `<base-name>.mp4`
+- For the JSON file: `<base-name>.info.json`
 
-**Tube Archivist** then looks for the 'id' key within the JSON file to identify the video.
+The import process then looks for the 'id' key within the JSON file to identify the video.
 
 ### Method 2:
 Detect the YouTube ID from filename, this accepts the default yt-dlp naming convention for file names like:
-- \<base-name\>[\<youtube-id\>].mp4
+- `<base-name>[<youtube-id>].mp4`
 - The YouTube ID in square brackets at the end of the filename is the crucial part.
 
+### Offline import:
+If the video you are trying to import is no longer available on YouTube, **Tube Archivist** can import the required metadata from files you provide:
+- The file `<base-name>.info.json` is required to extract the needed information.
+- Add the thumbnail as `<base-name>.<ext>`, where valid file extensions are *.jpg*, *.png* or *.webp*. If there is no thumbnail file, **Tube Archivist** will try to extract it from the media file or will fall back to a default thumbnail.
+- Add subtitles as `<base-name>.<lang>.vtt`, where *lang* is the two-letter ISO 639-1 language code. This will archive all subtitle files you add to the import folder, regardless of your subtitle configuration. Subtitles can be archived and used in the player, but they can't be indexed or made searchable because their structure differs from the subtitles **Tube Archivist** downloads itself.
+- For videos where the whole channel is no longer available, you can add the channel's `<channel-id>.info.json` file as generated by *youtube-dl/yt-dlp* to get the full metadata. Otherwise **Tube Archivist** will extract as much info as possible from the video's info.json file.
+
 ### Some notes:
 - This will **consume** the files you put into the import folder: Files will get converted to mp4 if needed (this might take a long time...) and moved to the archive, *.json* files will get deleted upon completion to avoid having duplicates on the next run.
-- For best file transcoding quality, convert your media files with desired settings first before importing (#138).
-- There should be no subdirectories added to */cache/import*, only video files.
If your existing video library has video files inside subdirectories, you can get all the files into one directory by running `find ./ -mindepth 2 -type f -exec mv '{}' . \;` from the top-level directory of your existing video library. You can also delete any remaining empty subdirectories with `find ./ -mindepth 1 -type d -delete`. +- For best file transcoding quality, convert your media files with desired settings first before importing. - Maybe start with a subset of your files to import to make sure everything goes well... - Follow the logs to monitor progress and errors: `docker-compose logs -f tubearchivist`. diff --git a/tubearchivist/config/settings.py b/tubearchivist/config/settings.py index 50bea8d6..b9b2cb2b 100644 --- a/tubearchivist/config/settings.py +++ b/tubearchivist/config/settings.py @@ -33,7 +33,13 @@ SECRET_KEY = PW_HASH.hexdigest() DEBUG = bool(environ.get("DJANGO_DEBUG")) ALLOWED_HOSTS = [i.strip() for i in environ.get("TA_HOST").split()] -CSRF_TRUSTED_ORIGINS = [i.strip() for i in environ.get("TA_HOST").split()] + +CSRF_TRUSTED_ORIGINS = [] +for host in ALLOWED_HOSTS: + if host.startswith("http://") or host.startswith("https://"): + CSRF_TRUSTED_ORIGINS.append(host) + else: + CSRF_TRUSTED_ORIGINS.append(f"http://{host}") # Application definition @@ -87,6 +93,7 @@ TEMPLATES = [ WSGI_APPLICATION = "config.wsgi.application" if bool(environ.get("TA_LDAP")): + # pylint: disable=global-at-module-level global AUTH_LDAP_SERVER_URI AUTH_LDAP_SERVER_URI = environ.get("TA_LDAP_SERVER_URI") @@ -97,6 +104,7 @@ if bool(environ.get("TA_LDAP")): AUTH_LDAP_BIND_PASSWORD = environ.get("TA_LDAP_BIND_PASSWORD") global AUTH_LDAP_USER_SEARCH + # pylint: disable=no-member AUTH_LDAP_USER_SEARCH = LDAPSearch( environ.get("TA_LDAP_USER_BASE"), ldap.SCOPE_SUBTREE, diff --git a/tubearchivist/home/apps.py b/tubearchivist/home/apps.py index 5f02ba0f..1c44bbcc 100644 --- a/tubearchivist/home/apps.py +++ b/tubearchivist/home/apps.py @@ -77,6 +77,7 @@ class StartupCheck: "dl_queue", "dl_queue_id", "rescan", + "run_backup", ] for lock in all_locks: response = self.redis_con.del_message(lock) diff --git a/tubearchivist/home/src/download/thumbnails.py b/tubearchivist/home/src/download/thumbnails.py index 43cd981e..4e77453f 100644 --- a/tubearchivist/home/src/download/thumbnails.py +++ b/tubearchivist/home/src/download/thumbnails.py @@ -40,13 +40,13 @@ class ThumbManagerBase: for i in range(3): try: - response = requests.get(url, stream=True) + response = requests.get(url, stream=True, timeout=5) if response.ok: return Image.open(response.raw) if response.status_code == 404: return self.get_fallback() - except ConnectionError: + except requests.exceptions.RequestException: print(f"{self.item_id}: retry thumbnail download {url}") sleep((i + 1) ** i) diff --git a/tubearchivist/home/src/frontend/forms.py b/tubearchivist/home/src/frontend/forms.py index 900df878..4459f9b5 100644 --- a/tubearchivist/home/src/frontend/forms.py +++ b/tubearchivist/home/src/frontend/forms.py @@ -159,12 +159,14 @@ class MultiSearchForm(forms.Form): class AddToQueueForm(forms.Form): """text area form to add to downloads""" + HELP_TEXT = "Enter at least one video, channel or playlist id/URL here..." 
+ vid_url = forms.CharField( label=False, widget=forms.Textarea( attrs={ "rows": 4, - "placeholder": "Enter Video Urls or IDs here...", + "placeholder": HELP_TEXT, } ), ) diff --git a/tubearchivist/home/src/index/filesystem.py b/tubearchivist/home/src/index/filesystem.py index 3689fed8..4f6c88c0 100644 --- a/tubearchivist/home/src/index/filesystem.py +++ b/tubearchivist/home/src/index/filesystem.py @@ -20,6 +20,7 @@ from home.src.ta.config import AppConfig from home.src.ta.helper import clean_string, ignore_filelist from home.src.ta.ta_redis import RedisArchivist from PIL import Image, ImageFile +from yt_dlp.utils import ISO639Utils ImageFile.LOAD_TRUNCATED_IMAGES = True @@ -257,6 +258,7 @@ class ImportFolderScanner: self._detect_youtube_id(current_video) self._dump_thumb(current_video) self._convert_thumb(current_video) + self._get_subtitles(current_video) self._convert_video(current_video) ManualImport(current_video, self.CONFIG).run() @@ -314,7 +316,7 @@ class ImportFolderScanner: new_path = False if ext == ".mkv": idx, thumb_type = self._get_mkv_thumb_stream(media_path) - if idx: + if idx is not None: new_path = self.dump_mpv_thumb(media_path, idx, thumb_type) elif ext == ".mp4": @@ -338,7 +340,7 @@ class ImportFolderScanner: _, ext = os.path.splitext(tags["filename"]) return idx, ext - return False, False + return None, None @staticmethod def dump_mpv_thumb(media_path, idx, thumb_type): @@ -388,6 +390,34 @@ class ImportFolderScanner: os.remove(thumb_path) current_video["thumb"] = new_path + def _get_subtitles(self, current_video): + """find all subtitles in media file""" + if current_video["subtitle"]: + return + + media_path = current_video["media"] + streams = self._get_streams(media_path) + base_path, ext = os.path.splitext(media_path) + + if ext == ".webm": + print(f"{media_path}: subtitle extract from webm not supported") + return + + for idx, stream in enumerate(streams["streams"]): + if stream["codec_type"] == "subtitle": + lang = ISO639Utils.long2short(stream["tags"]["language"]) + sub_path = f"{base_path}.{lang}.vtt" + self._dump_subtitle(idx, media_path, sub_path) + current_video["subtitle"].append(sub_path) + + @staticmethod + def _dump_subtitle(idx, media_path, sub_path): + """extract subtitle from media file""" + subprocess.run( + ["ffmpeg", "-i", media_path, "-map", f"0:{idx}", sub_path], + check=True, + ) + @staticmethod def _get_streams(media_path): """return all streams from media_path""" @@ -481,7 +511,7 @@ class ManualImport: print(f"{video_id}: manual import failed, and no metadata found.") raise ValueError - video.check_subtitles() + video.check_subtitles(subtitle_files=self.current_video["subtitle"]) video.upload_to_es() if video.offline_import and self.current_video["thumb"]: @@ -517,6 +547,12 @@ class ManualImport: new_path = os.path.join(channel_folder, file) shutil.move(old_path, new_path, copy_function=shutil.copyfile) + base_name, _ = os.path.splitext(new_path) + for old_path in self.current_video["subtitle"]: + lang = old_path.split(".")[-2] + new_path = f"{base_name}.{lang}.vtt" + shutil.move(old_path, new_path, copy_function=shutil.copyfile) + def _cleanup(self, json_data): """cleanup leftover files""" if os.path.exists(self.current_video["metadata"]): diff --git a/tubearchivist/home/src/index/subtitle.py b/tubearchivist/home/src/index/subtitle.py new file mode 100644 index 00000000..1c8e0acc --- /dev/null +++ b/tubearchivist/home/src/index/subtitle.py @@ -0,0 +1,321 @@ +""" +functionality: +- download subtitles +- parse subtitles into it's cues +- index 
dubtitles +""" + +import json +import os +from datetime import datetime + +import requests +from home.src.es.connect import ElasticWrap +from home.src.ta.helper import requests_headers + + +class YoutubeSubtitle: + """handle video subtitle functionality""" + + def __init__(self, video): + self.video = video + self.languages = False + + def _sub_conf_parse(self): + """add additional conf values to self""" + languages_raw = self.video.config["downloads"]["subtitle"] + if languages_raw: + self.languages = [i.strip() for i in languages_raw.split(",")] + + def get_subtitles(self): + """check what to do""" + self._sub_conf_parse() + if not self.languages: + # no subtitles + return False + + relevant_subtitles = [] + for lang in self.languages: + user_sub = self._get_user_subtitles(lang) + if user_sub: + relevant_subtitles.append(user_sub) + continue + + if self.video.config["downloads"]["subtitle_source"] == "auto": + auto_cap = self._get_auto_caption(lang) + if auto_cap: + relevant_subtitles.append(auto_cap) + + return relevant_subtitles + + def _get_auto_caption(self, lang): + """get auto_caption subtitles""" + print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles") + all_subtitles = self.video.youtube_meta.get("automatic_captions") + + if not all_subtitles: + return False + + video_media_url = self.video.json_data["media_url"] + media_url = video_media_url.replace(".mp4", f".{lang}.vtt") + all_formats = all_subtitles.get(lang) + if not all_formats: + return False + + subtitle = [i for i in all_formats if i["ext"] == "json3"][0] + subtitle.update( + {"lang": lang, "source": "auto", "media_url": media_url} + ) + + return subtitle + + def _normalize_lang(self): + """normalize country specific language keys""" + all_subtitles = self.video.youtube_meta.get("subtitles") + if not all_subtitles: + return False + + all_keys = list(all_subtitles.keys()) + for key in all_keys: + lang = key.split("-")[0] + old = all_subtitles.pop(key) + if lang == "live_chat": + continue + all_subtitles[lang] = old + + return all_subtitles + + def _get_user_subtitles(self, lang): + """get subtitles uploaded from channel owner""" + print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles") + all_subtitles = self._normalize_lang() + if not all_subtitles: + return False + + video_media_url = self.video.json_data["media_url"] + media_url = video_media_url.replace(".mp4", f".{lang}.vtt") + all_formats = all_subtitles.get(lang) + if not all_formats: + # no user subtitles found + return False + + subtitle = [i for i in all_formats if i["ext"] == "json3"][0] + subtitle.update( + {"lang": lang, "source": "user", "media_url": media_url} + ) + + return subtitle + + def download_subtitles(self, relevant_subtitles): + """download subtitle files to archive""" + videos_base = self.video.config["application"]["videos"] + indexed = [] + for subtitle in relevant_subtitles: + dest_path = os.path.join(videos_base, subtitle["media_url"]) + source = subtitle["source"] + lang = subtitle.get("lang") + response = requests.get( + subtitle["url"], headers=requests_headers() + ) + if not response.ok: + print(f"{self.video.youtube_id}: failed to download subtitle") + print(response.text) + continue + + parser = SubtitleParser(response.text, lang, source) + parser.process() + if not parser.all_cues: + continue + + subtitle_str = parser.get_subtitle_str() + self._write_subtitle_file(dest_path, subtitle_str) + if self.video.config["downloads"]["subtitle_index"]: + query_str = parser.create_bulk_import(self.video, source) + 
self._index_subtitle(query_str) + + indexed.append(subtitle) + + return indexed + + @staticmethod + def _write_subtitle_file(dest_path, subtitle_str): + """write subtitle file to disk""" + # create folder here for first video of channel + os.makedirs(os.path.split(dest_path)[0], exist_ok=True) + with open(dest_path, "w", encoding="utf-8") as subfile: + subfile.write(subtitle_str) + + @staticmethod + def _index_subtitle(query_str): + """send subtitle to es for indexing""" + _, _ = ElasticWrap("_bulk").post(data=query_str, ndjson=True) + + def delete(self, subtitles=False): + """delete subtitles from index and filesystem""" + youtube_id = self.video.youtube_id + videos_base = self.video.config["application"]["videos"] + # delete files + if subtitles: + files = [i["media_url"] for i in subtitles] + else: + if not self.video.json_data.get("subtitles"): + return + + files = [i["media_url"] for i in self.video.json_data["subtitles"]] + + for file_name in files: + file_path = os.path.join(videos_base, file_name) + try: + os.remove(file_path) + except FileNotFoundError: + print(f"{youtube_id}: {file_path} failed to delete") + # delete from index + path = "ta_subtitle/_delete_by_query?refresh=true" + data = {"query": {"term": {"youtube_id": {"value": youtube_id}}}} + _, _ = ElasticWrap(path).post(data=data) + + +class SubtitleParser: + """parse subtitle str from youtube""" + + def __init__(self, subtitle_str, lang, source): + self.subtitle_raw = json.loads(subtitle_str) + self.lang = lang + self.source = source + self.all_cues = False + + def process(self): + """extract relevant que data""" + self.all_cues = [] + all_events = self.subtitle_raw.get("events") + + if not all_events: + return + + if self.source == "auto": + all_events = self._flat_auto_caption(all_events) + + for idx, event in enumerate(all_events): + if "dDurationMs" not in event or "segs" not in event: + # some events won't have a duration or segs + print(f"skipping subtitle event without content: {event}") + continue + + cue = { + "start": self._ms_conv(event["tStartMs"]), + "end": self._ms_conv(event["tStartMs"] + event["dDurationMs"]), + "text": "".join([i.get("utf8") for i in event["segs"]]), + "idx": idx + 1, + } + self.all_cues.append(cue) + + @staticmethod + def _flat_auto_caption(all_events): + """flatten autocaption segments""" + flatten = [] + for event in all_events: + if "segs" not in event.keys(): + continue + text = "".join([i.get("utf8") for i in event.get("segs")]) + if not text.strip(): + continue + + if flatten: + # fix overlapping retiming issue + last = flatten[-1] + if "dDurationMs" not in last or "segs" not in last: + # some events won't have a duration or segs + print(f"skipping subtitle event without content: {event}") + continue + + last_end = last["tStartMs"] + last["dDurationMs"] + if event["tStartMs"] < last_end: + joined = last["segs"][0]["utf8"] + "\n" + text + last["segs"][0]["utf8"] = joined + continue + + event.update({"segs": [{"utf8": text}]}) + flatten.append(event) + + return flatten + + @staticmethod + def _ms_conv(ms): + """convert ms to timestamp""" + hours = str((ms // (1000 * 60 * 60)) % 24).zfill(2) + minutes = str((ms // (1000 * 60)) % 60).zfill(2) + secs = str((ms // 1000) % 60).zfill(2) + millis = str(ms % 1000).zfill(3) + + return f"{hours}:{minutes}:{secs}.{millis}" + + def get_subtitle_str(self): + """create vtt text str from cues""" + subtitle_str = f"WEBVTT\nKind: captions\nLanguage: {self.lang}" + + for cue in self.all_cues: + stamp = f"{cue.get('start')} --> {cue.get('end')}" + 
cue_text = f"\n\n{cue.get('idx')}\n{stamp}\n{cue.get('text')}" + subtitle_str = subtitle_str + cue_text + + return subtitle_str + + def create_bulk_import(self, video, source): + """subtitle lines for es import""" + documents = self._create_documents(video, source) + bulk_list = [] + + for document in documents: + document_id = document.get("subtitle_fragment_id") + action = {"index": {"_index": "ta_subtitle", "_id": document_id}} + bulk_list.append(json.dumps(action)) + bulk_list.append(json.dumps(document)) + + bulk_list.append("\n") + query_str = "\n".join(bulk_list) + + return query_str + + def _create_documents(self, video, source): + """process documents""" + documents = self._chunk_list(video.youtube_id) + channel = video.json_data.get("channel") + meta_dict = { + "youtube_id": video.youtube_id, + "title": video.json_data.get("title"), + "subtitle_channel": channel.get("channel_name"), + "subtitle_channel_id": channel.get("channel_id"), + "subtitle_last_refresh": int(datetime.now().strftime("%s")), + "subtitle_lang": self.lang, + "subtitle_source": source, + } + + _ = [i.update(meta_dict) for i in documents] + + return documents + + def _chunk_list(self, youtube_id): + """join cues for bulk import""" + chunk_list = [] + + chunk = {} + for cue in self.all_cues: + if chunk: + text = f"{chunk.get('subtitle_line')} {cue.get('text')}\n" + chunk["subtitle_line"] = text + else: + idx = len(chunk_list) + 1 + chunk = { + "subtitle_index": idx, + "subtitle_line": cue.get("text"), + "subtitle_start": cue.get("start"), + } + + chunk["subtitle_fragment_id"] = f"{youtube_id}-{self.lang}-{idx}" + + if cue["idx"] % 5 == 0: + chunk["subtitle_end"] = cue.get("end") + chunk_list.append(chunk) + chunk = {} + + return chunk_list diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index d7d8b983..a68cae2c 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -4,7 +4,6 @@ functionality: - index and update in es """ -import json import os from datetime import datetime @@ -14,311 +13,12 @@ from home.src.es.connect import ElasticWrap from home.src.index import channel as ta_channel from home.src.index import playlist as ta_playlist from home.src.index.generic import YouTubeItem -from home.src.ta.helper import ( - DurationConverter, - clean_string, - randomizor, - requests_headers, -) +from home.src.index.subtitle import YoutubeSubtitle +from home.src.ta.helper import DurationConverter, clean_string, randomizor from home.src.ta.ta_redis import RedisArchivist from ryd_client import ryd_client -class YoutubeSubtitle: - """handle video subtitle functionality""" - - def __init__(self, video): - self.video = video - self.languages = False - - def _sub_conf_parse(self): - """add additional conf values to self""" - languages_raw = self.video.config["downloads"]["subtitle"] - if languages_raw: - self.languages = [i.strip() for i in languages_raw.split(",")] - - def get_subtitles(self): - """check what to do""" - self._sub_conf_parse() - if not self.languages: - # no subtitles - return False - - relevant_subtitles = [] - for lang in self.languages: - user_sub = self._get_user_subtitles(lang) - if user_sub: - relevant_subtitles.append(user_sub) - continue - - if self.video.config["downloads"]["subtitle_source"] == "auto": - auto_cap = self._get_auto_caption(lang) - if auto_cap: - relevant_subtitles.append(auto_cap) - - return relevant_subtitles - - def _get_auto_caption(self, lang): - """get auto_caption subtitles""" - 
print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles") - all_subtitles = self.video.youtube_meta.get("automatic_captions") - - if not all_subtitles: - return False - - video_media_url = self.video.json_data["media_url"] - media_url = video_media_url.replace(".mp4", f".{lang}.vtt") - all_formats = all_subtitles.get(lang) - if not all_formats: - return False - - subtitle = [i for i in all_formats if i["ext"] == "json3"][0] - subtitle.update( - {"lang": lang, "source": "auto", "media_url": media_url} - ) - - return subtitle - - def _normalize_lang(self): - """normalize country specific language keys""" - all_subtitles = self.video.youtube_meta.get("subtitles") - if not all_subtitles: - return False - - all_keys = list(all_subtitles.keys()) - for key in all_keys: - lang = key.split("-")[0] - old = all_subtitles.pop(key) - if lang == "live_chat": - continue - all_subtitles[lang] = old - - return all_subtitles - - def _get_user_subtitles(self, lang): - """get subtitles uploaded from channel owner""" - print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles") - all_subtitles = self._normalize_lang() - if not all_subtitles: - return False - - video_media_url = self.video.json_data["media_url"] - media_url = video_media_url.replace(".mp4", f".{lang}.vtt") - all_formats = all_subtitles.get(lang) - if not all_formats: - # no user subtitles found - return False - - subtitle = [i for i in all_formats if i["ext"] == "json3"][0] - subtitle.update( - {"lang": lang, "source": "user", "media_url": media_url} - ) - - return subtitle - - def download_subtitles(self, relevant_subtitles): - """download subtitle files to archive""" - videos_base = self.video.config["application"]["videos"] - for subtitle in relevant_subtitles: - dest_path = os.path.join(videos_base, subtitle["media_url"]) - source = subtitle["source"] - lang = subtitle.get("lang") - response = requests.get( - subtitle["url"], headers=requests_headers() - ) - if not response.ok: - print(f"{self.video.youtube_id}: failed to download subtitle") - print(response.text) - continue - - parser = SubtitleParser(response.text, lang, source) - parser.process() - subtitle_str = parser.get_subtitle_str() - self._write_subtitle_file(dest_path, subtitle_str) - if self.video.config["downloads"]["subtitle_index"]: - query_str = parser.create_bulk_import(self.video, source) - self._index_subtitle(query_str) - - @staticmethod - def _write_subtitle_file(dest_path, subtitle_str): - """write subtitle file to disk""" - # create folder here for first video of channel - os.makedirs(os.path.split(dest_path)[0], exist_ok=True) - with open(dest_path, "w", encoding="utf-8") as subfile: - subfile.write(subtitle_str) - - @staticmethod - def _index_subtitle(query_str): - """send subtitle to es for indexing""" - _, _ = ElasticWrap("_bulk").post(data=query_str, ndjson=True) - - def delete(self, subtitles=False): - """delete subtitles from index and filesystem""" - youtube_id = self.video.youtube_id - videos_base = self.video.config["application"]["videos"] - # delete files - if subtitles: - files = [i["media_url"] for i in subtitles] - else: - if not self.video.json_data.get("subtitles"): - return - - files = [i["media_url"] for i in self.video.json_data["subtitles"]] - - for file_name in files: - file_path = os.path.join(videos_base, file_name) - try: - os.remove(file_path) - except FileNotFoundError: - print(f"{youtube_id}: {file_path} failed to delete") - # delete from index - path = "ta_subtitle/_delete_by_query?refresh=true" - data = {"query": 
{"term": {"youtube_id": {"value": youtube_id}}}} - _, _ = ElasticWrap(path).post(data=data) - - -class SubtitleParser: - """parse subtitle str from youtube""" - - def __init__(self, subtitle_str, lang, source): - self.subtitle_raw = json.loads(subtitle_str) - self.lang = lang - self.source = source - self.all_cues = False - - def process(self): - """extract relevant que data""" - all_events = self.subtitle_raw.get("events") - if self.source == "auto": - all_events = self._flat_auto_caption(all_events) - - self.all_cues = [] - for idx, event in enumerate(all_events): - if "dDurationMs" not in event or "segs" not in event: - # some events won't have a duration or segs - print(f"skipping subtitle event without content: {event}") - continue - - cue = { - "start": self._ms_conv(event["tStartMs"]), - "end": self._ms_conv(event["tStartMs"] + event["dDurationMs"]), - "text": "".join([i.get("utf8") for i in event["segs"]]), - "idx": idx + 1, - } - self.all_cues.append(cue) - - @staticmethod - def _flat_auto_caption(all_events): - """flatten autocaption segments""" - flatten = [] - for event in all_events: - if "segs" not in event.keys(): - continue - text = "".join([i.get("utf8") for i in event.get("segs")]) - if not text.strip(): - continue - - if flatten: - # fix overlapping retiming issue - last = flatten[-1] - if "dDurationMs" not in last or "segs" not in last: - # some events won't have a duration or segs - print(f"skipping subtitle event without content: {event}") - continue - - last_end = last["tStartMs"] + last["dDurationMs"] - if event["tStartMs"] < last_end: - joined = last["segs"][0]["utf8"] + "\n" + text - last["segs"][0]["utf8"] = joined - continue - - event.update({"segs": [{"utf8": text}]}) - flatten.append(event) - - return flatten - - @staticmethod - def _ms_conv(ms): - """convert ms to timestamp""" - hours = str((ms // (1000 * 60 * 60)) % 24).zfill(2) - minutes = str((ms // (1000 * 60)) % 60).zfill(2) - secs = str((ms // 1000) % 60).zfill(2) - millis = str(ms % 1000).zfill(3) - - return f"{hours}:{minutes}:{secs}.{millis}" - - def get_subtitle_str(self): - """create vtt text str from cues""" - subtitle_str = f"WEBVTT\nKind: captions\nLanguage: {self.lang}" - - for cue in self.all_cues: - stamp = f"{cue.get('start')} --> {cue.get('end')}" - cue_text = f"\n\n{cue.get('idx')}\n{stamp}\n{cue.get('text')}" - subtitle_str = subtitle_str + cue_text - - return subtitle_str - - def create_bulk_import(self, video, source): - """subtitle lines for es import""" - documents = self._create_documents(video, source) - bulk_list = [] - - for document in documents: - document_id = document.get("subtitle_fragment_id") - action = {"index": {"_index": "ta_subtitle", "_id": document_id}} - bulk_list.append(json.dumps(action)) - bulk_list.append(json.dumps(document)) - - bulk_list.append("\n") - query_str = "\n".join(bulk_list) - - return query_str - - def _create_documents(self, video, source): - """process documents""" - documents = self._chunk_list(video.youtube_id) - channel = video.json_data.get("channel") - meta_dict = { - "youtube_id": video.youtube_id, - "title": video.json_data.get("title"), - "subtitle_channel": channel.get("channel_name"), - "subtitle_channel_id": channel.get("channel_id"), - "subtitle_last_refresh": int(datetime.now().strftime("%s")), - "subtitle_lang": self.lang, - "subtitle_source": source, - } - - _ = [i.update(meta_dict) for i in documents] - - return documents - - def _chunk_list(self, youtube_id): - """join cues for bulk import""" - chunk_list = [] - - chunk = {} - for 
cue in self.all_cues: - if chunk: - text = f"{chunk.get('subtitle_line')} {cue.get('text')}\n" - chunk["subtitle_line"] = text - else: - idx = len(chunk_list) + 1 - chunk = { - "subtitle_index": idx, - "subtitle_line": cue.get("text"), - "subtitle_start": cue.get("start"), - } - - chunk["subtitle_fragment_id"] = f"{youtube_id}-{self.lang}-{idx}" - - if cue["idx"] % 5 == 0: - chunk["subtitle_end"] = cue.get("end") - chunk_list.append(chunk) - chunk = {} - - return chunk_list - - class SponsorBlock: """handle sponsor block integration""" @@ -631,8 +331,8 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): return False dislikes = { - "dislike_count": result["dislikes"], - "average_rating": result["rating"], + "dislike_count": result.get("dislikes", 0), + "average_rating": result.get("rating", 0), } self.json_data["stats"].update(dislikes) @@ -644,13 +344,37 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): if sponsorblock: self.json_data["sponsorblock"] = sponsorblock - def check_subtitles(self): + def check_subtitles(self, subtitle_files=False): """optionally add subtitles""" + if self.offline_import and subtitle_files: + indexed = self._offline_subtitles(subtitle_files) + self.json_data["subtitles"] = indexed + return + handler = YoutubeSubtitle(self) subtitles = handler.get_subtitles() if subtitles: - self.json_data["subtitles"] = subtitles - handler.download_subtitles(relevant_subtitles=subtitles) + indexed = handler.download_subtitles(relevant_subtitles=subtitles) + self.json_data["subtitles"] = indexed + + def _offline_subtitles(self, subtitle_files): + """import offline subtitles""" + base_name, _ = os.path.splitext(self.json_data["media_url"]) + subtitles = [] + for subtitle in subtitle_files: + lang = subtitle.split(".")[-2] + subtitle_media_url = f"{base_name}.{lang}.vtt" + to_add = { + "ext": "vtt", + "url": False, + "name": lang, + "lang": lang, + "source": "file", + "media_url": subtitle_media_url, + } + subtitles.append(to_add) + + return subtitles def update_media_url(self): """update only media_url in es for reindex channel rename""" diff --git a/tubearchivist/home/src/ta/helper.py b/tubearchivist/home/src/ta/helper.py index bc943ecc..314ecd79 100644 --- a/tubearchivist/home/src/ta/helper.py +++ b/tubearchivist/home/src/ta/helper.py @@ -158,12 +158,12 @@ class UrlListParser: if parsed.query: # detect from query string query_parsed = parse_qs(parsed.query) - if "v" in query_parsed.keys(): + if "v" in query_parsed: youtube_id = query_parsed["v"][0] _ = self.find_valid_id(youtube_id) return youtube_id, "video" - if "list" in query_parsed.keys(): + if "list" in query_parsed: youtube_id = query_parsed["list"][0] return youtube_id, "playlist" @@ -202,13 +202,19 @@ class UrlListParser: "playlistend": 0, } url_info = YtWrap(obs_request).extract(url) - try: - channel_id = url_info["channel_id"] - except KeyError as error: - print(f"failed to extract channel id from {url}") - raise ValueError from error + channel_id = url_info.get("channel_id", False) + if channel_id: + return channel_id - return channel_id + url = url_info.get("url", False) + if url: + # handle old channel name redirect with url path split + channel_id = urlparse(url).path.strip("/").split("/")[1] + + return channel_id + + print(f"failed to extract channel id from {url}") + raise ValueError class DurationConverter: diff --git a/tubearchivist/home/tasks.py b/tubearchivist/home/tasks.py index b3da7621..ae8e43e9 100644 --- a/tubearchivist/home/tasks.py +++ b/tubearchivist/home/tasks.py @@ -162,8 +162,19 @@ def 
run_manual_import(): @shared_task(name="run_backup") def run_backup(reason="auto"): """called from settings page, dump backup to zip file""" - backup_all_indexes(reason) - print("backup finished") + have_lock = False + my_lock = RedisArchivist().get_lock("run_backup") + + try: + have_lock = my_lock.acquire(blocking=False) + if have_lock: + backup_all_indexes(reason) + else: + print("Did not acquire lock for backup task.") + finally: + if have_lock: + my_lock.release() + print("backup finished") @shared_task diff --git a/tubearchivist/home/templates/home/settings.html b/tubearchivist/home/templates/home/settings.html index 6ec29e19..6477013d 100644 --- a/tubearchivist/home/templates/home/settings.html +++ b/tubearchivist/home/templates/home/settings.html @@ -34,8 +34,8 @@

             Subscriptions
-            Current page size: {{ config.subscriptions.channel_size }}
-            Recent videos for channels and playlist to check when running Rescan subscriptions, max recommended 50.
+            YouTube page size: {{ config.subscriptions.channel_size }}
+            Videos to scan to find new items for the Rescan subscriptions task, max recommended 50.
             {{ app_form.subscriptions_channel_size }}
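For context on the `CSRF_TRUSTED_ORIGINS` change in `tubearchivist/config/settings.py` earlier in this diff: the trusted origins are now derived from the same `TA_HOST` value that feeds `ALLOWED_HOSTS`. Below is a minimal standalone sketch of that logic; the `TA_HOST` value is made up purely for illustration and is not part of the codebase.

```python
# Sketch of the new CSRF origin handling; the TA_HOST value below is hypothetical.
from os import environ

environ["TA_HOST"] = "tube.example.com https://ta.example.com"

ALLOWED_HOSTS = [i.strip() for i in environ.get("TA_HOST").split()]

CSRF_TRUSTED_ORIGINS = []
for host in ALLOWED_HOSTS:
    if host.startswith("http://") or host.startswith("https://"):
        # entries that already carry a scheme are trusted as-is
        CSRF_TRUSTED_ORIGINS.append(host)
    else:
        # bare hostnames are prefixed with http://, matching the change above
        CSRF_TRUSTED_ORIGINS.append(f"http://{host}")

print(CSRF_TRUSTED_ORIGINS)
# ['http://tube.example.com', 'https://ta.example.com']
```

The practical effect: a scheme given in `TA_HOST` is respected, while plain hostnames default to `http://`, which Django 4.x requires for entries in `CSRF_TRUSTED_ORIGINS`.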
diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index f9729822..0d9a260f 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -7,7 +7,7 @@ djangorestframework==3.13.1 Pillow==9.2.0 redis==4.3.4 requests==2.28.1 -ryd-client==0.0.3 +ryd-client==0.0.6 uWSGI==2.0.20 whitenoise==6.2.0 -yt_dlp==2022.8.8 +yt_dlp==2022.8.14 diff --git a/tubearchivist/static/css/style.css b/tubearchivist/static/css/style.css index 55c0ac4b..cb54d71e 100644 --- a/tubearchivist/static/css/style.css +++ b/tubearchivist/static/css/style.css @@ -459,6 +459,8 @@ video:-webkit-full-screen { .video-item.list { display: grid; grid-template-columns: 26% auto; + background-color: var(--highlight-bg); + align-items: center; } .video-progress-bar { @@ -508,8 +510,7 @@ video:-webkit-full-screen { .video-desc.list { padding: 10px; - height: unset; - background-color: var(--highlight-bg); + height: 100%; display: flex; flex-wrap: wrap-reverse; align-content: center;
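To tie the documentation change at the top of this diff to the new offline-import code paths, here is a hypothetical */cache/import* layout (all file names invented) that would satisfy the naming rules described under *Manual Media Files Import*:

```
/cache/import/
├── My Talk.mp4          # media file, defines the base name
├── My Talk.info.json    # metadata as written by youtube-dl/yt-dlp
├── My Talk.jpg          # optional thumbnail (.jpg, .png or .webp)
└── My Talk.en.vtt       # optional subtitle, two-letter ISO 639-1 language code
```

Everything shares the base name `My Talk`, which is how the importer associates the metadata, thumbnail and subtitle files with the media file.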