refactor filesystem class task

2 years ago · 0ab41f74dc
parent 1ab7127ed3
commit 0ab41f74dc
2 changed files with 90 additions and 66 deletions
--- a/tubearchivist/home/src/index/filesystem.py
+++ b/tubearchivist/home/src/index/filesystem.py
@ -19,21 +19,25 @@ from PIL import ImageFile
 ImageFile.LOAD_TRUNCATED_IMAGES = True
-class FilesystemScanner:
+class ScannerBase:
-    """handle scanning and fixing from filesystem"""
+    """scan the filesystem base class"""
    CONFIG = AppConfig().config
    VIDEOS = CONFIG["application"]["videos"]
    def __init__(self):
-        self.all_downloaded = self.get_all_downloaded()
+        self.to_index = False
-        self.all_indexed = self.get_all_indexed()
+        self.to_delete = False
-        self.mismatch = None
+        self.mismatch = False
-        self.to_rename = None
+        self.to_rename = False
-        self.to_index = None
+
-        self.to_delete = None
+    def scan(self):
-
+        """entry point, scan and compare"""
-    def get_all_downloaded(self):
+        all_downloaded = self._get_all_downloaded()
        all_indexed = self._get_all_indexed()
        self.list_comarison(all_downloaded, all_indexed)
    def _get_all_downloaded(self):
        """get a list of all video files downloaded"""
        channels = os.listdir(self.VIDEOS)
        all_channels = ignore_filelist(channels)
@ -51,7 +55,7 @@ class FilesystemScanner:
        return all_downloaded
    @staticmethod
-    def get_all_indexed():
+    def _get_all_indexed():
        """get a list of all indexed videos"""
        index_handler = PendingList()
        index_handler.get_download()
@ -66,64 +70,77 @@ class FilesystemScanner:
            all_indexed.append((youtube_id, media_url, published, title))
        return all_indexed
-    def list_comarison(self):
+    def list_comarison(self, all_downloaded, all_indexed):
        """compare the lists to figure out what to do"""
-        self.find_unindexed()
+        self._find_unindexed(all_downloaded, all_indexed)
-        self.find_missing()
+        self._find_missing(all_downloaded, all_indexed)
-        self.find_bad_media_url()
+        self._find_bad_media_url(all_downloaded, all_indexed)
-    def find_unindexed(self):
+    def _find_unindexed(self, all_downloaded, all_indexed):
        """find video files without a matching document indexed"""
-        all_indexed_ids = [i[0] for i in self.all_indexed]
+        all_indexed_ids = [i[0] for i in all_indexed]
-        to_index = []
+        self.to_index = []
-        for downloaded in self.all_downloaded:
+        for downloaded in all_downloaded:
            if downloaded[2] not in all_indexed_ids:
-                to_index.append(downloaded)
+                self.to_index.append(downloaded)
        self.to_index = to_index
-    def find_missing(self):
+    def _find_missing(self, all_downloaded, all_indexed):
        """find indexed videos without matching media file"""
-        all_downloaded_ids = [i[2] for i in self.all_downloaded]
+        all_downloaded_ids = [i[2] for i in all_downloaded]
-        to_delete = []
+        self.to_delete = []
-        for video in self.all_indexed:
+        for video in all_indexed:
            youtube_id = video[0]
            if youtube_id not in all_downloaded_ids:
-                to_delete.append(video)
+                self.to_delete.append(video)
        self.to_delete = to_delete
-    def find_bad_media_url(self):
+    def _find_bad_media_url(self, all_downloaded, all_indexed):
        """rename media files not matching the indexed title"""
-        to_fix = []
+        self.mismatch = []
-        to_rename = []
+        self.to_rename = []
-        for downloaded in self.all_downloaded:
+
        for downloaded in all_downloaded:
            channel, filename, downloaded_id = downloaded
            # find in indexed
-            for indexed in self.all_indexed:
+            for indexed in all_indexed:
                indexed_id, media_url, published, title = indexed
                if indexed_id == downloaded_id:
                    # found it
                    title_c = clean_string(title)
                    pub = published.replace("-", "")
-                    expected_filename = f"{pub}_{indexed_id}_{title_c}.mp4"
+                    expected = f"{pub}_{indexed_id}_{clean_string(title)}.mp4"
-                    new_url = os.path.join(channel, expected_filename)
+                    new_url = os.path.join(channel, expected)
-                    if expected_filename != filename:
+                    if expected != filename:
                        # file to rename
-                        to_rename.append(
+                        self.to_rename.append((channel, filename, expected))
                            (channel, filename, expected_filename)
                        )
                    if media_url != new_url:
                        # media_url to update in es
-                        to_fix.append((indexed_id, new_url))
+                        self.mismatch.append((indexed_id, new_url))
                    break
-        self.mismatch = to_fix
+
-        self.to_rename = to_rename
+class Filesystem(ScannerBase):
    """handle scanning and fixing from filesystem"""
    def __init__(self, task=False):
        super().__init__()
        self.task = task
    def process(self):
        """entry point"""
        self.task.send_progress(["Scanning your archive and index."])
        self.scan()
        self.rename_files()
        self.send_mismatch_bulk()
        self.delete_from_index()
        self.add_missing()
    def rename_files(self):
        """rename media files as identified by find_bad_media_url"""
        if not self.to_rename:
            return
        total = len(self.to_rename)
        self.task.send_progress([f"Rename {total} media files."])
        for bad_filename in self.to_rename:
            channel, filename, expected_filename = bad_filename
            print(f"renaming [{filename}] to [{expected_filename}]")
@ -133,6 +150,11 @@ class FilesystemScanner:
    def send_mismatch_bulk(self):
        """build bulk update"""
        if not self.mismatch:
            return
        total = len(self.mismatch)
        self.task.send_progress([f"Fix media urls for {total} files"])
        bulk_list = []
        for video_mismatch in self.mismatch:
            youtube_id, media_url = video_mismatch
@ -148,30 +170,32 @@ class FilesystemScanner:
    def delete_from_index(self):
        """find indexed but deleted mediafile"""
        if not self.to_delete:
            return
        total = len(self.to_delete)
        self.task.send_progress([f"Clean up {total} items from index."])
        for indexed in self.to_delete:
            youtube_id = indexed[0]
            print(f"deleting {youtube_id} from index")
            path = f"ta_video/_doc/{youtube_id}"
            _, _ = ElasticWrap(path).delete()
-
+    def add_missing(self):
-def scan_filesystem():
+        """add missing videos to index"""
-    """grouped function to delete and update index"""
+        video_ids = [i[2] for i in self.to_index]
-    filesystem_handler = FilesystemScanner()
+        if not video_ids:
-    filesystem_handler.list_comarison()
+            return
-    if filesystem_handler.to_rename:
+
-        print("renaming files")
+        total = len(video_ids)
-        filesystem_handler.rename_files()
+        for idx, youtube_id in enumerate(video_ids):
-    if filesystem_handler.mismatch:
+            if self.task:
-        print("fixing media urls in index")
+                self.task.send_progress(
-        filesystem_handler.send_mismatch_bulk()
+                    message_lines=[
-    if filesystem_handler.to_delete:
+                        f"Index missing video {youtube_id}, {idx}/{total}"
-        print("delete metadata from index")
+                    ],
-        filesystem_handler.delete_from_index()
+                    progress=(idx + 1) / total,
-    if filesystem_handler.to_index:
+                )
        print("index new videos")
        video_ids = [i[2] for i in filesystem_handler.to_index]
        for youtube_id in video_ids:
            index_new_video(youtube_id)
-        CommentList(video_ids).index()
+        CommentList(video_ids, task=self.task).index()
--- a/tubearchivist/home/tasks.py
+++ b/tubearchivist/home/tasks.py
@ -19,7 +19,7 @@ from home.src.download.yt_dlp_handler import VideoDownloader
 from home.src.es.backup import ElasticBackup
 from home.src.es.index_setup import ElasitIndexWrap
 from home.src.index.channel import YoutubeChannel
-from home.src.index.filesystem import scan_filesystem
+from home.src.index.filesystem import Filesystem
 from home.src.index.manual import ImportFolderScanner
 from home.src.index.reindex import Reindex, ReindexManual, ReindexOutdated
 from home.src.ta.config import AppConfig, ReleaseVersion, ScheduleBuilder
@ -272,7 +272,7 @@ def kill_dl(task_id):
    RedisArchivist().set_message("message:download", mess_dict, expire=True)
-@shared_task(bind=True, name="rescan_filesystem")
+@shared_task(bind=True, name="rescan_filesystem", base=BaseTask)
 def rescan_filesystem(self):
    """check the media folder for mismatches"""
    manager = TaskManager()
@ -281,7 +281,7 @@ def rescan_filesystem(self):
        return
    manager.init(self)
-    scan_filesystem()
+    Filesystem(task=self).process()
    ThumbValidator().download_missing()