refactor filesystem class task

pull/474/head
simon 2 years ago
parent 1ab7127ed3
commit 0ab41f74dc
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4

@ -19,21 +19,25 @@ from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True ImageFile.LOAD_TRUNCATED_IMAGES = True
class FilesystemScanner: class ScannerBase:
"""handle scanning and fixing from filesystem""" """scan the filesystem base class"""
CONFIG = AppConfig().config CONFIG = AppConfig().config
VIDEOS = CONFIG["application"]["videos"] VIDEOS = CONFIG["application"]["videos"]
def __init__(self): def __init__(self):
self.all_downloaded = self.get_all_downloaded() self.to_index = False
self.all_indexed = self.get_all_indexed() self.to_delete = False
self.mismatch = None self.mismatch = False
self.to_rename = None self.to_rename = False
self.to_index = None
self.to_delete = None def scan(self):
"""entry point, scan and compare"""
def get_all_downloaded(self): all_downloaded = self._get_all_downloaded()
all_indexed = self._get_all_indexed()
self.list_comarison(all_downloaded, all_indexed)
def _get_all_downloaded(self):
"""get a list of all video files downloaded""" """get a list of all video files downloaded"""
channels = os.listdir(self.VIDEOS) channels = os.listdir(self.VIDEOS)
all_channels = ignore_filelist(channels) all_channels = ignore_filelist(channels)
@ -51,7 +55,7 @@ class FilesystemScanner:
return all_downloaded return all_downloaded
@staticmethod @staticmethod
def get_all_indexed(): def _get_all_indexed():
"""get a list of all indexed videos""" """get a list of all indexed videos"""
index_handler = PendingList() index_handler = PendingList()
index_handler.get_download() index_handler.get_download()
@ -66,64 +70,77 @@ class FilesystemScanner:
all_indexed.append((youtube_id, media_url, published, title)) all_indexed.append((youtube_id, media_url, published, title))
return all_indexed return all_indexed
def list_comarison(self): def list_comarison(self, all_downloaded, all_indexed):
"""compare the lists to figure out what to do""" """compare the lists to figure out what to do"""
self.find_unindexed() self._find_unindexed(all_downloaded, all_indexed)
self.find_missing() self._find_missing(all_downloaded, all_indexed)
self.find_bad_media_url() self._find_bad_media_url(all_downloaded, all_indexed)
def find_unindexed(self): def _find_unindexed(self, all_downloaded, all_indexed):
"""find video files without a matching document indexed""" """find video files without a matching document indexed"""
all_indexed_ids = [i[0] for i in self.all_indexed] all_indexed_ids = [i[0] for i in all_indexed]
to_index = [] self.to_index = []
for downloaded in self.all_downloaded: for downloaded in all_downloaded:
if downloaded[2] not in all_indexed_ids: if downloaded[2] not in all_indexed_ids:
to_index.append(downloaded) self.to_index.append(downloaded)
self.to_index = to_index
def find_missing(self): def _find_missing(self, all_downloaded, all_indexed):
"""find indexed videos without matching media file""" """find indexed videos without matching media file"""
all_downloaded_ids = [i[2] for i in self.all_downloaded] all_downloaded_ids = [i[2] for i in all_downloaded]
to_delete = [] self.to_delete = []
for video in self.all_indexed: for video in all_indexed:
youtube_id = video[0] youtube_id = video[0]
if youtube_id not in all_downloaded_ids: if youtube_id not in all_downloaded_ids:
to_delete.append(video) self.to_delete.append(video)
self.to_delete = to_delete
def find_bad_media_url(self): def _find_bad_media_url(self, all_downloaded, all_indexed):
"""rename media files not matching the indexed title""" """rename media files not matching the indexed title"""
to_fix = [] self.mismatch = []
to_rename = [] self.to_rename = []
for downloaded in self.all_downloaded:
for downloaded in all_downloaded:
channel, filename, downloaded_id = downloaded channel, filename, downloaded_id = downloaded
# find in indexed # find in indexed
for indexed in self.all_indexed: for indexed in all_indexed:
indexed_id, media_url, published, title = indexed indexed_id, media_url, published, title = indexed
if indexed_id == downloaded_id: if indexed_id == downloaded_id:
# found it # found it
title_c = clean_string(title)
pub = published.replace("-", "") pub = published.replace("-", "")
expected_filename = f"{pub}_{indexed_id}_{title_c}.mp4" expected = f"{pub}_{indexed_id}_{clean_string(title)}.mp4"
new_url = os.path.join(channel, expected_filename) new_url = os.path.join(channel, expected)
if expected_filename != filename: if expected != filename:
# file to rename # file to rename
to_rename.append( self.to_rename.append((channel, filename, expected))
(channel, filename, expected_filename)
)
if media_url != new_url: if media_url != new_url:
# media_url to update in es # media_url to update in es
to_fix.append((indexed_id, new_url)) self.mismatch.append((indexed_id, new_url))
break break
self.mismatch = to_fix
self.to_rename = to_rename class Filesystem(ScannerBase):
"""handle scanning and fixing from filesystem"""
def __init__(self, task=False):
super().__init__()
self.task = task
def process(self):
"""entry point"""
self.task.send_progress(["Scanning your archive and index."])
self.scan()
self.rename_files()
self.send_mismatch_bulk()
self.delete_from_index()
self.add_missing()
def rename_files(self): def rename_files(self):
"""rename media files as identified by find_bad_media_url""" """rename media files as identified by find_bad_media_url"""
if not self.to_rename:
return
total = len(self.to_rename)
self.task.send_progress([f"Rename {total} media files."])
for bad_filename in self.to_rename: for bad_filename in self.to_rename:
channel, filename, expected_filename = bad_filename channel, filename, expected_filename = bad_filename
print(f"renaming [{filename}] to [{expected_filename}]") print(f"renaming [{filename}] to [{expected_filename}]")
@ -133,6 +150,11 @@ class FilesystemScanner:
def send_mismatch_bulk(self): def send_mismatch_bulk(self):
"""build bulk update""" """build bulk update"""
if not self.mismatch:
return
total = len(self.mismatch)
self.task.send_progress([f"Fix media urls for {total} files"])
bulk_list = [] bulk_list = []
for video_mismatch in self.mismatch: for video_mismatch in self.mismatch:
youtube_id, media_url = video_mismatch youtube_id, media_url = video_mismatch
@ -148,30 +170,32 @@ class FilesystemScanner:
def delete_from_index(self): def delete_from_index(self):
"""find indexed but deleted mediafile""" """find indexed but deleted mediafile"""
if not self.to_delete:
return
total = len(self.to_delete)
self.task.send_progress([f"Clean up {total} items from index."])
for indexed in self.to_delete: for indexed in self.to_delete:
youtube_id = indexed[0] youtube_id = indexed[0]
print(f"deleting {youtube_id} from index") print(f"deleting {youtube_id} from index")
path = f"ta_video/_doc/{youtube_id}" path = f"ta_video/_doc/{youtube_id}"
_, _ = ElasticWrap(path).delete() _, _ = ElasticWrap(path).delete()
def add_missing(self):
def scan_filesystem(): """add missing videos to index"""
"""grouped function to delete and update index""" video_ids = [i[2] for i in self.to_index]
filesystem_handler = FilesystemScanner() if not video_ids:
filesystem_handler.list_comarison() return
if filesystem_handler.to_rename:
print("renaming files") total = len(video_ids)
filesystem_handler.rename_files() for idx, youtube_id in enumerate(video_ids):
if filesystem_handler.mismatch: if self.task:
print("fixing media urls in index") self.task.send_progress(
filesystem_handler.send_mismatch_bulk() message_lines=[
if filesystem_handler.to_delete: f"Index missing video {youtube_id}, {idx}/{total}"
print("delete metadata from index") ],
filesystem_handler.delete_from_index() progress=(idx + 1) / total,
if filesystem_handler.to_index: )
print("index new videos")
video_ids = [i[2] for i in filesystem_handler.to_index]
for youtube_id in video_ids:
index_new_video(youtube_id) index_new_video(youtube_id)
CommentList(video_ids).index() CommentList(video_ids, task=self.task).index()

@ -19,7 +19,7 @@ from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.es.backup import ElasticBackup from home.src.es.backup import ElasticBackup
from home.src.es.index_setup import ElasitIndexWrap from home.src.es.index_setup import ElasitIndexWrap
from home.src.index.channel import YoutubeChannel from home.src.index.channel import YoutubeChannel
from home.src.index.filesystem import scan_filesystem from home.src.index.filesystem import Filesystem
from home.src.index.manual import ImportFolderScanner from home.src.index.manual import ImportFolderScanner
from home.src.index.reindex import Reindex, ReindexManual, ReindexOutdated from home.src.index.reindex import Reindex, ReindexManual, ReindexOutdated
from home.src.ta.config import AppConfig, ReleaseVersion, ScheduleBuilder from home.src.ta.config import AppConfig, ReleaseVersion, ScheduleBuilder
@ -272,7 +272,7 @@ def kill_dl(task_id):
RedisArchivist().set_message("message:download", mess_dict, expire=True) RedisArchivist().set_message("message:download", mess_dict, expire=True)
@shared_task(bind=True, name="rescan_filesystem") @shared_task(bind=True, name="rescan_filesystem", base=BaseTask)
def rescan_filesystem(self): def rescan_filesystem(self):
"""check the media folder for mismatches""" """check the media folder for mismatches"""
manager = TaskManager() manager = TaskManager()
@ -281,7 +281,7 @@ def rescan_filesystem(self):
return return
manager.init(self) manager.init(self)
scan_filesystem() Filesystem(task=self).process()
ThumbValidator().download_missing() ThumbValidator().download_missing()

Loading…
Cancel
Save