|
|
@ -19,21 +19,25 @@ from PIL import ImageFile
|
|
|
|
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
|
|
|
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FilesystemScanner:
|
|
|
|
class ScannerBase:
|
|
|
|
"""handle scanning and fixing from filesystem"""
|
|
|
|
"""scan the filesystem base class"""
|
|
|
|
|
|
|
|
|
|
|
|
CONFIG = AppConfig().config
|
|
|
|
CONFIG = AppConfig().config
|
|
|
|
VIDEOS = CONFIG["application"]["videos"]
|
|
|
|
VIDEOS = CONFIG["application"]["videos"]
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self):
|
|
|
|
def __init__(self):
|
|
|
|
self.all_downloaded = self.get_all_downloaded()
|
|
|
|
self.to_index = False
|
|
|
|
self.all_indexed = self.get_all_indexed()
|
|
|
|
self.to_delete = False
|
|
|
|
self.mismatch = None
|
|
|
|
self.mismatch = False
|
|
|
|
self.to_rename = None
|
|
|
|
self.to_rename = False
|
|
|
|
self.to_index = None
|
|
|
|
|
|
|
|
self.to_delete = None
|
|
|
|
def scan(self):
|
|
|
|
|
|
|
|
"""entry point, scan and compare"""
|
|
|
|
def get_all_downloaded(self):
|
|
|
|
all_downloaded = self._get_all_downloaded()
|
|
|
|
|
|
|
|
all_indexed = self._get_all_indexed()
|
|
|
|
|
|
|
|
self.list_comarison(all_downloaded, all_indexed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_all_downloaded(self):
|
|
|
|
"""get a list of all video files downloaded"""
|
|
|
|
"""get a list of all video files downloaded"""
|
|
|
|
channels = os.listdir(self.VIDEOS)
|
|
|
|
channels = os.listdir(self.VIDEOS)
|
|
|
|
all_channels = ignore_filelist(channels)
|
|
|
|
all_channels = ignore_filelist(channels)
|
|
|
@ -51,7 +55,7 @@ class FilesystemScanner:
|
|
|
|
return all_downloaded
|
|
|
|
return all_downloaded
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
@staticmethod
|
|
|
|
def get_all_indexed():
|
|
|
|
def _get_all_indexed():
|
|
|
|
"""get a list of all indexed videos"""
|
|
|
|
"""get a list of all indexed videos"""
|
|
|
|
index_handler = PendingList()
|
|
|
|
index_handler = PendingList()
|
|
|
|
index_handler.get_download()
|
|
|
|
index_handler.get_download()
|
|
|
@ -66,64 +70,77 @@ class FilesystemScanner:
|
|
|
|
all_indexed.append((youtube_id, media_url, published, title))
|
|
|
|
all_indexed.append((youtube_id, media_url, published, title))
|
|
|
|
return all_indexed
|
|
|
|
return all_indexed
|
|
|
|
|
|
|
|
|
|
|
|
def list_comarison(self):
|
|
|
|
def list_comarison(self, all_downloaded, all_indexed):
|
|
|
|
"""compare the lists to figure out what to do"""
|
|
|
|
"""compare the lists to figure out what to do"""
|
|
|
|
self.find_unindexed()
|
|
|
|
self._find_unindexed(all_downloaded, all_indexed)
|
|
|
|
self.find_missing()
|
|
|
|
self._find_missing(all_downloaded, all_indexed)
|
|
|
|
self.find_bad_media_url()
|
|
|
|
self._find_bad_media_url(all_downloaded, all_indexed)
|
|
|
|
|
|
|
|
|
|
|
|
def find_unindexed(self):
|
|
|
|
def _find_unindexed(self, all_downloaded, all_indexed):
|
|
|
|
"""find video files without a matching document indexed"""
|
|
|
|
"""find video files without a matching document indexed"""
|
|
|
|
all_indexed_ids = [i[0] for i in self.all_indexed]
|
|
|
|
all_indexed_ids = [i[0] for i in all_indexed]
|
|
|
|
to_index = []
|
|
|
|
self.to_index = []
|
|
|
|
for downloaded in self.all_downloaded:
|
|
|
|
for downloaded in all_downloaded:
|
|
|
|
if downloaded[2] not in all_indexed_ids:
|
|
|
|
if downloaded[2] not in all_indexed_ids:
|
|
|
|
to_index.append(downloaded)
|
|
|
|
self.to_index.append(downloaded)
|
|
|
|
|
|
|
|
|
|
|
|
self.to_index = to_index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_missing(self):
|
|
|
|
def _find_missing(self, all_downloaded, all_indexed):
|
|
|
|
"""find indexed videos without matching media file"""
|
|
|
|
"""find indexed videos without matching media file"""
|
|
|
|
all_downloaded_ids = [i[2] for i in self.all_downloaded]
|
|
|
|
all_downloaded_ids = [i[2] for i in all_downloaded]
|
|
|
|
to_delete = []
|
|
|
|
self.to_delete = []
|
|
|
|
for video in self.all_indexed:
|
|
|
|
for video in all_indexed:
|
|
|
|
youtube_id = video[0]
|
|
|
|
youtube_id = video[0]
|
|
|
|
if youtube_id not in all_downloaded_ids:
|
|
|
|
if youtube_id not in all_downloaded_ids:
|
|
|
|
to_delete.append(video)
|
|
|
|
self.to_delete.append(video)
|
|
|
|
|
|
|
|
|
|
|
|
self.to_delete = to_delete
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_bad_media_url(self):
|
|
|
|
def _find_bad_media_url(self, all_downloaded, all_indexed):
|
|
|
|
"""rename media files not matching the indexed title"""
|
|
|
|
"""rename media files not matching the indexed title"""
|
|
|
|
to_fix = []
|
|
|
|
self.mismatch = []
|
|
|
|
to_rename = []
|
|
|
|
self.to_rename = []
|
|
|
|
for downloaded in self.all_downloaded:
|
|
|
|
|
|
|
|
|
|
|
|
for downloaded in all_downloaded:
|
|
|
|
channel, filename, downloaded_id = downloaded
|
|
|
|
channel, filename, downloaded_id = downloaded
|
|
|
|
# find in indexed
|
|
|
|
# find in indexed
|
|
|
|
for indexed in self.all_indexed:
|
|
|
|
for indexed in all_indexed:
|
|
|
|
indexed_id, media_url, published, title = indexed
|
|
|
|
indexed_id, media_url, published, title = indexed
|
|
|
|
if indexed_id == downloaded_id:
|
|
|
|
if indexed_id == downloaded_id:
|
|
|
|
# found it
|
|
|
|
# found it
|
|
|
|
title_c = clean_string(title)
|
|
|
|
|
|
|
|
pub = published.replace("-", "")
|
|
|
|
pub = published.replace("-", "")
|
|
|
|
expected_filename = f"{pub}_{indexed_id}_{title_c}.mp4"
|
|
|
|
expected = f"{pub}_{indexed_id}_{clean_string(title)}.mp4"
|
|
|
|
new_url = os.path.join(channel, expected_filename)
|
|
|
|
new_url = os.path.join(channel, expected)
|
|
|
|
if expected_filename != filename:
|
|
|
|
if expected != filename:
|
|
|
|
# file to rename
|
|
|
|
# file to rename
|
|
|
|
to_rename.append(
|
|
|
|
self.to_rename.append((channel, filename, expected))
|
|
|
|
(channel, filename, expected_filename)
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
if media_url != new_url:
|
|
|
|
if media_url != new_url:
|
|
|
|
# media_url to update in es
|
|
|
|
# media_url to update in es
|
|
|
|
to_fix.append((indexed_id, new_url))
|
|
|
|
self.mismatch.append((indexed_id, new_url))
|
|
|
|
|
|
|
|
|
|
|
|
break
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
self.mismatch = to_fix
|
|
|
|
|
|
|
|
self.to_rename = to_rename
|
|
|
|
class Filesystem(ScannerBase):
|
|
|
|
|
|
|
|
"""handle scanning and fixing from filesystem"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, task=False):
|
|
|
|
|
|
|
|
super().__init__()
|
|
|
|
|
|
|
|
self.task = task
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process(self):
|
|
|
|
|
|
|
|
"""entry point"""
|
|
|
|
|
|
|
|
self.task.send_progress(["Scanning your archive and index."])
|
|
|
|
|
|
|
|
self.scan()
|
|
|
|
|
|
|
|
self.rename_files()
|
|
|
|
|
|
|
|
self.send_mismatch_bulk()
|
|
|
|
|
|
|
|
self.delete_from_index()
|
|
|
|
|
|
|
|
self.add_missing()
|
|
|
|
|
|
|
|
|
|
|
|
def rename_files(self):
|
|
|
|
def rename_files(self):
|
|
|
|
"""rename media files as identified by find_bad_media_url"""
|
|
|
|
"""rename media files as identified by find_bad_media_url"""
|
|
|
|
|
|
|
|
if not self.to_rename:
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
total = len(self.to_rename)
|
|
|
|
|
|
|
|
self.task.send_progress([f"Rename {total} media files."])
|
|
|
|
for bad_filename in self.to_rename:
|
|
|
|
for bad_filename in self.to_rename:
|
|
|
|
channel, filename, expected_filename = bad_filename
|
|
|
|
channel, filename, expected_filename = bad_filename
|
|
|
|
print(f"renaming [{filename}] to [{expected_filename}]")
|
|
|
|
print(f"renaming [{filename}] to [{expected_filename}]")
|
|
|
@ -133,6 +150,11 @@ class FilesystemScanner:
|
|
|
|
|
|
|
|
|
|
|
|
def send_mismatch_bulk(self):
|
|
|
|
def send_mismatch_bulk(self):
|
|
|
|
"""build bulk update"""
|
|
|
|
"""build bulk update"""
|
|
|
|
|
|
|
|
if not self.mismatch:
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
total = len(self.mismatch)
|
|
|
|
|
|
|
|
self.task.send_progress([f"Fix media urls for {total} files"])
|
|
|
|
bulk_list = []
|
|
|
|
bulk_list = []
|
|
|
|
for video_mismatch in self.mismatch:
|
|
|
|
for video_mismatch in self.mismatch:
|
|
|
|
youtube_id, media_url = video_mismatch
|
|
|
|
youtube_id, media_url = video_mismatch
|
|
|
@ -148,30 +170,32 @@ class FilesystemScanner:
|
|
|
|
|
|
|
|
|
|
|
|
def delete_from_index(self):
|
|
|
|
def delete_from_index(self):
|
|
|
|
"""find indexed but deleted mediafile"""
|
|
|
|
"""find indexed but deleted mediafile"""
|
|
|
|
|
|
|
|
if not self.to_delete:
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
total = len(self.to_delete)
|
|
|
|
|
|
|
|
self.task.send_progress([f"Clean up {total} items from index."])
|
|
|
|
for indexed in self.to_delete:
|
|
|
|
for indexed in self.to_delete:
|
|
|
|
youtube_id = indexed[0]
|
|
|
|
youtube_id = indexed[0]
|
|
|
|
print(f"deleting {youtube_id} from index")
|
|
|
|
print(f"deleting {youtube_id} from index")
|
|
|
|
path = f"ta_video/_doc/{youtube_id}"
|
|
|
|
path = f"ta_video/_doc/{youtube_id}"
|
|
|
|
_, _ = ElasticWrap(path).delete()
|
|
|
|
_, _ = ElasticWrap(path).delete()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def add_missing(self):
|
|
|
|
def scan_filesystem():
|
|
|
|
"""add missing videos to index"""
|
|
|
|
"""grouped function to delete and update index"""
|
|
|
|
video_ids = [i[2] for i in self.to_index]
|
|
|
|
filesystem_handler = FilesystemScanner()
|
|
|
|
if not video_ids:
|
|
|
|
filesystem_handler.list_comarison()
|
|
|
|
return
|
|
|
|
if filesystem_handler.to_rename:
|
|
|
|
|
|
|
|
print("renaming files")
|
|
|
|
total = len(video_ids)
|
|
|
|
filesystem_handler.rename_files()
|
|
|
|
for idx, youtube_id in enumerate(video_ids):
|
|
|
|
if filesystem_handler.mismatch:
|
|
|
|
if self.task:
|
|
|
|
print("fixing media urls in index")
|
|
|
|
self.task.send_progress(
|
|
|
|
filesystem_handler.send_mismatch_bulk()
|
|
|
|
message_lines=[
|
|
|
|
if filesystem_handler.to_delete:
|
|
|
|
f"Index missing video {youtube_id}, {idx}/{total}"
|
|
|
|
print("delete metadata from index")
|
|
|
|
],
|
|
|
|
filesystem_handler.delete_from_index()
|
|
|
|
progress=(idx + 1) / total,
|
|
|
|
if filesystem_handler.to_index:
|
|
|
|
)
|
|
|
|
print("index new videos")
|
|
|
|
|
|
|
|
video_ids = [i[2] for i in filesystem_handler.to_index]
|
|
|
|
|
|
|
|
for youtube_id in video_ids:
|
|
|
|
|
|
|
|
index_new_video(youtube_id)
|
|
|
|
index_new_video(youtube_id)
|
|
|
|
|
|
|
|
|
|
|
|
CommentList(video_ids).index()
|
|
|
|
CommentList(video_ids, task=self.task).index()
|
|
|
|