refactor filesystem scanner

pull/491/head
simon 1 year ago
parent 5334d79d0d
commit 9b30c7df6e

home/src/index/filesystem.py

@@ -1,198 +1,85 @@
 """
 Functionality:
-- reindexing old documents
-- syncing updated values between indexes
 - scan the filesystem to delete or index
 """
 
-import json
 import os
 
-from home.src.download.queue import PendingList
-from home.src.es.connect import ElasticWrap
+from home.src.es.connect import ElasticWrap, IndexPaginate
 from home.src.index.comments import CommentList
-from home.src.index.video import index_new_video
+from home.src.index.video import YoutubeVideo, index_new_video
 from home.src.ta.config import AppConfig
-from home.src.ta.helper import clean_string, ignore_filelist
-from PIL import ImageFile
-
-ImageFile.LOAD_TRUNCATED_IMAGES = True
+from home.src.ta.helper import ignore_filelist
 
 
-class ScannerBase:
-    """scan the filesystem base class"""
+class Scanner:
+    """scan index and filesystem"""
 
-    CONFIG = AppConfig().config
-    VIDEOS = CONFIG["application"]["videos"]
+    VIDEOS = AppConfig().config["application"]["videos"]
 
-    def __init__(self):
-        self.to_index = False
+    def __init__(self, task=False):
+        self.task = task
         self.to_delete = False
-        self.mismatch = False
-        self.to_rename = False
+        self.to_index = False
 
     def scan(self):
-        """entry point, scan and compare"""
-        all_downloaded = self._get_all_downloaded()
-        all_indexed = self._get_all_indexed()
-        self.list_comarison(all_downloaded, all_indexed)
+        """scan the filesystem"""
+        downloaded = self._get_downloaded()
+        indexed = self._get_indexed()
+        self.to_index = downloaded - indexed
+        self.to_delete = indexed - downloaded
 
-    def _get_all_downloaded(self):
-        """get a list of all video files downloaded"""
-        channels = os.listdir(self.VIDEOS)
-        all_channels = ignore_filelist(channels)
-        all_channels.sort()
-        all_downloaded = []
-        for channel_name in all_channels:
-            channel_path = os.path.join(self.VIDEOS, channel_name)
-            channel_files = os.listdir(channel_path)
-            channel_files_clean = ignore_filelist(channel_files)
-            all_videos = [i for i in channel_files_clean if i.endswith(".mp4")]
-            for video in all_videos:
-                youtube_id = video[9:20]
-                all_downloaded.append((channel_name, video, youtube_id))
-
-        return all_downloaded
-
-    @staticmethod
-    def _get_all_indexed():
-        """get a list of all indexed videos"""
-        index_handler = PendingList()
-        index_handler.get_download()
-        index_handler.get_indexed()
-
-        all_indexed = []
-        for video in index_handler.all_videos:
-            youtube_id = video["youtube_id"]
-            media_url = video["media_url"]
-            published = video["published"]
-            title = video["title"]
-            all_indexed.append((youtube_id, media_url, published, title))
-
-        return all_indexed
-
-    def list_comarison(self, all_downloaded, all_indexed):
-        """compare the lists to figure out what to do"""
-        self._find_unindexed(all_downloaded, all_indexed)
-        self._find_missing(all_downloaded, all_indexed)
-        self._find_bad_media_url(all_downloaded, all_indexed)
-
-    def _find_unindexed(self, all_downloaded, all_indexed):
-        """find video files without a matching document indexed"""
-        all_indexed_ids = [i[0] for i in all_indexed]
-        self.to_index = []
-        for downloaded in all_downloaded:
-            if downloaded[2] not in all_indexed_ids:
-                self.to_index.append(downloaded)
-
-    def _find_missing(self, all_downloaded, all_indexed):
-        """find indexed videos without matching media file"""
-        all_downloaded_ids = [i[2] for i in all_downloaded]
-        self.to_delete = []
-        for video in all_indexed:
-            youtube_id = video[0]
-            if youtube_id not in all_downloaded_ids:
-                self.to_delete.append(video)
-
-    def _find_bad_media_url(self, all_downloaded, all_indexed):
-        """rename media files not matching the indexed title"""
-        self.mismatch = []
-        self.to_rename = []
-        for downloaded in all_downloaded:
-            channel, filename, downloaded_id = downloaded
-            # find in indexed
-            for indexed in all_indexed:
-                indexed_id, media_url, published, title = indexed
-                if indexed_id == downloaded_id:
-                    # found it
-                    pub = published.replace("-", "")
-                    expected = f"{pub}_{indexed_id}_{clean_string(title)}.mp4"
-                    new_url = os.path.join(channel, expected)
-                    if expected != filename:
-                        # file to rename
-                        self.to_rename.append((channel, filename, expected))
-                    if media_url != new_url:
-                        # media_url to update in es
-                        self.mismatch.append((indexed_id, new_url))
-
-                    break
-
-
-class Filesystem(ScannerBase):
-    """handle scanning and fixing from filesystem"""
-
-    def __init__(self, task=False):
-        super().__init__()
-        self.task = task
-
-    def process(self):
-        """entry point"""
-        if self.task:
-            self.task.send_progress(["Scanning your archive and index."])
-        self.scan()
-        self.rename_files()
-        self.send_mismatch_bulk()
-        self.delete_from_index()
-        self.add_missing()
-
-    def rename_files(self):
-        """rename media files as identified by find_bad_media_url"""
-        if not self.to_rename:
-            return
-
-        total = len(self.to_rename)
-        if self.task:
-            self.task.send_progress([f"Rename {total} media files."])
-        for bad_filename in self.to_rename:
-            channel, filename, expected_filename = bad_filename
-            print(f"renaming [{filename}] to [{expected_filename}]")
-            old_path = os.path.join(self.VIDEOS, channel, filename)
-            new_path = os.path.join(self.VIDEOS, channel, expected_filename)
-            os.rename(old_path, new_path)
-
-    def send_mismatch_bulk(self):
-        """build bulk update"""
-        if not self.mismatch:
-            return
-
-        total = len(self.mismatch)
-        if self.task:
-            self.task.send_progress([f"Fix media urls for {total} files"])
-        bulk_list = []
-        for video_mismatch in self.mismatch:
-            youtube_id, media_url = video_mismatch
-            print(f"{youtube_id}: fixing media url {media_url}")
-            action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
-            source = {"doc": {"media_url": media_url}}
-            bulk_list.append(json.dumps(action))
-            bulk_list.append(json.dumps(source))
-        # add last newline
-        bulk_list.append("\n")
-        data = "\n".join(bulk_list)
-        _, _ = ElasticWrap("_bulk").post(data=data, ndjson=True)
-
-    def delete_from_index(self):
-        """find indexed but deleted mediafile"""
-        if not self.to_delete:
-            return
-
-        total = len(self.to_delete)
-        if self.task:
-            self.task.send_progress([f"Clean up {total} items from index."])
-        for indexed in self.to_delete:
-            youtube_id = indexed[0]
-            print(f"deleting {youtube_id} from index")
-            path = f"ta_video/_doc/{youtube_id}"
-            _, _ = ElasticWrap(path).delete()
-
-    def add_missing(self):
-        """add missing videos to index"""
-        video_ids = [i[2] for i in self.to_index]
-        if not video_ids:
-            return
-
-        total = len(video_ids)
-        for idx, youtube_id in enumerate(video_ids):
+    def _get_downloaded(self):
+        """get downloaded ids"""
+        if self.task:
+            self.task.send_progress(["Scan your filesystem for videos."])
+
+        downloaded = set()
+        channels = ignore_filelist(os.listdir(self.VIDEOS))
+        for channel in channels:
+            folder = os.path.join(self.VIDEOS, channel)
+            files = ignore_filelist(os.listdir(folder))
+            downloaded.update(set(i.split(".")[0] for i in files))
+
+        return downloaded
+
+    def _get_indexed(self):
+        """get all indexed ids"""
+        if self.task:
+            self.task.send_progress(["Get all videos indexed."])
+
+        data = {"query": {"match_all": {}}, "_source": ["youtube_id"]}
+        response = IndexPaginate("ta_video", data).get_results()
+        return set(i["youtube_id"] for i in response)
+
+    def apply(self):
+        """apply all changes"""
+        self.delete()
+        self.index()
+        self.url_fix()
+
+    def delete(self):
+        """delete videos from index"""
+        if not self.to_delete:
+            print("nothing to delete")
+            return
+
+        if self.task:
+            self.task.send_progress(
+                [f"Remove {len(self.to_delete)} videos from index."]
+            )
+
+        for youtube_id in self.to_delete:
+            YoutubeVideo(youtube_id).delete_media_file()
+
+    def index(self):
+        """index new"""
+        if not self.to_index:
+            print("nothing to index")
+            return
+
+        total = len(self.to_index)
+        for idx, youtube_id in enumerate(self.to_index):
             if self.task:
                 self.task.send_progress(
                     message_lines=[
@@ -202,4 +89,36 @@ class Filesystem(ScannerBase):
                 )
             index_new_video(youtube_id)
 
-        CommentList(video_ids, task=self.task).index()
+        CommentList(self.to_index, task=self.task).index()
+
+    def url_fix(self):
+        """
+        update path v0.3.6 to v0.3.7
+        fix url not matching channel-videoid pattern
+        """
+        bool_must = (
+            "doc['media_url'].value == "
+            + "(doc['channel.channel_id'].value + '/' + "
+            + "doc['youtube_id'].value) + '.mp4'"
+        )
+        to_update = (
+            "ctx._source['media_url'] = "
+            + "ctx._source.channel['channel_id'] + '/' + "
+            + "ctx._source['youtube_id'] + '.mp4'"
+        )
+        data = {
+            "query": {
+                "bool": {
+                    "must_not": [{"script": {"script": {"source": bool_must}}}]
+                }
+            },
+            "script": {"source": to_update},
+        }
+        response, _ = ElasticWrap("ta_video/_update_by_query").post(data=data)
+        updated = response.get("updates")
+        if updated:
+            print(f"updated {updated} bad media_url")
+            if self.task:
+                self.task.send_progress(
+                    [f"Updated {updated} wrong media urls."]
+                )
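Context for the new url_fix step above: the removed _find_bad_media_url expected media files named after publish date, video ID and cleaned title, while the refactor standardises on a plain channel-id/video-id pattern, and url_fix rewrites any indexed document whose media_url does not already match it. A rough before/after illustration, where the channel ID, video ID and title are made-up placeholders:

# made-up placeholder values, only to illustrate the media_url migration done by url_fix
old_media_url = "UC1234567890ABCDEFGHIJKL/20230101_abcdefghijk_Some_Video_Title.mp4"
new_media_url = "UC1234567890ABCDEFGHIJKL/abcdefghijk.mp4"  # channel_id/youtube_id.mp4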

home/tasks.py

@@ -19,7 +19,7 @@ from home.src.download.yt_dlp_handler import VideoDownloader
 from home.src.es.backup import ElasticBackup
 from home.src.es.index_setup import ElasitIndexWrap
 from home.src.index.channel import YoutubeChannel
-from home.src.index.filesystem import Filesystem
+from home.src.index.filesystem import Scanner
 from home.src.index.manual import ImportFolderScanner
 from home.src.index.reindex import Reindex, ReindexManual, ReindexPopulate
 from home.src.ta.config import AppConfig, ReleaseVersion, ScheduleBuilder
@@ -290,7 +290,9 @@ def rescan_filesystem(self):
         return
 
     manager.init(self)
-    Filesystem(task=self).process()
+    handler = Scanner(task=self)
+    handler.scan()
+    handler.apply()
     ThumbValidator(task=self).validate()
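For reference, a minimal usage sketch of the refactored Scanner outside of the Celery task wiring shown above, assuming it runs in a Django shell inside the TubeArchivist environment; the task argument only drives progress reporting, so it can be left out when run manually:

# minimal sketch, assumed to run inside the TubeArchivist Django environment
from home.src.index.filesystem import Scanner

scanner = Scanner()  # no task object, progress messages are skipped
scanner.scan()       # builds the to_index and to_delete sets of YouTube IDs
print(f"to index: {len(scanner.to_index)}, to delete: {len(scanner.to_delete)}")
scanner.apply()      # delete orphaned docs, index new files, fix old media_url values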
