|
|
@ -1,198 +1,85 @@
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
Functionality:
|
|
|
|
Functionality:
|
|
|
|
- reindexing old documents
|
|
|
|
|
|
|
|
- syncing updated values between indexes
|
|
|
|
|
|
|
|
- scan the filesystem to delete or index
|
|
|
|
- scan the filesystem to delete or index
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
import os
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
|
|
from home.src.download.queue import PendingList
|
|
|
|
from home.src.es.connect import ElasticWrap, IndexPaginate
|
|
|
|
from home.src.es.connect import ElasticWrap
|
|
|
|
|
|
|
|
from home.src.index.comments import CommentList
|
|
|
|
from home.src.index.comments import CommentList
|
|
|
|
from home.src.index.video import index_new_video
|
|
|
|
from home.src.index.video import YoutubeVideo, index_new_video
|
|
|
|
from home.src.ta.config import AppConfig
|
|
|
|
from home.src.ta.config import AppConfig
|
|
|
|
from home.src.ta.helper import clean_string, ignore_filelist
|
|
|
|
from home.src.ta.helper import ignore_filelist
|
|
|
|
from PIL import ImageFile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Scanner:
|
|
|
|
|
|
|
|
"""scan index and filesystem"""
|
|
|
|
|
|
|
|
|
|
|
|
class ScannerBase:
|
|
|
|
VIDEOS = AppConfig().config["application"]["videos"]
|
|
|
|
"""scan the filesystem base class"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CONFIG = AppConfig().config
|
|
|
|
def __init__(self, task=False):
|
|
|
|
VIDEOS = CONFIG["application"]["videos"]
|
|
|
|
self.task = task
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self):
|
|
|
|
|
|
|
|
self.to_index = False
|
|
|
|
|
|
|
|
self.to_delete = False
|
|
|
|
self.to_delete = False
|
|
|
|
self.mismatch = False
|
|
|
|
self.to_index = False
|
|
|
|
self.to_rename = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def scan(self):
|
|
|
|
def scan(self):
|
|
|
|
"""entry point, scan and compare"""
|
|
|
|
"""scan the filesystem"""
|
|
|
|
all_downloaded = self._get_all_downloaded()
|
|
|
|
downloaded = self._get_downloaded()
|
|
|
|
all_indexed = self._get_all_indexed()
|
|
|
|
indexed = self._get_indexed()
|
|
|
|
self.list_comarison(all_downloaded, all_indexed)
|
|
|
|
self.to_index = downloaded - indexed
|
|
|
|
|
|
|
|
self.to_delete = indexed - downloaded
|
|
|
|
def _get_all_downloaded(self):
|
|
|
|
|
|
|
|
"""get a list of all video files downloaded"""
|
|
|
|
def _get_downloaded(self):
|
|
|
|
channels = os.listdir(self.VIDEOS)
|
|
|
|
"""get downloaded ids"""
|
|
|
|
all_channels = ignore_filelist(channels)
|
|
|
|
if self.task:
|
|
|
|
all_channels.sort()
|
|
|
|
self.task.send_progress(["Scan your filesystem for videos."])
|
|
|
|
all_downloaded = []
|
|
|
|
|
|
|
|
for channel_name in all_channels:
|
|
|
|
|
|
|
|
channel_path = os.path.join(self.VIDEOS, channel_name)
|
|
|
|
|
|
|
|
channel_files = os.listdir(channel_path)
|
|
|
|
|
|
|
|
channel_files_clean = ignore_filelist(channel_files)
|
|
|
|
|
|
|
|
all_videos = [i for i in channel_files_clean if i.endswith(".mp4")]
|
|
|
|
|
|
|
|
for video in all_videos:
|
|
|
|
|
|
|
|
youtube_id = video[9:20]
|
|
|
|
|
|
|
|
all_downloaded.append((channel_name, video, youtube_id))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return all_downloaded
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
|
|
def _get_all_indexed():
|
|
|
|
|
|
|
|
"""get a list of all indexed videos"""
|
|
|
|
|
|
|
|
index_handler = PendingList()
|
|
|
|
|
|
|
|
index_handler.get_download()
|
|
|
|
|
|
|
|
index_handler.get_indexed()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
all_indexed = []
|
|
|
|
|
|
|
|
for video in index_handler.all_videos:
|
|
|
|
|
|
|
|
youtube_id = video["youtube_id"]
|
|
|
|
|
|
|
|
media_url = video["media_url"]
|
|
|
|
|
|
|
|
published = video["published"]
|
|
|
|
|
|
|
|
title = video["title"]
|
|
|
|
|
|
|
|
all_indexed.append((youtube_id, media_url, published, title))
|
|
|
|
|
|
|
|
return all_indexed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def list_comarison(self, all_downloaded, all_indexed):
|
|
|
|
|
|
|
|
"""compare the lists to figure out what to do"""
|
|
|
|
|
|
|
|
self._find_unindexed(all_downloaded, all_indexed)
|
|
|
|
|
|
|
|
self._find_missing(all_downloaded, all_indexed)
|
|
|
|
|
|
|
|
self._find_bad_media_url(all_downloaded, all_indexed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _find_unindexed(self, all_downloaded, all_indexed):
|
|
|
|
|
|
|
|
"""find video files without a matching document indexed"""
|
|
|
|
|
|
|
|
all_indexed_ids = [i[0] for i in all_indexed]
|
|
|
|
|
|
|
|
self.to_index = []
|
|
|
|
|
|
|
|
for downloaded in all_downloaded:
|
|
|
|
|
|
|
|
if downloaded[2] not in all_indexed_ids:
|
|
|
|
|
|
|
|
self.to_index.append(downloaded)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _find_missing(self, all_downloaded, all_indexed):
|
|
|
|
|
|
|
|
"""find indexed videos without matching media file"""
|
|
|
|
|
|
|
|
all_downloaded_ids = [i[2] for i in all_downloaded]
|
|
|
|
|
|
|
|
self.to_delete = []
|
|
|
|
|
|
|
|
for video in all_indexed:
|
|
|
|
|
|
|
|
youtube_id = video[0]
|
|
|
|
|
|
|
|
if youtube_id not in all_downloaded_ids:
|
|
|
|
|
|
|
|
self.to_delete.append(video)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _find_bad_media_url(self, all_downloaded, all_indexed):
|
|
|
|
|
|
|
|
"""rename media files not matching the indexed title"""
|
|
|
|
|
|
|
|
self.mismatch = []
|
|
|
|
|
|
|
|
self.to_rename = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for downloaded in all_downloaded:
|
|
|
|
|
|
|
|
channel, filename, downloaded_id = downloaded
|
|
|
|
|
|
|
|
# find in indexed
|
|
|
|
|
|
|
|
for indexed in all_indexed:
|
|
|
|
|
|
|
|
indexed_id, media_url, published, title = indexed
|
|
|
|
|
|
|
|
if indexed_id == downloaded_id:
|
|
|
|
|
|
|
|
# found it
|
|
|
|
|
|
|
|
pub = published.replace("-", "")
|
|
|
|
|
|
|
|
expected = f"{pub}_{indexed_id}_{clean_string(title)}.mp4"
|
|
|
|
|
|
|
|
new_url = os.path.join(channel, expected)
|
|
|
|
|
|
|
|
if expected != filename:
|
|
|
|
|
|
|
|
# file to rename
|
|
|
|
|
|
|
|
self.to_rename.append((channel, filename, expected))
|
|
|
|
|
|
|
|
if media_url != new_url:
|
|
|
|
|
|
|
|
# media_url to update in es
|
|
|
|
|
|
|
|
self.mismatch.append((indexed_id, new_url))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Filesystem(ScannerBase):
|
|
|
|
|
|
|
|
"""handle scanning and fixing from filesystem"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, task=False):
|
|
|
|
downloaded = set()
|
|
|
|
super().__init__()
|
|
|
|
channels = ignore_filelist(os.listdir(self.VIDEOS))
|
|
|
|
self.task = task
|
|
|
|
for channel in channels:
|
|
|
|
|
|
|
|
folder = os.path.join(self.VIDEOS, channel)
|
|
|
|
|
|
|
|
files = ignore_filelist(os.listdir(folder))
|
|
|
|
|
|
|
|
downloaded.update(set(i.split(".")[0] for i in files))
|
|
|
|
|
|
|
|
|
|
|
|
def process(self):
|
|
|
|
return downloaded
|
|
|
|
"""entry point"""
|
|
|
|
|
|
|
|
if self.task:
|
|
|
|
|
|
|
|
self.task.send_progress(["Scanning your archive and index."])
|
|
|
|
|
|
|
|
self.scan()
|
|
|
|
|
|
|
|
self.rename_files()
|
|
|
|
|
|
|
|
self.send_mismatch_bulk()
|
|
|
|
|
|
|
|
self.delete_from_index()
|
|
|
|
|
|
|
|
self.add_missing()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rename_files(self):
|
|
|
|
|
|
|
|
"""rename media files as identified by find_bad_media_url"""
|
|
|
|
|
|
|
|
if not self.to_rename:
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
total = len(self.to_rename)
|
|
|
|
def _get_indexed(self):
|
|
|
|
|
|
|
|
"""get all indexed ids"""
|
|
|
|
if self.task:
|
|
|
|
if self.task:
|
|
|
|
self.task.send_progress([f"Rename {total} media files."])
|
|
|
|
self.task.send_progress(["Get all videos indexed."])
|
|
|
|
for bad_filename in self.to_rename:
|
|
|
|
|
|
|
|
channel, filename, expected_filename = bad_filename
|
|
|
|
|
|
|
|
print(f"renaming [{filename}] to [{expected_filename}]")
|
|
|
|
|
|
|
|
old_path = os.path.join(self.VIDEOS, channel, filename)
|
|
|
|
|
|
|
|
new_path = os.path.join(self.VIDEOS, channel, expected_filename)
|
|
|
|
|
|
|
|
os.rename(old_path, new_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def send_mismatch_bulk(self):
|
|
|
|
|
|
|
|
"""build bulk update"""
|
|
|
|
|
|
|
|
if not self.mismatch:
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
total = len(self.mismatch)
|
|
|
|
data = {"query": {"match_all": {}}, "_source": ["youtube_id"]}
|
|
|
|
if self.task:
|
|
|
|
response = IndexPaginate("ta_video", data).get_results()
|
|
|
|
self.task.send_progress([f"Fix media urls for {total} files"])
|
|
|
|
return set(i["youtube_id"] for i in response)
|
|
|
|
bulk_list = []
|
|
|
|
|
|
|
|
for video_mismatch in self.mismatch:
|
|
|
|
def apply(self):
|
|
|
|
youtube_id, media_url = video_mismatch
|
|
|
|
"""apply all changes"""
|
|
|
|
print(f"{youtube_id}: fixing media url {media_url}")
|
|
|
|
self.delete()
|
|
|
|
action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
|
|
|
|
self.index()
|
|
|
|
source = {"doc": {"media_url": media_url}}
|
|
|
|
self.url_fix()
|
|
|
|
bulk_list.append(json.dumps(action))
|
|
|
|
|
|
|
|
bulk_list.append(json.dumps(source))
|
|
|
|
def delete(self):
|
|
|
|
# add last newline
|
|
|
|
"""delete videos from index"""
|
|
|
|
bulk_list.append("\n")
|
|
|
|
|
|
|
|
data = "\n".join(bulk_list)
|
|
|
|
|
|
|
|
_, _ = ElasticWrap("_bulk").post(data=data, ndjson=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def delete_from_index(self):
|
|
|
|
|
|
|
|
"""find indexed but deleted mediafile"""
|
|
|
|
|
|
|
|
if not self.to_delete:
|
|
|
|
if not self.to_delete:
|
|
|
|
|
|
|
|
print("nothing to delete")
|
|
|
|
return
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
total = len(self.to_delete)
|
|
|
|
|
|
|
|
if self.task:
|
|
|
|
if self.task:
|
|
|
|
self.task.send_progress([f"Clean up {total} items from index."])
|
|
|
|
self.task.send_progress(
|
|
|
|
for indexed in self.to_delete:
|
|
|
|
[f"Remove {len(self.to_delete)} videos from index."]
|
|
|
|
youtube_id = indexed[0]
|
|
|
|
)
|
|
|
|
print(f"deleting {youtube_id} from index")
|
|
|
|
|
|
|
|
path = f"ta_video/_doc/{youtube_id}"
|
|
|
|
for youtube_id in self.to_delete:
|
|
|
|
_, _ = ElasticWrap(path).delete()
|
|
|
|
YoutubeVideo(youtube_id).delete_media_file()
|
|
|
|
|
|
|
|
|
|
|
|
def add_missing(self):
|
|
|
|
def index(self):
|
|
|
|
"""add missing videos to index"""
|
|
|
|
"""index new"""
|
|
|
|
video_ids = [i[2] for i in self.to_index]
|
|
|
|
if not self.to_index:
|
|
|
|
if not video_ids:
|
|
|
|
print("nothing to index")
|
|
|
|
return
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
total = len(video_ids)
|
|
|
|
total = len(self.to_index)
|
|
|
|
for idx, youtube_id in enumerate(video_ids):
|
|
|
|
for idx, youtube_id in enumerate(self.to_index):
|
|
|
|
if self.task:
|
|
|
|
if self.task:
|
|
|
|
self.task.send_progress(
|
|
|
|
self.task.send_progress(
|
|
|
|
message_lines=[
|
|
|
|
message_lines=[
|
|
|
@ -202,4 +89,36 @@ class Filesystem(ScannerBase):
|
|
|
|
)
|
|
|
|
)
|
|
|
|
index_new_video(youtube_id)
|
|
|
|
index_new_video(youtube_id)
|
|
|
|
|
|
|
|
|
|
|
|
CommentList(video_ids, task=self.task).index()
|
|
|
|
CommentList(self.to_index, task=self.task).index()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def url_fix(self):
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
update path v0.3.6 to v0.3.7
|
|
|
|
|
|
|
|
fix url not matching channel-videoid pattern
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
bool_must = (
|
|
|
|
|
|
|
|
"doc['media_url'].value == "
|
|
|
|
|
|
|
|
+ "(doc['channel.channel_id'].value + '/' + "
|
|
|
|
|
|
|
|
+ "doc['youtube_id'].value) + '.mp4'"
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
to_update = (
|
|
|
|
|
|
|
|
"ctx._source['media_url'] = "
|
|
|
|
|
|
|
|
+ "ctx._source.channel['channel_id'] + '/' + "
|
|
|
|
|
|
|
|
+ "ctx._source['youtube_id'] + '.mp4'"
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
data = {
|
|
|
|
|
|
|
|
"query": {
|
|
|
|
|
|
|
|
"bool": {
|
|
|
|
|
|
|
|
"must_not": [{"script": {"script": {"source": bool_must}}}]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"script": {"source": to_update},
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
response, _ = ElasticWrap("ta_video/_update_by_query").post(data=data)
|
|
|
|
|
|
|
|
updated = response.get("updates")
|
|
|
|
|
|
|
|
if updated:
|
|
|
|
|
|
|
|
print(f"updated {updated} bad media_url")
|
|
|
|
|
|
|
|
if self.task:
|
|
|
|
|
|
|
|
self.task.send_progress(
|
|
|
|
|
|
|
|
[f"Updated {updated} wrong media urls."]
|
|
|
|
|
|
|
|
)
|
|
|
|