mirror of
https://github.com/tubearchivist/tubearchivist
synced 2024-11-17 21:25:49 +00:00
better reindex_type handler, fix off by one
This commit is contained in:
parent
2a35b42d88
commit
1d9c274390
@ -8,6 +8,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
from typing import Callable, TypedDict
|
||||||
|
|
||||||
from home.models import CustomPeriodicTask
|
from home.models import CustomPeriodicTask
|
||||||
from home.src.download.subscriptions import ChannelSubscription
|
from home.src.download.subscriptions import ChannelSubscription
|
||||||
@ -23,10 +24,19 @@ from home.src.ta.settings import EnvironmentSettings
|
|||||||
from home.src.ta.ta_redis import RedisQueue
|
from home.src.ta.ta_redis import RedisQueue
|
||||||
|
|
||||||
|
|
||||||
|
class ReindexConfigType(TypedDict):
|
||||||
|
"""represents config type"""
|
||||||
|
|
||||||
|
index_name: str
|
||||||
|
queue_name: str
|
||||||
|
active_key: str
|
||||||
|
refresh_key: str
|
||||||
|
|
||||||
|
|
||||||
class ReindexBase:
|
class ReindexBase:
|
||||||
"""base config class for reindex task"""
|
"""base config class for reindex task"""
|
||||||
|
|
||||||
REINDEX_CONFIG = {
|
REINDEX_CONFIG: dict[str, ReindexConfigType] = {
|
||||||
"video": {
|
"video": {
|
||||||
"index_name": "ta_video",
|
"index_name": "ta_video",
|
||||||
"queue_name": "reindex:ta_video",
|
"queue_name": "reindex:ta_video",
|
||||||
@ -54,7 +64,7 @@ class ReindexBase:
|
|||||||
self.config = AppConfig().config
|
self.config = AppConfig().config
|
||||||
self.now = int(datetime.now().timestamp())
|
self.now = int(datetime.now().timestamp())
|
||||||
|
|
||||||
def populate(self, all_ids, reindex_config):
|
def populate(self, all_ids, reindex_config: ReindexConfigType):
|
||||||
"""add all to reindex ids to redis queue"""
|
"""add all to reindex ids to redis queue"""
|
||||||
if not all_ids:
|
if not all_ids:
|
||||||
return
|
return
|
||||||
@ -65,13 +75,13 @@ class ReindexBase:
|
|||||||
class ReindexPopulate(ReindexBase):
|
class ReindexPopulate(ReindexBase):
|
||||||
"""add outdated and recent documents to reindex queue"""
|
"""add outdated and recent documents to reindex queue"""
|
||||||
|
|
||||||
INTERVAL_DEFAIULT = 90
|
INTERVAL_DEFAIULT: int = 90
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.interval = self.INTERVAL_DEFAIULT
|
self.interval = self.INTERVAL_DEFAIULT
|
||||||
|
|
||||||
def get_interval(self):
|
def get_interval(self) -> None:
|
||||||
"""get reindex days interval from task"""
|
"""get reindex days interval from task"""
|
||||||
try:
|
try:
|
||||||
task = CustomPeriodicTask.objects.get(name="check_reindex")
|
task = CustomPeriodicTask.objects.get(name="check_reindex")
|
||||||
@ -82,7 +92,7 @@ class ReindexPopulate(ReindexBase):
|
|||||||
if task_config.get("days"):
|
if task_config.get("days"):
|
||||||
self.interval = task_config.get("days")
|
self.interval = task_config.get("days")
|
||||||
|
|
||||||
def add_recent(self):
|
def add_recent(self) -> None:
|
||||||
"""add recent videos to refresh"""
|
"""add recent videos to refresh"""
|
||||||
gte = datetime.fromtimestamp(self.now - self.DAYS3).date().isoformat()
|
gte = datetime.fromtimestamp(self.now - self.DAYS3).date().isoformat()
|
||||||
must_list = [
|
must_list = [
|
||||||
@ -100,10 +110,10 @@ class ReindexPopulate(ReindexBase):
|
|||||||
return
|
return
|
||||||
|
|
||||||
all_ids = [i["_source"]["youtube_id"] for i in hits]
|
all_ids = [i["_source"]["youtube_id"] for i in hits]
|
||||||
reindex_config = self.REINDEX_CONFIG.get("video")
|
reindex_config: ReindexConfigType = self.REINDEX_CONFIG["video"]
|
||||||
self.populate(all_ids, reindex_config)
|
self.populate(all_ids, reindex_config)
|
||||||
|
|
||||||
def add_outdated(self):
|
def add_outdated(self) -> None:
|
||||||
"""add outdated documents"""
|
"""add outdated documents"""
|
||||||
for reindex_config in self.REINDEX_CONFIG.values():
|
for reindex_config in self.REINDEX_CONFIG.values():
|
||||||
total_hits = self._get_total_hits(reindex_config)
|
total_hits = self._get_total_hits(reindex_config)
|
||||||
@ -112,7 +122,7 @@ class ReindexPopulate(ReindexBase):
|
|||||||
self.populate(all_ids, reindex_config)
|
self.populate(all_ids, reindex_config)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_total_hits(reindex_config):
|
def _get_total_hits(reindex_config: ReindexConfigType) -> int:
|
||||||
"""get total hits from index"""
|
"""get total hits from index"""
|
||||||
index_name = reindex_config["index_name"]
|
index_name = reindex_config["index_name"]
|
||||||
active_key = reindex_config["active_key"]
|
active_key = reindex_config["active_key"]
|
||||||
@ -124,7 +134,7 @@ class ReindexPopulate(ReindexBase):
|
|||||||
|
|
||||||
return len(total)
|
return len(total)
|
||||||
|
|
||||||
def _get_daily_should(self, total_hits):
|
def _get_daily_should(self, total_hits: int) -> int:
|
||||||
"""calc how many should reindex daily"""
|
"""calc how many should reindex daily"""
|
||||||
daily_should = int((total_hits // self.interval + 1) * self.MULTIPLY)
|
daily_should = int((total_hits // self.interval + 1) * self.MULTIPLY)
|
||||||
if daily_should >= 10000:
|
if daily_should >= 10000:
|
||||||
@ -132,7 +142,9 @@ class ReindexPopulate(ReindexBase):
|
|||||||
|
|
||||||
return daily_should
|
return daily_should
|
||||||
|
|
||||||
def _get_outdated_ids(self, reindex_config, daily_should):
|
def _get_outdated_ids(
|
||||||
|
self, reindex_config: ReindexConfigType, daily_should: int
|
||||||
|
) -> list[str]:
|
||||||
"""get outdated from index_name"""
|
"""get outdated from index_name"""
|
||||||
index_name = reindex_config["index_name"]
|
index_name = reindex_config["index_name"]
|
||||||
refresh_key = reindex_config["refresh_key"]
|
refresh_key = reindex_config["refresh_key"]
|
||||||
@ -169,7 +181,7 @@ class ReindexManual(ReindexBase):
|
|||||||
self.extract_videos = extract_videos
|
self.extract_videos = extract_videos
|
||||||
self.data = False
|
self.data = False
|
||||||
|
|
||||||
def extract_data(self, data):
|
def extract_data(self, data) -> None:
|
||||||
"""process data"""
|
"""process data"""
|
||||||
self.data = data
|
self.data = data
|
||||||
for key, values in self.data.items():
|
for key, values in self.data.items():
|
||||||
@ -180,7 +192,9 @@ class ReindexManual(ReindexBase):
|
|||||||
|
|
||||||
self.process_index(reindex_config, values)
|
self.process_index(reindex_config, values)
|
||||||
|
|
||||||
def process_index(self, index_config, values):
|
def process_index(
|
||||||
|
self, index_config: ReindexConfigType, values: list[str]
|
||||||
|
) -> None:
|
||||||
"""process values per index"""
|
"""process values per index"""
|
||||||
index_name = index_config["index_name"]
|
index_name = index_config["index_name"]
|
||||||
if index_name == "ta_video":
|
if index_name == "ta_video":
|
||||||
@ -190,7 +204,7 @@ class ReindexManual(ReindexBase):
|
|||||||
elif index_name == "ta_playlist":
|
elif index_name == "ta_playlist":
|
||||||
self._add_playlists(values)
|
self._add_playlists(values)
|
||||||
|
|
||||||
def _add_videos(self, values):
|
def _add_videos(self, values: list[str]) -> None:
|
||||||
"""add list of videos to reindex queue"""
|
"""add list of videos to reindex queue"""
|
||||||
if not values:
|
if not values:
|
||||||
return
|
return
|
||||||
@ -198,7 +212,7 @@ class ReindexManual(ReindexBase):
|
|||||||
queue_name = self.REINDEX_CONFIG["video"]["queue_name"]
|
queue_name = self.REINDEX_CONFIG["video"]["queue_name"]
|
||||||
RedisQueue(queue_name).add_list(values)
|
RedisQueue(queue_name).add_list(values)
|
||||||
|
|
||||||
def _add_channels(self, values):
|
def _add_channels(self, values: list[str]) -> None:
|
||||||
"""add list of channels to reindex queue"""
|
"""add list of channels to reindex queue"""
|
||||||
queue_name = self.REINDEX_CONFIG["channel"]["queue_name"]
|
queue_name = self.REINDEX_CONFIG["channel"]["queue_name"]
|
||||||
RedisQueue(queue_name).add_list(values)
|
RedisQueue(queue_name).add_list(values)
|
||||||
@ -208,7 +222,7 @@ class ReindexManual(ReindexBase):
|
|||||||
all_videos = self._get_channel_videos(channel_id)
|
all_videos = self._get_channel_videos(channel_id)
|
||||||
self._add_videos(all_videos)
|
self._add_videos(all_videos)
|
||||||
|
|
||||||
def _add_playlists(self, values):
|
def _add_playlists(self, values: list[str]) -> None:
|
||||||
"""add list of playlists to reindex queue"""
|
"""add list of playlists to reindex queue"""
|
||||||
queue_name = self.REINDEX_CONFIG["playlist"]["queue_name"]
|
queue_name = self.REINDEX_CONFIG["playlist"]["queue_name"]
|
||||||
RedisQueue(queue_name).add_list(values)
|
RedisQueue(queue_name).add_list(values)
|
||||||
@ -218,7 +232,7 @@ class ReindexManual(ReindexBase):
|
|||||||
all_videos = self._get_playlist_videos(playlist_id)
|
all_videos = self._get_playlist_videos(playlist_id)
|
||||||
self._add_videos(all_videos)
|
self._add_videos(all_videos)
|
||||||
|
|
||||||
def _get_channel_videos(self, channel_id):
|
def _get_channel_videos(self, channel_id: str) -> list[str]:
|
||||||
"""get all videos from channel"""
|
"""get all videos from channel"""
|
||||||
data = {
|
data = {
|
||||||
"query": {"term": {"channel.channel_id": {"value": channel_id}}},
|
"query": {"term": {"channel.channel_id": {"value": channel_id}}},
|
||||||
@ -227,7 +241,7 @@ class ReindexManual(ReindexBase):
|
|||||||
all_results = IndexPaginate("ta_video", data).get_results()
|
all_results = IndexPaginate("ta_video", data).get_results()
|
||||||
return [i["youtube_id"] for i in all_results]
|
return [i["youtube_id"] for i in all_results]
|
||||||
|
|
||||||
def _get_playlist_videos(self, playlist_id):
|
def _get_playlist_videos(self, playlist_id: str) -> list[str]:
|
||||||
"""get all videos from playlist"""
|
"""get all videos from playlist"""
|
||||||
data = {
|
data = {
|
||||||
"query": {"term": {"playlist.keyword": {"value": playlist_id}}},
|
"query": {"term": {"playlist.keyword": {"value": playlist_id}}},
|
||||||
@ -249,7 +263,7 @@ class Reindex(ReindexBase):
|
|||||||
"playlists": 0,
|
"playlists": 0,
|
||||||
}
|
}
|
||||||
|
|
||||||
def reindex_all(self):
|
def reindex_all(self) -> None:
|
||||||
"""reindex all in queue"""
|
"""reindex all in queue"""
|
||||||
if not self.cookie_is_valid():
|
if not self.cookie_is_valid():
|
||||||
print("[reindex] cookie invalid, exiting...")
|
print("[reindex] cookie invalid, exiting...")
|
||||||
@ -259,27 +273,26 @@ class Reindex(ReindexBase):
|
|||||||
if not RedisQueue(index_config["queue_name"]).length():
|
if not RedisQueue(index_config["queue_name"]).length():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
while True:
|
self.reindex_type(name, index_config)
|
||||||
has_next = self.reindex_index(name, index_config)
|
|
||||||
if not has_next:
|
|
||||||
break
|
|
||||||
|
|
||||||
def reindex_index(self, name, index_config):
|
def reindex_type(self, name: str, index_config: ReindexConfigType) -> None:
|
||||||
"""reindex all of a single index"""
|
"""reindex all of a single index"""
|
||||||
reindex = self._get_reindex_map(index_config["index_name"])
|
reindex = self._get_reindex_map(index_config["index_name"])
|
||||||
queue = RedisQueue(index_config["queue_name"])
|
queue = RedisQueue(index_config["queue_name"])
|
||||||
total = queue.max_score()
|
while True:
|
||||||
youtube_id, idx = queue.get_next()
|
total = queue.max_score()
|
||||||
if youtube_id:
|
youtube_id, idx = queue.get_next()
|
||||||
|
if not youtube_id or not idx or not total:
|
||||||
|
break
|
||||||
|
|
||||||
if self.task:
|
if self.task:
|
||||||
self._notify(name, total, idx)
|
self._notify(name, total, idx)
|
||||||
|
|
||||||
reindex(youtube_id)
|
reindex(youtube_id)
|
||||||
sleep_interval = self.config["downloads"].get("sleep_interval", 0)
|
sleep_interval = self.config["downloads"].get("sleep_interval", 0)
|
||||||
sleep(sleep_interval)
|
sleep(sleep_interval)
|
||||||
|
|
||||||
return bool(youtube_id)
|
def _get_reindex_map(self, index_name: str) -> Callable:
|
||||||
|
|
||||||
def _get_reindex_map(self, index_name):
|
|
||||||
"""return def to run for index"""
|
"""return def to run for index"""
|
||||||
def_map = {
|
def_map = {
|
||||||
"ta_video": self._reindex_single_video,
|
"ta_video": self._reindex_single_video,
|
||||||
@ -287,15 +300,15 @@ class Reindex(ReindexBase):
|
|||||||
"ta_playlist": self._reindex_single_playlist,
|
"ta_playlist": self._reindex_single_playlist,
|
||||||
}
|
}
|
||||||
|
|
||||||
return def_map.get(index_name)
|
return def_map[index_name]
|
||||||
|
|
||||||
def _notify(self, name, total, idx):
|
def _notify(self, name: str, total: int, idx: int) -> None:
|
||||||
"""send notification back to task"""
|
"""send notification back to task"""
|
||||||
message = [f"Reindexing {name.title()}s {idx}/{total}"]
|
message = [f"Reindexing {name.title()}s {idx}/{total}"]
|
||||||
progress = idx / total
|
progress = idx / total
|
||||||
self.task.send_progress(message, progress=progress)
|
self.task.send_progress(message, progress=progress)
|
||||||
|
|
||||||
def _reindex_single_video(self, youtube_id):
|
def _reindex_single_video(self, youtube_id: str) -> None:
|
||||||
"""refresh data for single video"""
|
"""refresh data for single video"""
|
||||||
video = YoutubeVideo(youtube_id)
|
video = YoutubeVideo(youtube_id)
|
||||||
|
|
||||||
@ -334,9 +347,7 @@ class Reindex(ReindexBase):
|
|||||||
Comments(youtube_id, config=self.config).reindex_comments()
|
Comments(youtube_id, config=self.config).reindex_comments()
|
||||||
self.processed["videos"] += 1
|
self.processed["videos"] += 1
|
||||||
|
|
||||||
return
|
def _reindex_single_channel(self, channel_id: str) -> None:
|
||||||
|
|
||||||
def _reindex_single_channel(self, channel_id):
|
|
||||||
"""refresh channel data and sync to videos"""
|
"""refresh channel data and sync to videos"""
|
||||||
# read current state
|
# read current state
|
||||||
channel = YoutubeChannel(channel_id)
|
channel = YoutubeChannel(channel_id)
|
||||||
@ -367,7 +378,7 @@ class Reindex(ReindexBase):
|
|||||||
ChannelFullScan(channel_id).scan()
|
ChannelFullScan(channel_id).scan()
|
||||||
self.processed["channels"] += 1
|
self.processed["channels"] += 1
|
||||||
|
|
||||||
def _reindex_single_playlist(self, playlist_id):
|
def _reindex_single_playlist(self, playlist_id: str) -> None:
|
||||||
"""refresh playlist data"""
|
"""refresh playlist data"""
|
||||||
playlist = YoutubePlaylist(playlist_id)
|
playlist = YoutubePlaylist(playlist_id)
|
||||||
playlist.get_from_es()
|
playlist.get_from_es()
|
||||||
@ -383,9 +394,8 @@ class Reindex(ReindexBase):
|
|||||||
return
|
return
|
||||||
|
|
||||||
self.processed["playlists"] += 1
|
self.processed["playlists"] += 1
|
||||||
return
|
|
||||||
|
|
||||||
def cookie_is_valid(self):
|
def cookie_is_valid(self) -> bool:
|
||||||
"""return true if cookie is enabled and valid"""
|
"""return true if cookie is enabled and valid"""
|
||||||
if not self.config["downloads"]["cookie_import"]:
|
if not self.config["downloads"]["cookie_import"]:
|
||||||
# is not activated, continue reindex
|
# is not activated, continue reindex
|
||||||
@ -394,7 +404,7 @@ class Reindex(ReindexBase):
|
|||||||
valid = CookieHandler(self.config).validate()
|
valid = CookieHandler(self.config).validate()
|
||||||
return valid
|
return valid
|
||||||
|
|
||||||
def build_message(self):
|
def build_message(self) -> str:
|
||||||
"""build progress message"""
|
"""build progress message"""
|
||||||
message = ""
|
message = ""
|
||||||
for key, value in self.processed.items():
|
for key, value in self.processed.items():
|
||||||
@ -424,7 +434,7 @@ class ReindexProgress(ReindexBase):
|
|||||||
self.request_type = request_type
|
self.request_type = request_type
|
||||||
self.request_id = request_id
|
self.request_id = request_id
|
||||||
|
|
||||||
def get_progress(self):
|
def get_progress(self) -> dict:
|
||||||
"""get progress from task"""
|
"""get progress from task"""
|
||||||
queue_name, request_type = self._get_queue_name()
|
queue_name, request_type = self._get_queue_name()
|
||||||
total = self._get_total_in_queue(queue_name)
|
total = self._get_total_in_queue(queue_name)
|
||||||
|
@ -155,13 +155,13 @@ class RedisQueue(RedisBase):
|
|||||||
mapping = {i[1]: next_score + i[0] for i in enumerate(to_add)}
|
mapping = {i[1]: next_score + i[0] for i in enumerate(to_add)}
|
||||||
self.conn.zadd(self.key, mapping)
|
self.conn.zadd(self.key, mapping)
|
||||||
|
|
||||||
def max_score(self) -> float | None:
|
def max_score(self) -> int | None:
|
||||||
"""get max score"""
|
"""get max score"""
|
||||||
last = self.conn.zrange(self.key, -1, -1, withscores=True)
|
last = self.conn.zrange(self.key, -1, -1, withscores=True)
|
||||||
if not last:
|
if not last:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return last[0][1]
|
return int(last[0][1])
|
||||||
|
|
||||||
def _get_next_score(self) -> float:
|
def _get_next_score(self) -> float:
|
||||||
"""get next score in queue to append"""
|
"""get next score in queue to append"""
|
||||||
@ -171,13 +171,13 @@ class RedisQueue(RedisBase):
|
|||||||
|
|
||||||
return last[0][1] + 1
|
return last[0][1] + 1
|
||||||
|
|
||||||
def get_next(self) -> tuple[str | None, float | None]:
|
def get_next(self) -> tuple[str | None, int | None]:
|
||||||
"""return next element in the queue, if available"""
|
"""return next element in the queue, if available"""
|
||||||
result = self.conn.zpopmin(self.key)
|
result = self.conn.zpopmin(self.key)
|
||||||
if not result:
|
if not result:
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
item, idx = result[0]
|
item, idx = result[0][0], int(result[0][1]) + 1
|
||||||
|
|
||||||
return item, idx
|
return item, idx
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user