better reindex_type handler, fix off by one

This commit is contained in:
Simon 2024-05-15 13:18:28 +02:00
parent 2a35b42d88
commit 1d9c274390
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4
2 changed files with 54 additions and 44 deletions

View File

@ -8,6 +8,7 @@ import json
import os import os
from datetime import datetime from datetime import datetime
from time import sleep from time import sleep
from typing import Callable, TypedDict
from home.models import CustomPeriodicTask from home.models import CustomPeriodicTask
from home.src.download.subscriptions import ChannelSubscription from home.src.download.subscriptions import ChannelSubscription
@ -23,10 +24,19 @@ from home.src.ta.settings import EnvironmentSettings
from home.src.ta.ta_redis import RedisQueue from home.src.ta.ta_redis import RedisQueue
class ReindexConfigType(TypedDict):
"""represents config type"""
index_name: str
queue_name: str
active_key: str
refresh_key: str
class ReindexBase: class ReindexBase:
"""base config class for reindex task""" """base config class for reindex task"""
REINDEX_CONFIG = { REINDEX_CONFIG: dict[str, ReindexConfigType] = {
"video": { "video": {
"index_name": "ta_video", "index_name": "ta_video",
"queue_name": "reindex:ta_video", "queue_name": "reindex:ta_video",
@ -54,7 +64,7 @@ class ReindexBase:
self.config = AppConfig().config self.config = AppConfig().config
self.now = int(datetime.now().timestamp()) self.now = int(datetime.now().timestamp())
def populate(self, all_ids, reindex_config): def populate(self, all_ids, reindex_config: ReindexConfigType):
"""add all to reindex ids to redis queue""" """add all to reindex ids to redis queue"""
if not all_ids: if not all_ids:
return return
@ -65,13 +75,13 @@ class ReindexBase:
class ReindexPopulate(ReindexBase): class ReindexPopulate(ReindexBase):
"""add outdated and recent documents to reindex queue""" """add outdated and recent documents to reindex queue"""
INTERVAL_DEFAIULT = 90 INTERVAL_DEFAIULT: int = 90
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.interval = self.INTERVAL_DEFAIULT self.interval = self.INTERVAL_DEFAIULT
def get_interval(self): def get_interval(self) -> None:
"""get reindex days interval from task""" """get reindex days interval from task"""
try: try:
task = CustomPeriodicTask.objects.get(name="check_reindex") task = CustomPeriodicTask.objects.get(name="check_reindex")
@ -82,7 +92,7 @@ class ReindexPopulate(ReindexBase):
if task_config.get("days"): if task_config.get("days"):
self.interval = task_config.get("days") self.interval = task_config.get("days")
def add_recent(self): def add_recent(self) -> None:
"""add recent videos to refresh""" """add recent videos to refresh"""
gte = datetime.fromtimestamp(self.now - self.DAYS3).date().isoformat() gte = datetime.fromtimestamp(self.now - self.DAYS3).date().isoformat()
must_list = [ must_list = [
@ -100,10 +110,10 @@ class ReindexPopulate(ReindexBase):
return return
all_ids = [i["_source"]["youtube_id"] for i in hits] all_ids = [i["_source"]["youtube_id"] for i in hits]
reindex_config = self.REINDEX_CONFIG.get("video") reindex_config: ReindexConfigType = self.REINDEX_CONFIG["video"]
self.populate(all_ids, reindex_config) self.populate(all_ids, reindex_config)
def add_outdated(self): def add_outdated(self) -> None:
"""add outdated documents""" """add outdated documents"""
for reindex_config in self.REINDEX_CONFIG.values(): for reindex_config in self.REINDEX_CONFIG.values():
total_hits = self._get_total_hits(reindex_config) total_hits = self._get_total_hits(reindex_config)
@ -112,7 +122,7 @@ class ReindexPopulate(ReindexBase):
self.populate(all_ids, reindex_config) self.populate(all_ids, reindex_config)
@staticmethod @staticmethod
def _get_total_hits(reindex_config): def _get_total_hits(reindex_config: ReindexConfigType) -> int:
"""get total hits from index""" """get total hits from index"""
index_name = reindex_config["index_name"] index_name = reindex_config["index_name"]
active_key = reindex_config["active_key"] active_key = reindex_config["active_key"]
@ -124,7 +134,7 @@ class ReindexPopulate(ReindexBase):
return len(total) return len(total)
def _get_daily_should(self, total_hits): def _get_daily_should(self, total_hits: int) -> int:
"""calc how many should reindex daily""" """calc how many should reindex daily"""
daily_should = int((total_hits // self.interval + 1) * self.MULTIPLY) daily_should = int((total_hits // self.interval + 1) * self.MULTIPLY)
if daily_should >= 10000: if daily_should >= 10000:
@ -132,7 +142,9 @@ class ReindexPopulate(ReindexBase):
return daily_should return daily_should
def _get_outdated_ids(self, reindex_config, daily_should): def _get_outdated_ids(
self, reindex_config: ReindexConfigType, daily_should: int
) -> list[str]:
"""get outdated from index_name""" """get outdated from index_name"""
index_name = reindex_config["index_name"] index_name = reindex_config["index_name"]
refresh_key = reindex_config["refresh_key"] refresh_key = reindex_config["refresh_key"]
@ -169,7 +181,7 @@ class ReindexManual(ReindexBase):
self.extract_videos = extract_videos self.extract_videos = extract_videos
self.data = False self.data = False
def extract_data(self, data): def extract_data(self, data) -> None:
"""process data""" """process data"""
self.data = data self.data = data
for key, values in self.data.items(): for key, values in self.data.items():
@ -180,7 +192,9 @@ class ReindexManual(ReindexBase):
self.process_index(reindex_config, values) self.process_index(reindex_config, values)
def process_index(self, index_config, values): def process_index(
self, index_config: ReindexConfigType, values: list[str]
) -> None:
"""process values per index""" """process values per index"""
index_name = index_config["index_name"] index_name = index_config["index_name"]
if index_name == "ta_video": if index_name == "ta_video":
@ -190,7 +204,7 @@ class ReindexManual(ReindexBase):
elif index_name == "ta_playlist": elif index_name == "ta_playlist":
self._add_playlists(values) self._add_playlists(values)
def _add_videos(self, values): def _add_videos(self, values: list[str]) -> None:
"""add list of videos to reindex queue""" """add list of videos to reindex queue"""
if not values: if not values:
return return
@ -198,7 +212,7 @@ class ReindexManual(ReindexBase):
queue_name = self.REINDEX_CONFIG["video"]["queue_name"] queue_name = self.REINDEX_CONFIG["video"]["queue_name"]
RedisQueue(queue_name).add_list(values) RedisQueue(queue_name).add_list(values)
def _add_channels(self, values): def _add_channels(self, values: list[str]) -> None:
"""add list of channels to reindex queue""" """add list of channels to reindex queue"""
queue_name = self.REINDEX_CONFIG["channel"]["queue_name"] queue_name = self.REINDEX_CONFIG["channel"]["queue_name"]
RedisQueue(queue_name).add_list(values) RedisQueue(queue_name).add_list(values)
@ -208,7 +222,7 @@ class ReindexManual(ReindexBase):
all_videos = self._get_channel_videos(channel_id) all_videos = self._get_channel_videos(channel_id)
self._add_videos(all_videos) self._add_videos(all_videos)
def _add_playlists(self, values): def _add_playlists(self, values: list[str]) -> None:
"""add list of playlists to reindex queue""" """add list of playlists to reindex queue"""
queue_name = self.REINDEX_CONFIG["playlist"]["queue_name"] queue_name = self.REINDEX_CONFIG["playlist"]["queue_name"]
RedisQueue(queue_name).add_list(values) RedisQueue(queue_name).add_list(values)
@ -218,7 +232,7 @@ class ReindexManual(ReindexBase):
all_videos = self._get_playlist_videos(playlist_id) all_videos = self._get_playlist_videos(playlist_id)
self._add_videos(all_videos) self._add_videos(all_videos)
def _get_channel_videos(self, channel_id): def _get_channel_videos(self, channel_id: str) -> list[str]:
"""get all videos from channel""" """get all videos from channel"""
data = { data = {
"query": {"term": {"channel.channel_id": {"value": channel_id}}}, "query": {"term": {"channel.channel_id": {"value": channel_id}}},
@ -227,7 +241,7 @@ class ReindexManual(ReindexBase):
all_results = IndexPaginate("ta_video", data).get_results() all_results = IndexPaginate("ta_video", data).get_results()
return [i["youtube_id"] for i in all_results] return [i["youtube_id"] for i in all_results]
def _get_playlist_videos(self, playlist_id): def _get_playlist_videos(self, playlist_id: str) -> list[str]:
"""get all videos from playlist""" """get all videos from playlist"""
data = { data = {
"query": {"term": {"playlist.keyword": {"value": playlist_id}}}, "query": {"term": {"playlist.keyword": {"value": playlist_id}}},
@ -249,7 +263,7 @@ class Reindex(ReindexBase):
"playlists": 0, "playlists": 0,
} }
def reindex_all(self): def reindex_all(self) -> None:
"""reindex all in queue""" """reindex all in queue"""
if not self.cookie_is_valid(): if not self.cookie_is_valid():
print("[reindex] cookie invalid, exiting...") print("[reindex] cookie invalid, exiting...")
@ -259,27 +273,26 @@ class Reindex(ReindexBase):
if not RedisQueue(index_config["queue_name"]).length(): if not RedisQueue(index_config["queue_name"]).length():
continue continue
while True: self.reindex_type(name, index_config)
has_next = self.reindex_index(name, index_config)
if not has_next:
break
def reindex_index(self, name, index_config): def reindex_type(self, name: str, index_config: ReindexConfigType) -> None:
"""reindex all of a single index""" """reindex all of a single index"""
reindex = self._get_reindex_map(index_config["index_name"]) reindex = self._get_reindex_map(index_config["index_name"])
queue = RedisQueue(index_config["queue_name"]) queue = RedisQueue(index_config["queue_name"])
total = queue.max_score() while True:
youtube_id, idx = queue.get_next() total = queue.max_score()
if youtube_id: youtube_id, idx = queue.get_next()
if not youtube_id or not idx or not total:
break
if self.task: if self.task:
self._notify(name, total, idx) self._notify(name, total, idx)
reindex(youtube_id) reindex(youtube_id)
sleep_interval = self.config["downloads"].get("sleep_interval", 0) sleep_interval = self.config["downloads"].get("sleep_interval", 0)
sleep(sleep_interval) sleep(sleep_interval)
return bool(youtube_id) def _get_reindex_map(self, index_name: str) -> Callable:
def _get_reindex_map(self, index_name):
"""return def to run for index""" """return def to run for index"""
def_map = { def_map = {
"ta_video": self._reindex_single_video, "ta_video": self._reindex_single_video,
@ -287,15 +300,15 @@ class Reindex(ReindexBase):
"ta_playlist": self._reindex_single_playlist, "ta_playlist": self._reindex_single_playlist,
} }
return def_map.get(index_name) return def_map[index_name]
def _notify(self, name, total, idx): def _notify(self, name: str, total: int, idx: int) -> None:
"""send notification back to task""" """send notification back to task"""
message = [f"Reindexing {name.title()}s {idx}/{total}"] message = [f"Reindexing {name.title()}s {idx}/{total}"]
progress = idx / total progress = idx / total
self.task.send_progress(message, progress=progress) self.task.send_progress(message, progress=progress)
def _reindex_single_video(self, youtube_id): def _reindex_single_video(self, youtube_id: str) -> None:
"""refresh data for single video""" """refresh data for single video"""
video = YoutubeVideo(youtube_id) video = YoutubeVideo(youtube_id)
@ -334,9 +347,7 @@ class Reindex(ReindexBase):
Comments(youtube_id, config=self.config).reindex_comments() Comments(youtube_id, config=self.config).reindex_comments()
self.processed["videos"] += 1 self.processed["videos"] += 1
return def _reindex_single_channel(self, channel_id: str) -> None:
def _reindex_single_channel(self, channel_id):
"""refresh channel data and sync to videos""" """refresh channel data and sync to videos"""
# read current state # read current state
channel = YoutubeChannel(channel_id) channel = YoutubeChannel(channel_id)
@ -367,7 +378,7 @@ class Reindex(ReindexBase):
ChannelFullScan(channel_id).scan() ChannelFullScan(channel_id).scan()
self.processed["channels"] += 1 self.processed["channels"] += 1
def _reindex_single_playlist(self, playlist_id): def _reindex_single_playlist(self, playlist_id: str) -> None:
"""refresh playlist data""" """refresh playlist data"""
playlist = YoutubePlaylist(playlist_id) playlist = YoutubePlaylist(playlist_id)
playlist.get_from_es() playlist.get_from_es()
@ -383,9 +394,8 @@ class Reindex(ReindexBase):
return return
self.processed["playlists"] += 1 self.processed["playlists"] += 1
return
def cookie_is_valid(self): def cookie_is_valid(self) -> bool:
"""return true if cookie is enabled and valid""" """return true if cookie is enabled and valid"""
if not self.config["downloads"]["cookie_import"]: if not self.config["downloads"]["cookie_import"]:
# is not activated, continue reindex # is not activated, continue reindex
@ -394,7 +404,7 @@ class Reindex(ReindexBase):
valid = CookieHandler(self.config).validate() valid = CookieHandler(self.config).validate()
return valid return valid
def build_message(self): def build_message(self) -> str:
"""build progress message""" """build progress message"""
message = "" message = ""
for key, value in self.processed.items(): for key, value in self.processed.items():
@ -424,7 +434,7 @@ class ReindexProgress(ReindexBase):
self.request_type = request_type self.request_type = request_type
self.request_id = request_id self.request_id = request_id
def get_progress(self): def get_progress(self) -> dict:
"""get progress from task""" """get progress from task"""
queue_name, request_type = self._get_queue_name() queue_name, request_type = self._get_queue_name()
total = self._get_total_in_queue(queue_name) total = self._get_total_in_queue(queue_name)

View File

@ -155,13 +155,13 @@ class RedisQueue(RedisBase):
mapping = {i[1]: next_score + i[0] for i in enumerate(to_add)} mapping = {i[1]: next_score + i[0] for i in enumerate(to_add)}
self.conn.zadd(self.key, mapping) self.conn.zadd(self.key, mapping)
def max_score(self) -> float | None: def max_score(self) -> int | None:
"""get max score""" """get max score"""
last = self.conn.zrange(self.key, -1, -1, withscores=True) last = self.conn.zrange(self.key, -1, -1, withscores=True)
if not last: if not last:
return None return None
return last[0][1] return int(last[0][1])
def _get_next_score(self) -> float: def _get_next_score(self) -> float:
"""get next score in queue to append""" """get next score in queue to append"""
@ -171,13 +171,13 @@ class RedisQueue(RedisBase):
return last[0][1] + 1 return last[0][1] + 1
def get_next(self) -> tuple[str | None, float | None]: def get_next(self) -> tuple[str | None, int | None]:
"""return next element in the queue, if available""" """return next element in the queue, if available"""
result = self.conn.zpopmin(self.key) result = self.conn.zpopmin(self.key)
if not result: if not result:
return None, None return None, None
item, idx = result[0] item, idx = result[0][0], int(result[0][1]) + 1
return item, idx return item, idx