From 5b7e3e877b37750a2d9dffe85ac22268b96ce4ee Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Sat, 12 Nov 2022 11:48:48 +0700
Subject: [PATCH] implement basic comment archive

---
 tubearchivist/home/src/es/index_mapping.json |  66 ++++++++++++
 tubearchivist/home/src/index/comments.py     | 101 +++++++++++++++++++
 2 files changed, 167 insertions(+)
 create mode 100644 tubearchivist/home/src/index/comments.py

diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json
index f023eefe..bed6973c 100644
--- a/tubearchivist/home/src/es/index_mapping.json
+++ b/tubearchivist/home/src/es/index_mapping.json
@@ -460,6 +460,72 @@
                 },
                 "number_of_replicas": "0"
             }
+        },
+        {
+            "index_name": "comments",
+            "expected_map": {
+                "youtube_id": {
+                    "type": "keyword"
+                },
+                "comment_last_refresh": {
+                    "type": "date"
+                },
+                "comment_comments": {
+                    "properties": {
+                        "comment_id": {
+                            "type": "keyword"
+                        },
+                        "comment_text": {
+                            "type" : "text"
+                        },
+                        "comment_timestamp": {
+                            "type": "date"
+                        },
+                        "comment_time_text": {
+                            "type" : "text"
+                        },
+                        "comment_likecount": {
+                            "type": "long"
+                        },
+                        "comment_is_favorited": {
+                            "type": "boolean"
+                        },
+                        "comment_author": {
+                            "type": "text",
+                            "fields": {
+                                "keyword": {
+                                    "type": "keyword",
+                                    "ignore_above": 256,
+                                    "normalizer": "to_lower"
+                                }
+                            }
+                        },
+                        "comment_author_id": {
+                            "type": "keyword"
+                        },
+                        "comment_author_thumbnail": {
+                            "type": "keyword"
+                        },
+                        "comment_author_is_uploader": {
+                            "type": "boolean"
+                        },
+                        "comment_parent": {
+                            "type": "keyword"
+                        }
+                    }
+                }
+            },
+            "expected_set": {
+                "analysis": {
+                    "normalizer": {
+                        "to_lower": {
+                            "type": "custom",
+                            "filter": ["lowercase"]
+                        }
+                    }
+                },
+                "number_of_replicas": "0"
+            }
         }
     ]
 }
\ No newline at end of file
diff --git a/tubearchivist/home/src/index/comments.py b/tubearchivist/home/src/index/comments.py
new file mode 100644
index 00000000..05f9a6ed
--- /dev/null
+++ b/tubearchivist/home/src/index/comments.py
@@ -0,0 +1,101 @@
+"""
+Functionality:
+- Download comments
+- Index comments in ES
+- Retrieve comments from ES
+"""
+
+from datetime import datetime
+
+from home.src.download.yt_dlp_base import YtWrap
+from home.src.es.connect import ElasticWrap
+
+
+class Comments:
+    """hold all comments functionality"""
+
+    def __init__(self, youtube_id):
+        self.youtube_id = youtube_id
+        self.es_path = f"ta_comments/_doc/{youtube_id}"
+        self.max_comments = "all,100,all,30"  # see build_yt_obs for format
+        self.json_data = False  # replaced with a dict by build_json
+
+    def build_json(self):
+        """download comments and build json document for es"""
+        comments_format = self.format_comments(self.get_comments())
+
+        self.json_data = {
+            "youtube_id": self.youtube_id,
+            # epoch seconds, strftime("%s") is not portable
+            "comment_last_refresh": int(datetime.now().timestamp()),
+            "comment_comments": comments_format,
+        }
+
+    def build_yt_obs(self):
+        """
+        get extractor config
+        max-comments,max-parents,max-replies,max-replies-per-thread
+        """
+        max_comments_list = [i.strip() for i in self.max_comments.split(",")]
+        comment_sort = "top"
+
+        yt_obs = {
+            "skip_download": True,
+            "quiet": False,
+            "getcomments": True,
+            "extractor_args": {
+                "youtube": {
+                    "max_comments": max_comments_list,
+                    "comment_sort": [comment_sort],
+                }
+            },
+        }
+
+        return yt_obs
+
+    def get_comments(self):
+        """get raw comments from youtube, None if nothing was extracted"""
+        print(f"comments: get comments with format {self.max_comments}")
+        yt_obs = self.build_yt_obs()
+        info_json = YtWrap(yt_obs).extract(self.youtube_id)
+        # assumes YtWrap.extract returns a falsy value on failure, confirm
+        return info_json.get("comments") if info_json else None
+
+    def format_comments(self, comments_raw):
+        """
+        process comments to match format
+        returns an empty list if comments_raw is None or empty,
+        e.g. when the extracted info has no comments key
+        """
+        if not comments_raw:
+            return []
+        return [self.clean_comment(comment) for comment in comments_raw]
+
+    def clean_comment(self, comment):
+        """map a raw comment dict to the indexed fields, time text in utc"""
+        time_text_datetime = datetime.utcfromtimestamp(comment["timestamp"])
+        time_text = time_text_datetime.strftime("%Y-%m-%d %H:%M:%S")
+
+        cleaned_comment = {
+            "comment_id": comment["id"],
+            "comment_text": comment["text"].replace("\xa0", ""),
+            "comment_timestamp": comment["timestamp"],
+            "comment_time_text": time_text,
+            "comment_likecount": comment["like_count"],
+            "comment_is_favorited": comment["is_favorited"],
+            "comment_author": comment["author"],
+            "comment_author_id": comment["author_id"],
+            "comment_author_thumbnail": comment["author_thumbnail"],
+            "comment_author_is_uploader": comment["author_is_uploader"],
+            "comment_parent": comment["parent"],
+        }
+
+        return cleaned_comment
+
+    def upload_comments(self):
+        """upload comments to es"""
+        _, _ = ElasticWrap(self.es_path).put(self.json_data)
+
+    def delete_comments(self):
+        """delete comments from es"""
+        _, _ = ElasticWrap(self.es_path).delete()