From fb3c73d1948fa8cfa1c0b4c337bf910404e82bed Mon Sep 17 00:00:00 2001
From: Harrison Chase <hw.chase.17@gmail.com>
Date: Sat, 18 Feb 2023 10:58:39 -0800
Subject: [PATCH] add srt loader (#1140)

---
 .../document_loaders/examples/srt.ipynb       | 93 +++++++++++++++++++
 langchain/document_loaders/__init__.py        |  2 +
 langchain/document_loaders/srt.py             | 28 ++++++
 3 files changed, 123 insertions(+)
 create mode 100644 docs/modules/document_loaders/examples/srt.ipynb
 create mode 100644 langchain/document_loaders/srt.py
diff --git a/docs/modules/document_loaders/examples/srt.ipynb b/docs/modules/document_loaders/examples/srt.ipynb
new file mode 100644
index 00000000..7eeafad6
--- /dev/null
+++ b/docs/modules/document_loaders/examples/srt.ipynb
@@ -0,0 +1,93 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4bdaea79",
+   "metadata": {},
+   "source": [
+    "# Subtitle Files\n",
+    "How to load data from subtitle (`.srt`) files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "2cbb7f5c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import SRTLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "865d8a14",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = SRTLoader(\"example_data/Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.srt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "173a9234",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "15e00030",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'<i>Corruption discovered\\nat the core of the Banking Clan!</i> <i>Reunited, Rush Clovis\\nand Senator A'"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs[0].page_content[:100]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b7a8dc4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index b64a5ee3..386311f9 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -24,6 +24,7 @@ from langchain.document_loaders.readthedocs import ReadTheDocsLoader
 from langchain.document_loaders.roam import RoamLoader
 from langchain.document_loaders.s3_directory import S3DirectoryLoader
 from langchain.document_loaders.s3_file import S3FileLoader
+from langchain.document_loaders.srt import SRTLoader
 from langchain.document_loaders.telegram import TelegramChatLoader
 from langchain.document_loaders.text import TextLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
@@ -63,4 +64,5 @@ __all__ = [
     "OnlinePDFLoader",
     "PDFMinerLoader",
     "TelegramChatLoader",
+    "SRTLoader",
 ]
diff --git a/langchain/document_loaders/srt.py b/langchain/document_loaders/srt.py
new file mode 100644
index 00000000..ce38f1c2
--- /dev/null
+++ b/langchain/document_loaders/srt.py
@@ -0,0 +1,28 @@
+"""Loader for .srt (subtitle) files."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class SRTLoader(BaseLoader):
+    """Load a .srt (subtitle) file as one Document of space-joined text."""
+
+    def __init__(self, file_path: str):
+        """Store ``file_path``; raise ValueError if pysrt cannot be imported."""
+        try:  # probe the dependency now so a missing install fails early
+            import pysrt  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "package `pysrt` not found, please install it with `pip install pysrt`"
+            )
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Parse the file with pysrt and return it as a one-element list."""
+        import pysrt
+
+        parsed_info = pysrt.open(self.file_path)
+        text = " ".join([t.text for t in parsed_info])
+        metadata = {"source": self.file_path}
+        return [Document(page_content=text, metadata=metadata)]