From fb3c73d1948fa8cfa1c0b4c337bf910404e82bed Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sat, 18 Feb 2023 10:58:39 -0800 Subject: [PATCH] add srt loader (#1140) --- .../document_loaders/examples/srt.ipynb | 93 +++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/srt.py | 28 ++++++ 3 files changed, 123 insertions(+) create mode 100644 docs/modules/document_loaders/examples/srt.ipynb create mode 100644 langchain/document_loaders/srt.py diff --git a/docs/modules/document_loaders/examples/srt.ipynb b/docs/modules/document_loaders/examples/srt.ipynb new file mode 100644 index 00000000..7eeafad6 --- /dev/null +++ b/docs/modules/document_loaders/examples/srt.ipynb @@ -0,0 +1,93 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4bdaea79", + "metadata": {}, + "source": [ + "# Subtitle Files\n", + "How to load data from subtitle (`.srt`) files" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2cbb7f5c", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import SRTLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "865d8a14", + "metadata": {}, + "outputs": [], + "source": [ + "loader = SRTLoader(\"example_data/Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.srt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "173a9234", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "15e00030", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Corruption discovered\\nat the core of the Banking Clan! 
Reunited, Rush Clovis\\nand Senator A'" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0].page_content[:100]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b7a8dc4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index b64a5ee3..386311f9 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -24,6 +24,7 @@ from langchain.document_loaders.readthedocs import ReadTheDocsLoader from langchain.document_loaders.roam import RoamLoader from langchain.document_loaders.s3_directory import S3DirectoryLoader from langchain.document_loaders.s3_file import S3FileLoader +from langchain.document_loaders.srt import SRTLoader from langchain.document_loaders.telegram import TelegramChatLoader from langchain.document_loaders.text import TextLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader @@ -63,4 +64,5 @@ __all__ = [ "OnlinePDFLoader", "PDFMinerLoader", "TelegramChatLoader", + "SRTLoader", ] diff --git a/langchain/document_loaders/srt.py b/langchain/document_loaders/srt.py new file mode 100644 index 00000000..ce38f1c2 --- /dev/null +++ b/langchain/document_loaders/srt.py @@ -0,0 +1,28 @@ +"""Loader for .srt (subtitle) files.""" +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class SRTLoader(BaseLoader): 
+    """Load a .srt (subtitle) file as one Document holding all subtitle text."""
+
+    def __init__(self, file_path: str):
+        """Store ``file_path``; raise ValueError if `pysrt` cannot be imported."""
+        try:
+            import pysrt  # noqa:F401  (availability check only, fail early)
+        except ImportError:
+            raise ValueError(
+                "package `pysrt` not found, please install it with `pip install pysrt`"
+            )
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Return one Document: subtitle texts joined by spaces, path as source."""
+        import pysrt  # imported here so the dependency is only needed on use
+
+        parsed_info = pysrt.open(self.file_path)
+        text = " ".join(t.text for t in parsed_info)
+        metadata = {"source": self.file_path}
+        return [Document(page_content=text, metadata=metadata)]