mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
84dc2dd059
- **Description:** Add a new format, `CHUNKS`, to `langchain_community.document_loaders.youtube.YoutubeLoader` which creates multiple `Document` objects from YouTube video transcripts (captions), each of a fixed duration. The metadata of each chunk `Document` includes its start time and a URL to that time in the video on the YouTube website. I had implemented this for UMich (@umich-its-ai) in a local module, but it makes sense to contribute this to the LangChain community for all to benefit and to simplify maintenance. - **Issue:** N/A - **Dependencies:** N/A - **Twitter:** lsloan_umich - **Mastodon:** [lsloan@mastodon.social](https://mastodon.social/@lsloan) With regard to **tests and documentation**, most existing features of the `YoutubeLoader` class are not tested. Only the `YoutubeLoader.extract_video_id()` static method had a test. However, while I was waiting for this PR to be reviewed and merged, I had time to add a test for the chunking feature I've proposed in this PR. I have added an example of using chunking to the `docs/docs/integrations/document_loaders/youtube_transcript.ipynb` notebook. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
219 lines
10 KiB
Python
219 lines
10 KiB
Python
import pytest
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_community.document_loaders import YoutubeLoader
|
|
from langchain_community.document_loaders.youtube import TranscriptFormat
|
|
|
|
|
|
@pytest.mark.parametrize(
    "youtube_url, expected_video_id",
    [
        # Standard watch URLs: http/https, with and without www./m. prefixes.
        ("http://www.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("http://youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("http://m.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        # Short youtu.be links.
        ("http://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("https://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("https://www.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        ("https://m.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        ("https://youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        # Short links carrying a `t=` query parameter that must not leak
        # into the extracted id.
        ("http://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),
        ("http://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
        ("https://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),
        # Embed URL on the youtube-nocookie.com domain.
        ("http://www.youtube-nocookie.com/embed/lalOy8Mbfdc?rel=0", "lalOy8Mbfdc"),
        ("https://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
        # Shorts URL.
        ("https://www.youtube.com/shorts/cd0Fy92_w_s", "cd0Fy92_w_s"),
    ],
)
def test_video_id_extraction(youtube_url: str, expected_video_id: str) -> None:
    """Test that the video id is extracted from a YouTube URL.

    Each parametrized case pairs one URL form with the id that
    ``YoutubeLoader.extract_video_id`` is expected to return for it.

    Args:
        youtube_url: The URL handed to ``YoutubeLoader.extract_video_id``.
        expected_video_id: The id the extraction must return for that URL.
    """
    assert YoutubeLoader.extract_video_id(youtube_url) == expected_video_id
|
|
|
|
|
|
def test__get_transcript_chunks() -> None:
    """Test that transcript pieces are grouped into 30-second ``Document`` chunks.

    A ``YoutubeLoader`` is built with ``TranscriptFormat.CHUNKS`` and
    ``chunk_size_seconds=30``; its ``_get_transcript_chunks`` method is fed a
    fixed list of transcript pieces and the resulting documents are compared,
    in order, with the expected chunk documents defined below.
    """
    # Input fixture: one dict per caption piece with its text, start offset
    # and duration.
    test_transcript_pieces = [
        {"text": "♪ Hail to the victors valiant ♪", "start": 3.719, "duration": 5.0},
        {"text": "♪ Hail to the conquering heroes ♪", "start": 8.733, "duration": 5.0},
        {"text": "♪ Hail, hail to Michigan ♪", "start": 14.541, "duration": 5.0},
        {"text": "♪ The leaders and best ♪", "start": 19.785, "duration": 5.0},
        {"text": "♪ Hail to the victors valiant ♪", "start": 25.661, "duration": 4.763},
        {"text": "♪ Hail to the conquering heroes ♪", "start": 30.424, "duration": 5.0},
        {"text": "♪ Hail, hail to Michigan ♪", "start": 36.37, "duration": 4.91},
        {"text": "♪ The champions of the west ♪", "start": 41.28, "duration": 2.232},
        {"text": "♪ Hail to the victors valiant ♪", "start": 43.512, "duration": 4.069},
        {
            "text": "♪ Hail to the conquering heroes ♪",
            "start": 47.581,
            "duration": 4.487,
        },
        {"text": "♪ Hail, hail to Michigan ♪", "start": 52.068, "duration": 4.173},
        {"text": "♪ The leaders and best ♪", "start": 56.241, "duration": 4.542},
        {"text": "♪ Hail to victors valiant ♪", "start": 60.783, "duration": 3.944},
        {
            "text": "♪ Hail to the conquering heroes ♪",
            "start": 64.727,
            "duration": 4.117,
        },
        {"text": "♪ Hail, hail to Michigan ♪", "start": 68.844, "duration": 3.969},
        {"text": "♪ The champions of the west ♪", "start": 72.813, "duration": 4.232},
        {"text": "(choir clapping rhythmically)", "start": 77.045, "duration": 3.186},
        {"text": "- Go blue!", "start": 80.231, "duration": 0.841},
        {"text": "(choir clapping rhythmically)", "start": 81.072, "duration": 3.149},
        {"text": "Go blue!", "start": 84.221, "duration": 0.919},
        {"text": "♪ It's great to be ♪", "start": 85.14, "duration": 1.887},
        {
            "text": "♪ A Michigan Wolverine ♪\n- Go blue!",
            "start": 87.027,
            "duration": 2.07,
        },
        {"text": "♪ It's great to be ♪", "start": 89.097, "duration": 1.922},
        {
            "text": "♪ A Michigan Wolverine ♪\n- Go blue!",
            "start": 91.019,
            "duration": 2.137,
        },
        {
            "text": "♪ It's great to be ♪\n(choir scatting)",
            "start": 93.156,
            "duration": 1.92,
        },
        {
            "text": "♪ a Michigan Wolverine ♪\n(choir scatting)",
            "start": 95.076,
            "duration": 2.118,
        },
        {
            "text": "♪ It's great to be ♪\n(choir scatting)",
            "start": 97.194,
            "duration": 1.85,
        },
        {
            "text": "♪ A Michigan ♪\n(choir scatting)",
            "start": 99.044,
            "duration": 1.003,
        },
        {"text": "- Let's go blue!", "start": 100.047, "duration": 1.295},
        {
            "text": "♪ Hail to the victors valiant ♪",
            "start": 101.342,
            "duration": 1.831,
        },
        {
            "text": "♪ Hail to the conquering heroes ♪",
            "start": 103.173,
            "duration": 2.21,
        },
        {"text": "♪ Hail, hail to Michigan ♪", "start": 105.383, "duration": 1.964},
        {"text": "♪ The leaders and best ♪", "start": 107.347, "duration": 2.21},
        {
            "text": "♪ Hail to the victors valiant ♪",
            "start": 109.557,
            "duration": 1.643,
        },
        {
            "text": "♪ Hail to the conquering heroes ♪",
            "start": 111.2,
            "duration": 2.129,
        },
        {"text": "♪ Hail, hail to Michigan ♪", "start": 113.329, "duration": 2.091},
        {"text": "♪ The champions of the west ♪", "start": 115.42, "duration": 2.254},
        {
            "text": "♪ Hail to the victors valiant ♪",
            "start": 117.674,
            "duration": 4.039,
        },
        {
            "text": "♪ Hail to the conquering heroes ♪",
            "start": 121.713,
            "duration": 4.103,
        },
        {
            "text": "♪ Hail to the blue, hail to the blue ♪",
            "start": 125.816,
            "duration": 1.978,
        },
        {
            "text": "♪ Hail to the blue, hail to the blue ♪",
            "start": 127.794,
            "duration": 2.095,
        },
        {
            "text": "♪ Hail to the blue, hail to the blue ♪",
            "start": 129.889,
            "duration": 1.932,
        },
        {
            "text": "♪ Hail to the blue, hail to the blue ♪",
            "start": 131.821,
            "duration": 2.091,
        },
        {
            "text": "♪ Hail to the blue, hail to the blue ♪",
            "start": 133.912,
            "duration": 2.109,
        },
        {"text": "♪ Hail to the blue, hail ♪", "start": 136.021, "duration": 3.643},
        {"text": "♪ To Michigan ♪", "start": 139.664, "duration": 4.105},
        {"text": "♪ The champions of the west ♪", "start": 143.769, "duration": 3.667},
        {"text": "♪ Go blue ♪", "start": 154.122, "duration": 2.167},
    ]
    # Expected output: one Document per 30-second window. Things this data
    # pins down:
    #   * chunk start times are multiples of 30 (the first chunk is labelled
    #     0 s although the first piece starts at 3.719 s);
    #   * a piece whose start + duration runs past a 30-second boundary
    #     (e.g. the pieces starting at 25.661 s and 89.097 s) is expected in
    #     the following chunk;
    #   * piece texts are joined with single spaces, and newlines inside a
    #     piece are kept as-is;
    #   * "source" is the watch URL with a "&t=<start_seconds>s" suffix.
    test_transcript_chunks = [
        Document(
            page_content="♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪ ♪ The leaders and best ♪",  # noqa: E501
            metadata={
                "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=0s",
                "start_seconds": 0,
                "start_timestamp": "00:00:00",
            },
        ),
        Document(
            page_content="♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪ ♪ The champions of the west ♪ ♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪",  # noqa: E501
            metadata={
                "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=30s",
                "start_seconds": 30,
                "start_timestamp": "00:00:30",
            },
        ),
        Document(
            page_content="♪ The leaders and best ♪ ♪ Hail to victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪ ♪ The champions of the west ♪ (choir clapping rhythmically) - Go blue! (choir clapping rhythmically) Go blue! ♪ It's great to be ♪ ♪ A Michigan Wolverine ♪\n- Go blue!",  # noqa: E501
            metadata={
                "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=60s",
                "start_seconds": 60,
                "start_timestamp": "00:01:00",
            },
        ),
        Document(
            page_content="♪ It's great to be ♪ ♪ A Michigan Wolverine ♪\n- Go blue! ♪ It's great to be ♪\n(choir scatting) ♪ a Michigan Wolverine ♪\n(choir scatting) ♪ It's great to be ♪\n(choir scatting) ♪ A Michigan ♪\n(choir scatting) - Let's go blue! ♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪ ♪ The leaders and best ♪ ♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪ ♪ The champions of the west ♪",  # noqa: E501
            metadata={
                "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=90s",
                "start_seconds": 90,
                "start_timestamp": "00:01:30",
            },
        ),
        Document(
            page_content="♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail to the blue, hail to the blue ♪ ♪ Hail to the blue, hail to the blue ♪ ♪ Hail to the blue, hail to the blue ♪ ♪ Hail to the blue, hail to the blue ♪ ♪ Hail to the blue, hail to the blue ♪ ♪ Hail to the blue, hail ♪ ♪ To Michigan ♪ ♪ The champions of the west ♪",  # noqa: E501
            metadata={
                "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=120s",
                "start_seconds": 120,
                "start_timestamp": "00:02:00",
            },
        ),
        Document(
            page_content="♪ Go blue ♪",
            metadata={
                "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=150s",
                "start_seconds": 150,
                "start_timestamp": "00:02:30",
            },
        ),
    ]

    ytl = YoutubeLoader(
        "TKCMw0utiak",
        transcript_format=TranscriptFormat.CHUNKS,
        chunk_size_seconds=30,
    )
    # Wrap the result in list() so it can be compared with the expected list
    # whatever iterable type the method returns.
    assert (
        list(ytl._get_transcript_chunks(test_transcript_pieces))
        == test_transcript_chunks
    )
|