diff --git a/docs/modules/indexes/document_loaders.rst b/docs/modules/indexes/document_loaders.rst index ea2ba661..22b106a7 100644 --- a/docs/modules/indexes/document_loaders.rst +++ b/docs/modules/indexes/document_loaders.rst @@ -30,6 +30,7 @@ For detailed instructions on how to get set up with Unstructured, see installati :maxdepth: 1 :glob: + ./document_loaders/examples/audio.ipynb ./document_loaders/examples/conll-u.ipynb ./document_loaders/examples/copypaste.ipynb ./document_loaders/examples/csv.ipynb diff --git a/docs/modules/indexes/document_loaders/examples/audio.ipynb b/docs/modules/indexes/document_loaders/examples/audio.ipynb new file mode 100644 index 00000000..22f11a62 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/audio.ipynb @@ -0,0 +1,97 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "14bfa52e", + "metadata": {}, + "source": [ + "# OpenAIWhisperParser\n", + "\n", + "This notebook goes over how to load data from an audio file, such as an mp3.\n", + "\n", + "We use the `OpenAIWhisperParser`, which will use the [OpenAI Whisper API](https://platform.openai.com/docs/guides/speech-to-text) to transcribe audio to text.\n", + "\n", + "Note: You will need to have an `OPENAI_API_KEY` supplied." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e2257932", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.generic import GenericLoader\n", + "from langchain.document_loaders.parsers import OpenAIWhisperParser" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e21b9c4d", + "metadata": {}, + "outputs": [], + "source": [ + "# Directory contains audio for the first 20 minutes of one Andrej Karpathy video \n", + "# \"The spelled-out intro to neural networks and backpropagation: building micrograd\"\n", + "# https://www.youtube.com/watch?v=VMj-3S1tku0\n", + "audio_file_path = \"example_data/\"\n", + "loader = GenericLoader.from_filesystem(audio_file_path, glob=\"*.mp3\", parser=OpenAIWhisperParser())" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f50fbf64", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ca414073", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content=\"Hello, my name is Andrej and I've been training deep neural networks for a bit more than a decade. And in this lecture I'd like to show you what neural network training looks like under the hood. So in particular we are going to start with a blank Jupyter notebook and by the end of this lecture we will define and train a neural net and you'll get to see everything that goes on under the hood and exactly sort of how that works on an intuitive level. Now specifically what I would like to do is I would like to take you through building of micrograd. Now micrograd is this library that I released on GitHub about two years ago but at the time I only uploaded the source code and you'd have to go in by yourself and really figure out how it works. So in this lecture I will take you through it step by step and kind of comment on all the pieces of it. So what is micrograd and why is it interesting? Thank you. Micrograd is basically an autograd engine. Autograd is short for automatic gradient and really what it does is it implements back propagation. 
Now back propagation is this algorithm that allows you to efficiently evaluate the gradient of some kind of a loss function with respect to the weights of a neural network and what that allows us to do then is we can iteratively tune the weights of that neural network to minimize the loss function and therefore improve the accuracy of the network. So back propagation would be at the mathematical core of any modern deep neural network library like say PyTorch or JAX. So the functionality of micrograd is I think best illustrated by an example. So if we just scroll down here you'll see that micrograd basically allows you to build out mathematical expressions and here what we are doing is we have an expression that we're building out where you have two inputs a and b and you'll see that a and b are negative four and two but we are wrapping those values into this value object that we are going to build out as part of micrograd. So this value object will wrap the numbers themselves and then we are going to build out a mathematical expression here where a and b are transformed into c d and eventually e f and g and I'm showing some of the functionality of micrograd and the operations that it supports. So you can add two value objects, you can multiply them, you can raise them to a constant power, you can offset by one, negate, squash at zero, square, divide by constant, divide by it, etc. And so we're building out an expression graph with these two inputs a and b and we're creating an output value of g and micrograd will in the background build out this entire mathematical expression. So it will for example know that c is also a value, c was a result of an addition operation and the child nodes of c are a and b because the and it will maintain pointers to a and b value objects. So we'll basically know exactly how all of this is laid out and then not only can we do what we call the forward pass where we actually look at the value of g of course, that's pretty straightforward, we will access that using the dot data attribute and so the output of the forward pass, the value of g, is 24.7 it turns out. But the big deal is that we can also take this g value object and we can call dot backward and this will basically initialize backpropagation at the node g. And what backpropagation is going to do is it's going to start at g and it's going to go backwards through that expression graph and it's going to recursively apply the chain rule from calculus. And what that allows us to do then is we're going to evaluate basically the derivative of g with respect to all the internal nodes like e, d, and c but also with respect to the inputs a and b. And then we can actually query this derivative of g with respect to a, for example that's a.grad, in this case it happens to be 138, and the derivative of g with respect to b which also happens to be here 645. And this derivative we'll see soon is very important information because it's telling us how a and b are affecting g through this mathematical expression. So in particular a.grad is 138, so if we slightly nudge a and make it slightly larger, 138 is telling us that g will grow and the slope of that growth is going to be 138 and the slope of growth of b is going to be 645. So that's going to tell us about how g will respond if a and b get tweaked a tiny amount in a positive direction. Now you might be confused about what this expression is that we built out here and this expression by the way is completely meaningless. 
I just made it up, I'm just flexing about the kinds of operations that are supported by micrograd. What we actually really care about are neural networks but it turns out that neural networks are just mathematical expressions just like this one but actually slightly a bit less crazy even. Neural networks are just a mathematical expression, they take the input data as an input and they take the weights of a neural network as an input and it's a mathematical expression and the output are your predictions of your neural net or the loss function, we'll see this in a bit. But basically neural networks just happen to be a certain class of mathematical expressions but back propagation is actually significantly more general. It doesn't actually care about neural networks at all, it only cares about arbitrary mathematical expressions and then we happen to use that machinery for training of neural networks. Now one more note I would like to make at this stage is that as you see here micrograd is a scalar valued autograd engine so it's working on the you know level of individual scalars like negative 4 and 2 and we're taking neural nets and we're breaking them down all the way to these atoms of individual scalars and all the little pluses and times and it's just excessive and so obviously you would never be doing any of this in production. It's really just done for pedagogical reasons because it allows us to not have to deal with these n-dimensional tensors that you would use in modern deep neural network library. So this is really done so that you understand and refactor out back propagation and chain rule and understanding of neural training and then if you actually want to train bigger networks you have to be using these tensors but none of the math changes, this is done purely for efficiency. We are basically taking all the scalars all the scalar values we're packaging them up into tensors which are just arrays of these scalars and then because we have these large arrays we're making operations on those large arrays that allows us to take advantage of the parallelism in a computer and all those operations can be done in parallel and then the whole thing runs faster but really none of the math changes and they're done purely for efficiency so I don't think that it's pedagogically useful to be dealing with tensors from scratch and I think and that's why I fundamentally wrote micrograd because you can understand how things work at the fundamental level and then you can speed it up later. Okay so here's the fun part. My claim is that micrograd is what you need to train neural networks and everything else is just efficiency so you'd think that micrograd would be a very complex piece of code and that turns out to not be the case. So if we just go to micrograd and you'll see that there's only two files here in micrograd. This is the actual engine, it doesn't know anything about neural nets and this is the entire neural nets library on top of micrograd. So engine and nn.py. So the actual back propagation autograd engine that gives you the power of neural networks is literally 100 lines of code of like very simple python which we'll understand by the end of this lecture and then nn.py, this neural network library built on top of the autograd engine is like a joke. It's like we have to define what is a neuron and then we have to define what is a layer of neurons and then we define what is a multilayer perceptron which is just a sequence of layers of neurons and so it's just a total joke. 
So basically there's a lot of power that comes from only 150 lines of code and that's all you need to understand to understand neural network training and everything else is just efficiency and of course there's a lot to efficiency but fundamentally that's all that's happening. Okay so now let's dive right in and implement micrograd step by step. The first thing I'd like to do is I'd like to make sure that you have a very good understanding intuitively of what a derivative is and exactly what information it gives you. So let's start with some basic imports that I copy-paste in every jupyter notebook always and let's define a function, a scalar valued function f of x as follows. So I just made this up randomly. I just wanted a scalar valued function that takes a single scalar x and returns a single scalar y and we can call this function of course so we can pass in say 3.0 and get 20 back. Now we can also plot this function to get a sense of its shape. You can tell from the mathematical expression that this is probably a parabola, it's a quadratic and so if we just create a set of scalar values that we can feed in using for example a range from negative 5 to 5 in steps of 0.25. So this is so x is just from negative 5 to 5 not including 5 in steps of 0.25 and we can actually call this function on this numpy array as well so we get a set of y's if we call f on x's and these y's are basically also applying the function on every one of these elements independently and we can plot this using matplotlib. So plt.plot x's and y's and we get a nice parabola. So previously here we fed in 3.0 somewhere here and we received 20 back which is here the y-coordinate. So now I'd like to think through what is the derivative of this function at any single input point x. So what is the derivative at different points x of this function? Now if you remember back to your calculus class you've probably derived derivatives so we take this mathematical expression 3x squared minus 4x plus 5 and you would write out on a piece of paper and you would apply the product rule and all the other rules and derive the mathematical expression of the great derivative of the original function and then you could plug in different texts and see what the derivative is. We're not going to actually do that because no one in neural networks actually writes out the expression for the neural net. It would be a massive expression, it would be thousands, tens of thousands of terms. No one actually derives the derivative of course and so we're not going to take this kind of like symbolic approach. Instead what I'd like to do is I'd like to look at the definition of derivative and just make sure that we really understand what the derivative is measuring, what it's telling you about the function. And so if we just look up derivative we see that okay so this is not a very good definition of derivative. This is a definition of what it means to be differentiable but if you remember from your calculus it is the limit as h goes to zero of f of x plus h minus f of x over h. 
So basically what it's saying is if you slightly bump up your at some point x that you're interested in or a and if you slightly bump up you know you slightly increase it by small number h how does the function respond with what sensitivity does it respond where is the slope at that point does the function go up or does it go down and by how much and that's the slope of that function the the slope of that response at that point and so we can basically evaluate the derivative here numerically by taking a very small h of course the definition would ask us to take h to zero we're just going to pick a very small h 0.001 and let's say we're interested in 0.3.0 so we can look at f of x of course as 20 and now f of x plus h so if we slightly nudge x in a positive direction how is the function going to respond and just looking at this do you expand do you expect f of x plus h to be slightly greater than 20 or do you expect it to be slightly lower than 20 and since this 3 is here and this is 20 if we slightly go positively the function will respond positively so you'd expect this to be slightly greater than 20 and now by how much is telling you the sort of the the strength of that slope right the the size of the slope so f of x plus h minus f of x this is how much the function responded in a positive direction and we have to normalize by the run so we have the rise over run to get the slope so this of course is just a numerical approximation of the slope because we have to make h very very small to converge to the exact amount now if i'm doing too many zeros at some point i'm going to i'm going to get an incorrect answer because we're using floating point arithmetic and the representations of all these numbers in computer memory is finite and at some point we get into trouble so we can converge towards the right answer with this approach but basically at 3 the slope is 14 and you can see that by taking 3x squared minus 4x plus 5 and differentiating it in our head so 3x squared would be 6x minus 4 and then we plug in x equals 3 so that's 18 minus 4 is 14 so this is correct so that's at 3 now how about the slope at say negative 3 would you expect what would you expect for the slope now telling the exact value is really hard but what is the sign of that slope so at negative 3 if we slightly go in the positive direction at x the function would actually go down and so that tells you that the slope would be negative so we'll get a slight number below below 20 and so if we take the slope we expect something negative negative 22 okay and at some point here of course the slope would be zero now for this specific function i looked it up previously and it's at point uh 2 over 3 so at roughly 2 over 3 that's somewhere here this this derivative would be zero so basically at that precise point yeah at that precise point if we nudge in a positive direction the function doesn't respond this stays the same almost and so that's why the slope is zero okay now let's look at a bit more complex case so we're going to start you know complexifying a bit so now we have a function here with output variable d that is a function of three scalar inputs a b and c so a b and c are some specific values three inputs into our expression graph and a single output d and so if we just print d we get four and now what i like to do is i'd like to again look at the derivatives of d with respect to a b and c and uh think through uh again just the intuition of what this derivative is telling us so in order to evaluate this derivative we're 
going to get a bit hacky here we're going to again have a very small value of h and then we're going to fix the inputs at some values that we're interested in so these are the this is the point a b c at which we're going to be evaluating the the derivative of d with respect to all a b and c at that point so there are the inputs and now we have d1 is that expression and then we're going to for example look at the derivative of d with respect to a so we'll take a and we'll bump it by h and then we'll get d2 to be the exact same function and now we're going to print um you know f1 d1 is d1 d2 is d2 and print slope so the derivative or slope here will be um of course d2 minus d1 divide h so d2 minus d1 is how much the function increased uh when we bumped the uh the specific input that we're interested in by a tiny amount and this is the normalized by this is the normalized by h to get the slope so um yeah so this so i just run this we're going to print d1 which we know is four now d2 will be bumped a will be bumped by h so let's just think through a little bit uh what d2 will be uh printed out here in particular d1 will be four will d2 be a number slightly greater than four or slightly lower than four and that's going to tell us the sign of the derivative so we're bumping a by h b is minus three c is 10 so you can just intuitively think through this derivative and what it's doing a will be slightly more positive and but b is a negative number so if a is slightly more positive because b is negative three we're actually going to be adding less to d so you'd actually expect that the value of the function will go down so let's just see this yeah and so we went from four to 3.9996 and that tells you that the slope will be negative and then um will be a negative number because we went down and then the exact number of slope will be exact amount of slope is negative three and you can also convince yourself that negative three is the right answer um mathematically and analytically because if you have a times b plus c and you are you know you have calculus then uh differentiating a times b plus c with respect to a gives you just b and indeed the value of b is negative three which is the derivative that we have so you can tell that that's correct so now if we do this with b so if we bump b by a little bit in a positive direction we'd get different slopes so what is the influence of b on the output d so if we bump b by a tiny amount in a positive direction then because a is positive we'll be adding more to d right so um and now what is the what is the sensitivity what is the slope of that addition and it might not surprise you that this should be two and why is it two because d of d by db differentiating with respect to b would be would give us a and the value of a is two so that's also working well and then if c gets bumped a tiny amount in h by h then of course a times b is unaffected and now c becomes slightly bit higher what does that do to the function it makes it slightly bit higher because we're simply adding c and it makes it slightly bit higher by the exact same amount that we added to c and so that tells you that the slope is one that will be the the rate at which d will increase as we scale c okay so we now have some intuitive sense of what this derivative is telling you about the function and we'd like to move to neural networks now as i mentioned neural networks will be pretty massive expressions mathematical expressions so we need some data structures that maintain these expressions and 
that's what we're going to start to build out now so we're going to build out this value object that i showed you in the readme page of micrograd so let me copy paste a skeleton of the first very simple value object so class value takes a single scalar value that it wraps and keeps track of and that's it so we can for example do value of 2.0 and then we can get we can look at its content and python will internally use the wrapper function to return this string like that so this is a value object that we're going to call value object\", metadata={'source': 'example_data/Lecture_1_0.mp3'})]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/parsers/__init__.py b/langchain/document_loaders/parsers/__init__.py index 94ac136d..4cddd93f 100644 --- a/langchain/document_loaders/parsers/__init__.py +++ b/langchain/document_loaders/parsers/__init__.py @@ -1,3 +1,4 @@ +from langchain.document_loaders.parsers.audio import OpenAIWhisperParser from langchain.document_loaders.parsers.html import BS4HTMLParser from langchain.document_loaders.parsers.pdf import ( PDFMinerParser, @@ -9,6 +10,7 @@ from langchain.document_loaders.parsers.pdf import ( __all__ = [ "BS4HTMLParser", + "OpenAIWhisperParser", "PDFMinerParser", "PDFPlumberParser", "PyMuPDFParser", diff --git a/langchain/document_loaders/parsers/audio.py b/langchain/document_loaders/parsers/audio.py new file mode 100644 index 00000000..d55918c2 --- /dev/null +++ b/langchain/document_loaders/parsers/audio.py @@ -0,0 +1,21 @@ +from typing import Iterator + +from langchain.document_loaders.base import BaseBlobParser +from langchain.document_loaders.blob_loaders import Blob +from langchain.schema import Document + + +class OpenAIWhisperParser(BaseBlobParser): + """Transcribe and parse audio files. + Audio transcription is with OpenAI Whisper model.""" + + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """Lazily parse the blob.""" + + import openai + + with blob.as_bytes_io() as f: + transcript = openai.Audio.transcribe("whisper-1", f) + yield Document( + page_content=transcript.text, metadata={"source": blob.source} + ) diff --git a/tests/unit_tests/document_loaders/parsers/test_public_api.py b/tests/unit_tests/document_loaders/parsers/test_public_api.py index 344b6281..a737a9ff 100644 --- a/tests/unit_tests/document_loaders/parsers/test_public_api.py +++ b/tests/unit_tests/document_loaders/parsers/test_public_api.py @@ -5,6 +5,7 @@ def test_parsers_public_api_correct() -> None: """Test public API of parsers for breaking changes.""" assert set(__all__) == { "BS4HTMLParser", + "OpenAIWhisperParser", "PyPDFParser", "PDFMinerParser", "PyMuPDFParser",