enhancement: add elements mode to UnstructuredURLLoader (#3456)

### Summary

Updates the `UnstructuredURLLoader` to include an "elements" mode that
retains additional metadata from `unstructured`. This makes
`UnstructuredURLLoader` consistent with the other unstructured loaders,
which also support "elements" mode. The mode was patched into the existing
`UnstructuredURLLoader` class instead of inheriting from
`UnstructuredBaseLoader` because doing so significantly simplified the
implementation.

### Testing

This should still work and show the URL in the `source` field of the metadata:

```python
from langchain.document_loaders import UnstructuredURLLoader

urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"]

loader = UnstructuredURLLoader(urls=urls, headers={"Accept": "application/json"}, strategy="fast")
docs = loader.load()
print(docs[0].page_content[:1000])
docs[0].metadata
``` 

This should now work and show additional metadata from `unstructured`:


```python
from langchain.document_loaders import UnstructuredURLLoader

urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"]

loader = UnstructuredURLLoader(urls=urls, headers={"Accept": "application/json"}, strategy="fast", mode="elements")
docs = loader.load()
print(docs[0].page_content[:1000])
docs[0].metadata
```
This commit is contained in:
Matt Robinson 2023-04-27 01:09:45 -04:00 committed by GitHub
parent a3e3f26090
commit 8e10ac422e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -15,6 +15,7 @@ class UnstructuredURLLoader(BaseLoader):
self, self,
urls: List[str], urls: List[str],
continue_on_failure: bool = True, continue_on_failure: bool = True,
mode: str = "single",
**unstructured_kwargs: Any, **unstructured_kwargs: Any,
): ):
"""Initialize with file path.""" """Initialize with file path."""
@ -29,6 +30,9 @@ class UnstructuredURLLoader(BaseLoader):
"`pip install unstructured`" "`pip install unstructured`"
) )
self._validate_mode(mode)
self.mode = mode
headers = unstructured_kwargs.pop("headers", {}) headers = unstructured_kwargs.pop("headers", {})
if len(headers.keys()) != 0: if len(headers.keys()) != 0:
warn_about_headers = False warn_about_headers = False
@ -48,6 +52,13 @@ class UnstructuredURLLoader(BaseLoader):
self.headers = headers self.headers = headers
self.unstructured_kwargs = unstructured_kwargs self.unstructured_kwargs = unstructured_kwargs
def _validate_mode(self, mode: str) -> None:
_valid_modes = {"single", "elements"}
if mode not in _valid_modes:
raise ValueError(
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
)
def __is_headers_available_for_html(self) -> bool: def __is_headers_available_for_html(self) -> bool:
_unstructured_version = self.__version.split("-")[0] _unstructured_version = self.__version.split("-")[0]
unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")]) unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
@ -94,7 +105,15 @@ class UnstructuredURLLoader(BaseLoader):
continue continue
else: else:
raise e raise e
text = "\n\n".join([str(el) for el in elements])
metadata = {"source": url} if self.mode == "single":
docs.append(Document(page_content=text, metadata=metadata)) text = "\n\n".join([str(el) for el in elements])
metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata))
elif self.mode == "elements":
for element in elements:
metadata = element.metadata.to_dict()
metadata["category"] = element.category
docs.append(Document(page_content=str(element), metadata=metadata))
return docs return docs