diff --git a/docs/modules/document_loaders/examples/CoNLL-U.ipynb b/docs/modules/document_loaders/examples/CoNLL-U.ipynb new file mode 100644 index 00000000..e0ee735e --- /dev/null +++ b/docs/modules/document_loaders/examples/CoNLL-U.ipynb @@ -0,0 +1,116 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9f98a15e", + "metadata": {}, + "source": [ + "# CoNLL-U\n", + "This is an example of how to load a file in [CoNLL-U](https://universaldependencies.org/format.html) format. The whole file is treated as one document. The example data (`conllu.conllu`) is based on one of the standard UD/CoNLL-U examples." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9b2e33e", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import CoNLLULoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b5eec48", + "metadata": {}, + "outputs": [], + "source": [ + "loader = CoNLLULoader(\"example_data/conllu.conllu\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10f3f725", + "metadata": {}, + "outputs": [], + "source": [ + "document = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acbb3579", + "metadata": {}, + "outputs": [], + "source": [ + "document" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + 
"varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/example_data/conllu.conllu b/docs/modules/document_loaders/examples/example_data/conllu.conllu new file mode 100644 index 00000000..d7f9dc97 --- /dev/null +++ b/docs/modules/document_loaders/examples/example_data/conllu.conllu @@ -0,0 +1,8 @@ +# sent_id = 1 +# text = They buy and sell books. +1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _ +2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _ +3 and and CONJ CC _ 4 cc 4:cc _ +4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _ +5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No +6 . . PUNCT . _ 2 punct 2:punct _ diff --git a/docs/modules/document_loaders/how_to_guides.rst b/docs/modules/document_loaders/how_to_guides.rst index 690cfdb3..bca3f6f8 100644 --- a/docs/modules/document_loaders/how_to_guides.rst +++ b/docs/modules/document_loaders/how_to_guides.rst @@ -57,6 +57,8 @@ There are a lot of different document loaders that LangChain supports. Below are `Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF. +`CoNLL-U <./examples/CoNLL-U.html>`_: A walkthrough of how to load data from a CoNLL-U file. + .. 
class CoNLLULoader(BaseLoader):
    """Load a CoNLL-U file as a single document.

    The surface text is rebuilt from the FORM column (second field) of every
    token row. Tokens are joined with a single space unless the MISC column
    (tenth field) of a token carries the ``SpaceAfter=No`` attribute.
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the CoNLL-U file to read.
        """
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load from file path.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content``
            is the reconstructed text and whose metadata records the source
            path under the ``"source"`` key.
        """
        with open(self.file_path, encoding="utf8") as f:
            # CoNLL-U has no quoting convention. With the csv default, a
            # literal `"` token would open a quoted field and swallow the
            # rows that follow it, so quote handling is disabled.
            rows = list(csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE))

        text = self._rows_to_text(rows)
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]

    @staticmethod
    def _rows_to_text(rows: List[List[str]]) -> str:
        """Rebuild the surface text from tab-split CoNLL-U rows."""
        # Alternating [form, separator, form, separator, ...].
        chunks: List[str] = []
        # Highest word ID covered by the multiword token seen most recently
        # in the current sentence (0 when there is none).
        covered_until = 0

        for row in rows:
            # Blank lines and comment lines mark a sentence boundary, where
            # word IDs restart from 1.
            if not row or row[0].startswith("#"):
                covered_until = 0
                continue
            if len(row) < 2:
                # No FORM column: nothing to emit.
                continue

            token_id = row[0]
            if "." in token_id:
                # Empty node (e.g. "8.1"): not part of the surface text.
                continue
            if "-" in token_id:
                # Multiword token (e.g. "1-2"): emit its surface form once
                # and skip the syntactic words it spans.
                last = token_id.rpartition("-")[2]
                covered_until = int(last) if last.isdigit() else 0
            elif token_id.isdigit() and int(token_id) <= covered_until:
                continue

            # MISC is a "|"-separated attribute list; it may be absent on
            # truncated rows.
            misc = row[9] if len(row) > 9 else ""
            no_space_after = "SpaceAfter=No" in misc.split("|")

            chunks.append(row[1])
            chunks.append("" if no_space_after else " ")

        if chunks:
            # Never leave a trailing space after the final token.
            chunks[-1] = ""
        return "".join(chunks)