add CoNLL-U document loader (#1297)

I've added a simple
[CoNLL-U](https://universaldependencies.org/format.html) document
loader. CoNLL-U is a common format for NLP tasks and is used, for
example, in the Universal Dependencies treebank corpora. The loader
reads a single file in standard CoNLL-U format and returns a document.
docker-utility-pexpect
Ingo Kleiber 1 year ago committed by GitHub
parent d29f74114e
commit fd9975dad7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,116 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9f98a15e",
"metadata": {},
"source": [
"# CoNLL-U\n",
"This is an example of how to load a file in [CoNLL-U](https://universaldependencies.org/format.html) format. The whole file is treated as one document. The example data (`conllu.conllu`) is based on one of the standard UD/CoNLL-U examples."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d9b2e33e",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import CoNLLULoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b5eec48",
"metadata": {},
"outputs": [],
"source": [
"loader = CoNLLULoader(\"example_data/conllu.conllu\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10f3f725",
"metadata": {},
"outputs": [],
"source": [
"document = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "acbb3579",
"metadata": {},
"outputs": [],
"source": [
"document"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -0,0 +1,8 @@
# sent_id = 1
# text = They buy and sell books.
1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _
3 and and CONJ CC _ 4 cc 4:cc _
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _
5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No
6 . . PUNCT . _ 2 punct 2:punct _

@ -57,6 +57,8 @@ There are a lot of different document loaders that LangChain supports. Below are
`Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF.
`CoNLL-U <./examples/CoNLL-U.html>`_: A walkthrough of how to load data from a ConLL-U file.
.. toctree::
:maxdepth: 1
:glob:

@ -3,6 +3,7 @@
from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
from langchain.document_loaders.azlyrics import AZLyricsLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
from langchain.document_loaders.conllu import CoNLLULoader
from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.docx import UnstructuredDocxLoader
from langchain.document_loaders.email import UnstructuredEmailLoader
@ -77,4 +78,5 @@ __all__ = [
"SRTLoader",
"FacebookChatLoader",
"NotebookLoader",
"CoNLLULoader",
]

@ -0,0 +1,33 @@
"""Load CoNLL-U files."""
import csv
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class CoNLLULoader(BaseLoader):
"""Load CoNLL-U files."""
def __init__(self, file_path: str):
"""Initialize with file path."""
self.file_path = file_path
def load(self) -> List[Document]:
"""Load from file path."""
with open(self.file_path, encoding="utf8") as f:
tsv = list(csv.reader(f, delimiter="\t"))
# If len(line) > 1, the line is not a comment
lines = [line for line in tsv if len(line) > 1]
text = ""
for i, line in enumerate(lines):
# Do not add a space after a punctuation mark or at the end of the sentence
if line[9] == "SpaceAfter=No" or i == len(lines) - 1:
text += line[1]
else:
text += line[1] + " "
metadata = {"source": self.file_path}
return [Document(page_content=text, metadata=metadata)]
Loading…
Cancel
Save