forked from Archives/langchain
add CoNLL-U document loader (#1297)
I've added a simple [CoNLL-U](https://universaldependencies.org/format.html) document loader. CoNLL-U is a common format for NLP tasks and is used, for example, in the Universal Dependencies treebank corpora. The loader reads a single file in standard CoNLL-U format and returns a document.docker-utility-pexpect
parent
d29f74114e
commit
fd9975dad7
@ -0,0 +1,116 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "9f98a15e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# CoNLL-U\n",
|
||||||
|
"This is an example of how to load a file in [CoNLL-U](https://universaldependencies.org/format.html) format. The whole file is treated as one document. The example data (`conllu.conllu`) is based on one of the standard UD/CoNLL-U examples."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "d9b2e33e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import CoNLLULoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "5b5eec48",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = CoNLLULoader(\"example_data/conllu.conllu\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "10f3f725",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"document = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "acbb3579",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"document"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.8"
|
||||||
|
},
|
||||||
|
"toc": {
|
||||||
|
"base_numbering": 1,
|
||||||
|
"nav_menu": {},
|
||||||
|
"number_sections": true,
|
||||||
|
"sideBar": true,
|
||||||
|
"skip_h1_title": false,
|
||||||
|
"title_cell": "Table of Contents",
|
||||||
|
"title_sidebar": "Contents",
|
||||||
|
"toc_cell": false,
|
||||||
|
"toc_position": {},
|
||||||
|
"toc_section_display": true,
|
||||||
|
"toc_window_display": false
|
||||||
|
},
|
||||||
|
"varInspector": {
|
||||||
|
"cols": {
|
||||||
|
"lenName": 16,
|
||||||
|
"lenType": 16,
|
||||||
|
"lenVar": 40
|
||||||
|
},
|
||||||
|
"kernels_config": {
|
||||||
|
"python": {
|
||||||
|
"delete_cmd_postfix": "",
|
||||||
|
"delete_cmd_prefix": "del ",
|
||||||
|
"library": "var_list.py",
|
||||||
|
"varRefreshCmd": "print(var_dic_list())"
|
||||||
|
},
|
||||||
|
"r": {
|
||||||
|
"delete_cmd_postfix": ") ",
|
||||||
|
"delete_cmd_prefix": "rm(",
|
||||||
|
"library": "var_list.r",
|
||||||
|
"varRefreshCmd": "cat(var_dic_list()) "
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"types_to_exclude": [
|
||||||
|
"module",
|
||||||
|
"function",
|
||||||
|
"builtin_function_or_method",
|
||||||
|
"instance",
|
||||||
|
"_Feature"
|
||||||
|
],
|
||||||
|
"window_display": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,8 @@
|
|||||||
|
# sent_id = 1
|
||||||
|
# text = They buy and sell books.
|
||||||
|
1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _
|
||||||
|
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _
|
||||||
|
3 and and CONJ CC _ 4 cc 4:cc _
|
||||||
|
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _
|
||||||
|
5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No
|
||||||
|
6 . . PUNCT . _ 2 punct 2:punct _
|
@ -0,0 +1,33 @@
|
|||||||
|
"""Load CoNLL-U files."""
|
||||||
|
import csv
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class CoNLLULoader(BaseLoader):
|
||||||
|
"""Load CoNLL-U files."""
|
||||||
|
|
||||||
|
def __init__(self, file_path: str):
|
||||||
|
"""Initialize with file path."""
|
||||||
|
self.file_path = file_path
|
||||||
|
|
||||||
|
def load(self) -> List[Document]:
|
||||||
|
"""Load from file path."""
|
||||||
|
with open(self.file_path, encoding="utf8") as f:
|
||||||
|
tsv = list(csv.reader(f, delimiter="\t"))
|
||||||
|
|
||||||
|
# If len(line) > 1, the line is not a comment
|
||||||
|
lines = [line for line in tsv if len(line) > 1]
|
||||||
|
|
||||||
|
text = ""
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
# Do not add a space after a punctuation mark or at the end of the sentence
|
||||||
|
if line[9] == "SpaceAfter=No" or i == len(lines) - 1:
|
||||||
|
text += line[1]
|
||||||
|
else:
|
||||||
|
text += line[1] + " "
|
||||||
|
|
||||||
|
metadata = {"source": self.file_path}
|
||||||
|
return [Document(page_content=text, metadata=metadata)]
|
Loading…
Reference in New Issue