mirror of https://github.com/hwchase17/langchain
add CoNLL-U document loader (#1297)
I've added a simple [CoNLL-U](https://universaldependencies.org/format.html) document loader. CoNLL-U is a common format for NLP tasks and is used, for example, in the Universal Dependencies treebank corpora. The loader reads a single file in standard CoNLL-U format and returns a document.pull/1311/head
parent
d29f74114e
commit
fd9975dad7
@ -0,0 +1,116 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9f98a15e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# CoNLL-U\n",
|
||||
"This is an example of how to load a file in [CoNLL-U](https://universaldependencies.org/format.html) format. The whole file is treated as one document. The example data (`conllu.conllu`) is based on one of the standard UD/CoNLL-U examples."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d9b2e33e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import CoNLLULoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5b5eec48",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = CoNLLULoader(\"example_data/conllu.conllu\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "10f3f725",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"document = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "acbb3579",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"document"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.8"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": 1,
|
||||
"nav_menu": {},
|
||||
"number_sections": true,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": false,
|
||||
"title_cell": "Table of Contents",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": false,
|
||||
"toc_position": {},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": false
|
||||
},
|
||||
"varInspector": {
|
||||
"cols": {
|
||||
"lenName": 16,
|
||||
"lenType": 16,
|
||||
"lenVar": 40
|
||||
},
|
||||
"kernels_config": {
|
||||
"python": {
|
||||
"delete_cmd_postfix": "",
|
||||
"delete_cmd_prefix": "del ",
|
||||
"library": "var_list.py",
|
||||
"varRefreshCmd": "print(var_dic_list())"
|
||||
},
|
||||
"r": {
|
||||
"delete_cmd_postfix": ") ",
|
||||
"delete_cmd_prefix": "rm(",
|
||||
"library": "var_list.r",
|
||||
"varRefreshCmd": "cat(var_dic_list()) "
|
||||
}
|
||||
},
|
||||
"types_to_exclude": [
|
||||
"module",
|
||||
"function",
|
||||
"builtin_function_or_method",
|
||||
"instance",
|
||||
"_Feature"
|
||||
],
|
||||
"window_display": false
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -0,0 +1,8 @@
|
||||
# sent_id = 1
|
||||
# text = They buy and sell books.
|
||||
1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _
|
||||
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _
|
||||
3 and and CONJ CC _ 4 cc 4:cc _
|
||||
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _
|
||||
5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No
|
||||
6 . . PUNCT . _ 2 punct 2:punct _
|
@ -0,0 +1,33 @@
|
||||
"""Load CoNLL-U files."""
|
||||
import csv
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class CoNLLULoader(BaseLoader):
|
||||
"""Load CoNLL-U files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path."""
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load from file path."""
|
||||
with open(self.file_path, encoding="utf8") as f:
|
||||
tsv = list(csv.reader(f, delimiter="\t"))
|
||||
|
||||
# If len(line) > 1, the line is not a comment
|
||||
lines = [line for line in tsv if len(line) > 1]
|
||||
|
||||
text = ""
|
||||
for i, line in enumerate(lines):
|
||||
# Do not add a space after a punctuation mark or at the end of the sentence
|
||||
if line[9] == "SpaceAfter=No" or i == len(lines) - 1:
|
||||
text += line[1]
|
||||
else:
|
||||
text += line[1] + " "
|
||||
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
Loading…
Reference in New Issue