fix(document_loaders/telegram): fix pandas calls + add tests (#4806)

# Fix Telegram API loader + add tests.
I was testing this integration and it was broken with next error:
```python
message_threads = loader._get_message_threads(df)
KeyError: False
```
Also, this particular loader didn't have any tests / related group in
poetry, so I added those as well.

@hwchase17 / @eyurtsev please take a look on this fix PR.

---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
dynamic_agent_tools
Raduan Al-Shedivat 1 year ago committed by GitHub
parent 206c87d525
commit 00c6ec8a2d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -59,9 +59,11 @@
"id": "3e64cac2", "id": "3e64cac2",
"metadata": {}, "metadata": {},
"source": [ "source": [
"`TelegramChatApiLoader` loads data directly from any specified channel from Telegram. In order to export the data, you will need to authenticate your Telegram account. \n", "`TelegramChatApiLoader` loads data directly from any specified chat from Telegram. In order to export the data, you will need to authenticate your Telegram account. \n",
"\n", "\n",
"You can get the API_HASH and API_ID from https://my.telegram.org/auth?to=apps\n", "You can get the API_HASH and API_ID from https://my.telegram.org/auth?to=apps\n",
"\n",
"chat_entity recommended to be the [entity](https://docs.telethon.dev/en/stable/concepts/entities.html?highlight=Entity#what-is-an-entity) of a channel.\n",
"\n" "\n"
] ]
}, },
@ -72,10 +74,12 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"loader = TelegramChatApiLoader(user_name =\"\"\\\n", "loader = TelegramChatApiLoader(\n",
" chat_url=\"<CHAT_URL>\",\\\n", " chat_entity=\"<CHAT_URL>\", # recommended to use Entity here\n",
" api_hash=\"<API HASH>\",\\\n", " api_hash=\"<API HASH >\", \n",
" api_id=\"<API_ID>\")" " api_id=\"<API_ID>\", \n",
" user_name =\"\", # needed only for caching the session.\n",
")"
] ]
}, },
{ {
@ -113,10 +117,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.13" "version": "3.9.13"
} }
}, },
"nbformat": 4, "nbformat": 4,

@ -12,6 +12,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
if TYPE_CHECKING: if TYPE_CHECKING:
import pandas as pd import pandas as pd
from telethon.hints import EntityLike
def concatenate_rows(row: dict) -> str: def concatenate_rows(row: dict) -> str:
@ -82,16 +83,18 @@ class TelegramChatApiLoader(BaseLoader):
def __init__( def __init__(
self, self,
chat_url: Optional[str] = None, chat_entity: Optional[EntityLike] = None,
api_id: Optional[int] = None, api_id: Optional[int] = None,
api_hash: Optional[str] = None, api_hash: Optional[str] = None,
username: Optional[str] = None, username: Optional[str] = None,
file_path: str = "telegram_data.json",
): ):
"""Initialize with API parameters.""" """Initialize with API parameters."""
self.chat_url = chat_url self.chat_entity = chat_entity
self.api_id = api_id self.api_id = api_id
self.api_hash = api_hash self.api_hash = api_hash
self.username = username self.username = username
self.file_path = file_path
async def fetch_data_from_telegram(self) -> None: async def fetch_data_from_telegram(self) -> None:
"""Fetch data from Telegram API and save it as a JSON file.""" """Fetch data from Telegram API and save it as a JSON file."""
@ -99,7 +102,7 @@ class TelegramChatApiLoader(BaseLoader):
data = [] data = []
async with TelegramClient(self.username, self.api_id, self.api_hash) as client: async with TelegramClient(self.username, self.api_id, self.api_hash) as client:
async for message in client.iter_messages(self.chat_url): async for message in client.iter_messages(self.chat_entity):
is_reply = message.reply_to is not None is_reply = message.reply_to is not None
reply_to_id = message.reply_to.reply_to_msg_id if is_reply else None reply_to_id = message.reply_to.reply_to_msg_id if is_reply else None
data.append( data.append(
@ -113,11 +116,9 @@ class TelegramChatApiLoader(BaseLoader):
} }
) )
with open("telegram_data.json", "w", encoding="utf-8") as f: with open(self.file_path, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=4) json.dump(data, f, ensure_ascii=False, indent=4)
self.file_path = "telegram_data.json"
def _get_message_threads(self, data: pd.DataFrame) -> dict: def _get_message_threads(self, data: pd.DataFrame) -> dict:
"""Create a dictionary of message threads from the given data. """Create a dictionary of message threads from the given data.
@ -160,10 +161,10 @@ class TelegramChatApiLoader(BaseLoader):
return all_replies return all_replies
# Filter out parent messages # Filter out parent messages
parent_messages = data[data["is_reply"] is False] parent_messages = data[~data["is_reply"]]
# Filter out reply messages and drop rows with NaN in 'reply_to_id' # Filter out reply messages and drop rows with NaN in 'reply_to_id'
reply_messages = data[data["is_reply"] is True].dropna(subset=["reply_to_id"]) reply_messages = data[data["is_reply"]].dropna(subset=["reply_to_id"])
# Convert 'reply_to_id' to integer # Convert 'reply_to_id' to integer
reply_messages["reply_to_id"] = reply_messages["reply_to_id"].astype(int) reply_messages["reply_to_id"] = reply_messages["reply_to_id"].astype(int)
@ -217,24 +218,32 @@ class TelegramChatApiLoader(BaseLoader):
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load documents.""" """Load documents."""
if self.chat_url is not None:
if self.chat_entity is not None:
try: try:
import nest_asyncio import nest_asyncio
import pandas as pd
nest_asyncio.apply() nest_asyncio.apply()
asyncio.run(self.fetch_data_from_telegram()) asyncio.run(self.fetch_data_from_telegram())
except ImportError: except ImportError:
raise ValueError( raise ValueError(
"please install with `pip install nest_asyncio`,\ """`nest_asyncio` package not found.
`pip install nest_asyncio` " please install with `pip install nest_asyncio`
"""
) )
p = Path(self.file_path) p = Path(self.file_path)
with open(p, encoding="utf8") as f: with open(p, encoding="utf8") as f:
d = json.load(f) d = json.load(f)
try:
import pandas as pd
except ImportError:
raise ValueError(
"""`pandas` package not found.
please install with `pip install pandas`
"""
)
normalized_messages = pd.json_normalize(d) normalized_messages = pd.json_normalize(d)
df = pd.DataFrame(normalized_messages) df = pd.DataFrame(normalized_messages)

35
poetry.lock generated

@ -6311,6 +6311,17 @@ files = [
{file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
] ]
[[package]]
name = "pyaes"
version = "1.6.1"
description = "Pure-Python Implementation of the AES block-cipher and common modes of operation"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "pyaes-1.6.1.tar.gz", hash = "sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f"},
]
[[package]] [[package]]
name = "pyarrow" name = "pyarrow"
version = "12.0.0" version = "12.0.0"
@ -6595,7 +6606,6 @@ files = [
{file = "pylance-0.4.12-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2b86fb8dccc03094c0db37bef0d91bda60e8eb0d1eddf245c6971450c8d8a53f"}, {file = "pylance-0.4.12-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2b86fb8dccc03094c0db37bef0d91bda60e8eb0d1eddf245c6971450c8d8a53f"},
{file = "pylance-0.4.12-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bc82914b13204187d673b5f3d45f93219c38a0e9d0542ba251074f639669789"}, {file = "pylance-0.4.12-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bc82914b13204187d673b5f3d45f93219c38a0e9d0542ba251074f639669789"},
{file = "pylance-0.4.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a4bcce77f99ecd4cbebbadb01e58d5d8138d40eb56bdcdbc3b20b0475e7a472"}, {file = "pylance-0.4.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a4bcce77f99ecd4cbebbadb01e58d5d8138d40eb56bdcdbc3b20b0475e7a472"},
{file = "pylance-0.4.12-cp38-abi3-win_amd64.whl", hash = "sha256:9616931c5300030adb9626d22515710a127d1e46a46737a7a0f980b52f13627c"},
] ]
[package.dependencies] [package.dependencies]
@ -8604,6 +8614,25 @@ files = [
[package.dependencies] [package.dependencies]
redis = ">=4.4.4" redis = ">=4.4.4"
[[package]]
name = "telethon"
version = "1.28.5"
description = "Full-featured Telegram client library for Python 3"
category = "main"
optional = true
python-versions = ">=3.5"
files = [
{file = "Telethon-1.28.5-py3-none-any.whl", hash = "sha256:edc42fd58b8e1569830d3ead564cafa60fd51d684f03ee2a1fdd5f77a5a10438"},
{file = "Telethon-1.28.5.tar.gz", hash = "sha256:b3990ec22351a3f3e1af376729c985025bbdd3bdabdde8c156112c3d3dfe1941"},
]
[package.dependencies]
pyaes = "*"
rsa = "*"
[package.extras]
cryptg = ["cryptg"]
[[package]] [[package]]
name = "tenacity" name = "tenacity"
version = "8.2.2" version = "8.2.2"
@ -10262,7 +10291,7 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api
azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
cohere = ["cohere"] cohere = ["cohere"]
embeddings = ["sentence-transformers"] embeddings = ["sentence-transformers"]
extended-testing = ["jq", "lxml", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "tqdm"] extended-testing = ["jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm"]
hnswlib = ["docarray", "hnswlib", "protobuf"] hnswlib = ["docarray", "hnswlib", "protobuf"]
in-memory-store = ["docarray"] in-memory-store = ["docarray"]
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
@ -10272,4 +10301,4 @@ qdrant = ["qdrant-client"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.8.1,<4.0" python-versions = ">=3.8.1,<4.0"
content-hash = "a41fec88e32b74b859208dd79de58c096466aeda631c17048e6b903100f7ac70" content-hash = "b4cc0a605ec9b6ee8752f7d708a5700143815d32f699461ce6470ca44b62701a"

@ -86,6 +86,8 @@ lxml = {version = "^4.9.2", optional = true}
pymupdf = {version = "^1.22.3", optional = true} pymupdf = {version = "^1.22.3", optional = true}
pypdfium2 = {version = "^4.10.0", optional = true} pypdfium2 = {version = "^4.10.0", optional = true}
gql = {version = "^3.4.1", optional = true} gql = {version = "^3.4.1", optional = true}
pandas = {version = "^2.0.1", optional = true}
telethon = {version = "^1.28.5", optional = true}
[tool.poetry.group.docs.dependencies] [tool.poetry.group.docs.dependencies]
@ -190,6 +192,8 @@ extended_testing = [
"pypdfium2", "pypdfium2",
"tqdm", "tqdm",
"lxml", "lxml",
"pandas",
"telethon",
] ]
[tool.ruff] [tool.ruff]

@ -1,18 +0,0 @@
from pathlib import Path
from langchain.document_loaders import TelegramChatFileLoader
def test_telegram_chat_file_loader() -> None:
"""Test TelegramChatLoader."""
file_path = Path(__file__).parent.parent / "examples/telegram.json"
loader = TelegramChatFileLoader(str(file_path))
docs = loader.load()
assert len(docs) == 1
assert docs[0].metadata["source"] == str(file_path)
assert docs[0].page_content == (
"Henry on 2020-01-01T00:00:02: It's 2020...\n\n"
"Henry on 2020-01-01T00:00:04: Fireworks!\n\n"
"Grace 🧤 ðŸ\x8d on 2020-01-01T00:00:05: You're a minute late!\n\n"
)

@ -0,0 +1,34 @@
[
{
"sender_id": -1111111,
"text": "Hello, world!",
"date": "2023-05-15T19:30:49+00:00",
"message.id": 1785,
"is_reply": false,
"reply_to_id": null
},
{
"sender_id": -1111111,
"text": "Telegram is the best!",
"date": "2023-05-08T20:17:10+00:00",
"message.id": 1784,
"is_reply": true,
"reply_to_id": 1783
},
{
"sender_id": -1111111,
"text": "Langchain is great.",
"date": "2023-05-03T23:43:33+00:00",
"message.id": 1783,
"is_reply": true,
"reply_to_id": 1782
},
{
"sender_id": -1111111,
"text": "LLMs are awesome!",
"date": "2023-05-03T15:32:25+00:00",
"message.id": 1782,
"is_reply": false,
"reply_to_id": null
}
]

@ -0,0 +1,36 @@
from pathlib import Path
import pytest
from langchain.document_loaders import TelegramChatApiLoader, TelegramChatFileLoader
def test_telegram_chat_file_loader() -> None:
"""Test TelegramChatFileLoader."""
file_path = Path(__file__).parent / "test_docs" / "telegram.json"
loader = TelegramChatFileLoader(str(file_path))
docs = loader.load()
assert len(docs) == 1
assert docs[0].metadata["source"] == str(file_path)
assert docs[0].page_content == (
"Henry on 2020-01-01T00:00:02: It's 2020...\n\n"
"Henry on 2020-01-01T00:00:04: Fireworks!\n\n"
"Grace 🧤 ðŸ\x8d on 2020-01-01T00:00:05: You're a minute late!\n\n"
)
@pytest.mark.requires("telethon", "pandas")
def test_telegram_channel_loader_parsing() -> None:
"""Test TelegramChatApiLoader."""
file_path = Path(__file__).parent / "test_docs" / "telegram_channel.json"
# if we don't provide any value, it will skip fetching from telegram
# and will check the parsing logic.
loader = TelegramChatApiLoader(file_path=str(file_path))
docs = loader.load()
assert len(docs) == 1
print(docs[0].page_content)
assert docs[0].page_content == (
"Hello, world!.\nLLMs are awesome! Langchain is great. Telegram is the best!."
)
Loading…
Cancel
Save