diff --git a/docs/modules/indexes/document_loaders/examples/telegram.ipynb b/docs/modules/indexes/document_loaders/examples/telegram.ipynb index bf54fc97..9e26cd81 100644 --- a/docs/modules/indexes/document_loaders/examples/telegram.ipynb +++ b/docs/modules/indexes/document_loaders/examples/telegram.ipynb @@ -59,9 +59,11 @@ "id": "3e64cac2", "metadata": {}, "source": [ - "`TelegramChatApiLoader` loads data directly from any specified channel from Telegram. In order to export the data, you will need to authenticate your Telegram account. \n", + "`TelegramChatApiLoader` loads data directly from any specified chat from Telegram. In order to export the data, you will need to authenticate your Telegram account. \n", "\n", "You can get the API_HASH and API_ID from https://my.telegram.org/auth?to=apps\n", + "\n", + "chat_entity – recommended to be the [entity](https://docs.telethon.dev/en/stable/concepts/entities.html?highlight=Entity#what-is-an-entity) of a channel.\n", "\n" ] }, @@ -72,10 +74,12 @@ "metadata": {}, "outputs": [], "source": [ - "loader = TelegramChatApiLoader(user_name =\"\"\\\n", - " chat_url=\"\",\\\n", - " api_hash=\"\",\\\n", - " api_id=\"\")" + "loader = TelegramChatApiLoader(\n", + " chat_entity=\"\", # recommended to use Entity here\n", + " api_hash=\"\", \n", + " api_id=\"\", \n", + " user_name =\"\", # needed only for caching the session.\n", + ")" ] }, { @@ -113,10 +117,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" - - } }, "nbformat": 4, diff --git a/langchain/document_loaders/telegram.py b/langchain/document_loaders/telegram.py index 6b9b8921..795e1879 100644 --- a/langchain/document_loaders/telegram.py +++ b/langchain/document_loaders/telegram.py @@ -12,6 +12,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter if TYPE_CHECKING: import pandas as pd + from telethon.hints import EntityLike def concatenate_rows(row: dict) -> str: @@ -82,16 +83,18 @@ class TelegramChatApiLoader(BaseLoader): def __init__( self, - chat_url: Optional[str] = None, + chat_entity: Optional[EntityLike] = None, api_id: Optional[int] = None, api_hash: Optional[str] = None, username: Optional[str] = None, + file_path: str = "telegram_data.json", ): """Initialize with API parameters.""" - self.chat_url = chat_url + self.chat_entity = chat_entity self.api_id = api_id self.api_hash = api_hash self.username = username + self.file_path = file_path async def fetch_data_from_telegram(self) -> None: """Fetch data from Telegram API and save it as a JSON file.""" @@ -99,7 +102,7 @@ class TelegramChatApiLoader(BaseLoader): data = [] async with TelegramClient(self.username, self.api_id, self.api_hash) as client: - async for message in client.iter_messages(self.chat_url): + async for message in client.iter_messages(self.chat_entity): is_reply = message.reply_to is not None reply_to_id = message.reply_to.reply_to_msg_id if is_reply else None data.append( @@ -113,11 +116,9 @@ class TelegramChatApiLoader(BaseLoader): } ) - with open("telegram_data.json", "w", encoding="utf-8") as f: + with open(self.file_path, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=4) - self.file_path = "telegram_data.json" - def _get_message_threads(self, data: pd.DataFrame) -> dict: """Create a dictionary of message threads from the given data. @@ -160,10 +161,10 @@ class TelegramChatApiLoader(BaseLoader): return all_replies # Filter out parent messages - parent_messages = data[data["is_reply"] is False] + parent_messages = data[~data["is_reply"]] # Filter out reply messages and drop rows with NaN in 'reply_to_id' - reply_messages = data[data["is_reply"] is True].dropna(subset=["reply_to_id"]) + reply_messages = data[data["is_reply"]].dropna(subset=["reply_to_id"]) # Convert 'reply_to_id' to integer reply_messages["reply_to_id"] = reply_messages["reply_to_id"].astype(int) @@ -217,24 +218,32 @@ class TelegramChatApiLoader(BaseLoader): def load(self) -> List[Document]: """Load documents.""" - if self.chat_url is not None: + + if self.chat_entity is not None: try: import nest_asyncio - import pandas as pd nest_asyncio.apply() asyncio.run(self.fetch_data_from_telegram()) except ImportError: raise ValueError( - "please install with `pip install nest_asyncio`,\ - `pip install nest_asyncio` " + """`nest_asyncio` package not found. + please install with `pip install nest_asyncio` + """ ) p = Path(self.file_path) with open(p, encoding="utf8") as f: d = json.load(f) - + try: + import pandas as pd + except ImportError: + raise ValueError( + """`pandas` package not found. + please install with `pip install pandas` + """ + ) normalized_messages = pd.json_normalize(d) df = pd.DataFrame(normalized_messages) diff --git a/poetry.lock b/poetry.lock index 405486c0..9040804c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6311,6 +6311,17 @@ files = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] +[[package]] +name = "pyaes" +version = "1.6.1" +description = "Pure-Python Implementation of the AES block-cipher and common modes of operation" +category = "main" +optional = true +python-versions = "*" +files = [ + {file = "pyaes-1.6.1.tar.gz", hash = "sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f"}, +] + [[package]] name = "pyarrow" version = "12.0.0" @@ -6595,7 +6606,6 @@ files = [ {file = "pylance-0.4.12-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2b86fb8dccc03094c0db37bef0d91bda60e8eb0d1eddf245c6971450c8d8a53f"}, {file = "pylance-0.4.12-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bc82914b13204187d673b5f3d45f93219c38a0e9d0542ba251074f639669789"}, {file = "pylance-0.4.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a4bcce77f99ecd4cbebbadb01e58d5d8138d40eb56bdcdbc3b20b0475e7a472"}, - {file = "pylance-0.4.12-cp38-abi3-win_amd64.whl", hash = "sha256:9616931c5300030adb9626d22515710a127d1e46a46737a7a0f980b52f13627c"}, ] [package.dependencies] @@ -8604,6 +8614,25 @@ files = [ [package.dependencies] redis = ">=4.4.4" +[[package]] +name = "telethon" +version = "1.28.5" +description = "Full-featured Telegram client library for Python 3" +category = "main" +optional = true +python-versions = ">=3.5" +files = [ + {file = "Telethon-1.28.5-py3-none-any.whl", hash = "sha256:edc42fd58b8e1569830d3ead564cafa60fd51d684f03ee2a1fdd5f77a5a10438"}, + {file = "Telethon-1.28.5.tar.gz", hash = "sha256:b3990ec22351a3f3e1af376729c985025bbdd3bdabdde8c156112c3d3dfe1941"}, +] + +[package.dependencies] +pyaes = "*" +rsa = "*" + +[package.extras] +cryptg = ["cryptg"] + [[package]] name = "tenacity" version = "8.2.2" @@ -10262,7 +10291,7 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] embeddings = ["sentence-transformers"] -extended-testing = ["jq", "lxml", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "tqdm"] +extended-testing = ["jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm"] hnswlib = ["docarray", "hnswlib", "protobuf"] in-memory-store = ["docarray"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] @@ -10272,4 +10301,4 @@ qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "a41fec88e32b74b859208dd79de58c096466aeda631c17048e6b903100f7ac70" +content-hash = "b4cc0a605ec9b6ee8752f7d708a5700143815d32f699461ce6470ca44b62701a" diff --git a/pyproject.toml b/pyproject.toml index cdaa0121..5deb45e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,8 @@ lxml = {version = "^4.9.2", optional = true} pymupdf = {version = "^1.22.3", optional = true} pypdfium2 = {version = "^4.10.0", optional = true} gql = {version = "^3.4.1", optional = true} +pandas = {version = "^2.0.1", optional = true} +telethon = {version = "^1.28.5", optional = true} [tool.poetry.group.docs.dependencies] @@ -190,6 +192,8 @@ extended_testing = [ "pypdfium2", "tqdm", "lxml", + "pandas", + "telethon", ] [tool.ruff] diff --git a/tests/integration_tests/document_loaders/test_telegram.py b/tests/integration_tests/document_loaders/test_telegram.py deleted file mode 100644 index 5b07abbe..00000000 --- a/tests/integration_tests/document_loaders/test_telegram.py +++ /dev/null @@ -1,18 +0,0 @@ -from pathlib import Path - -from langchain.document_loaders import TelegramChatFileLoader - - -def test_telegram_chat_file_loader() -> None: - """Test TelegramChatLoader.""" - file_path = Path(__file__).parent.parent / "examples/telegram.json" - loader = TelegramChatFileLoader(str(file_path)) - docs = loader.load() - - assert len(docs) == 1 - assert docs[0].metadata["source"] == str(file_path) - assert docs[0].page_content == ( - "Henry on 2020-01-01T00:00:02: It's 2020...\n\n" - "Henry on 2020-01-01T00:00:04: Fireworks!\n\n" - "Grace 🧤 ðŸ\x8d’ on 2020-01-01T00:00:05: You're a minute late!\n\n" - ) diff --git a/tests/unit_tests/document_loader/__init__.py b/tests/unit_tests/document_loaders/__init__.py similarity index 100% rename from tests/unit_tests/document_loader/__init__.py rename to tests/unit_tests/document_loaders/__init__.py diff --git a/tests/unit_tests/document_loader/blob_loaders/__init__.py b/tests/unit_tests/document_loaders/blob_loaders/__init__.py similarity index 100% rename from tests/unit_tests/document_loader/blob_loaders/__init__.py rename to tests/unit_tests/document_loaders/blob_loaders/__init__.py diff --git a/tests/unit_tests/document_loader/blob_loaders/test_filesystem_blob_loader.py b/tests/unit_tests/document_loaders/blob_loaders/test_filesystem_blob_loader.py similarity index 100% rename from tests/unit_tests/document_loader/blob_loaders/test_filesystem_blob_loader.py rename to tests/unit_tests/document_loaders/blob_loaders/test_filesystem_blob_loader.py diff --git a/tests/unit_tests/document_loader/blob_loaders/test_public_api.py b/tests/unit_tests/document_loaders/blob_loaders/test_public_api.py similarity index 100% rename from tests/unit_tests/document_loader/blob_loaders/test_public_api.py rename to tests/unit_tests/document_loaders/blob_loaders/test_public_api.py diff --git a/tests/unit_tests/document_loader/blob_loaders/test_schema.py b/tests/unit_tests/document_loaders/blob_loaders/test_schema.py similarity index 100% rename from tests/unit_tests/document_loader/blob_loaders/test_schema.py rename to tests/unit_tests/document_loaders/blob_loaders/test_schema.py diff --git a/tests/unit_tests/document_loader/loaders/__init__.py b/tests/unit_tests/document_loaders/loaders/__init__.py similarity index 100% rename from tests/unit_tests/document_loader/loaders/__init__.py rename to tests/unit_tests/document_loaders/loaders/__init__.py diff --git a/tests/unit_tests/document_loader/loaders/vendors/__init__.py b/tests/unit_tests/document_loaders/loaders/vendors/__init__.py similarity index 100% rename from tests/unit_tests/document_loader/loaders/vendors/__init__.py rename to tests/unit_tests/document_loaders/loaders/vendors/__init__.py diff --git a/tests/unit_tests/document_loader/loaders/vendors/test_data/docugami-example.xml b/tests/unit_tests/document_loaders/loaders/vendors/test_data/docugami-example.xml similarity index 100% rename from tests/unit_tests/document_loader/loaders/vendors/test_data/docugami-example.xml rename to tests/unit_tests/document_loaders/loaders/vendors/test_data/docugami-example.xml diff --git a/tests/unit_tests/document_loader/loaders/vendors/test_docugami.py b/tests/unit_tests/document_loaders/loaders/vendors/test_docugami.py similarity index 100% rename from tests/unit_tests/document_loader/loaders/vendors/test_docugami.py rename to tests/unit_tests/document_loaders/loaders/vendors/test_docugami.py diff --git a/tests/unit_tests/document_loader/parsers/__init__.py b/tests/unit_tests/document_loaders/parsers/__init__.py similarity index 100% rename from tests/unit_tests/document_loader/parsers/__init__.py rename to tests/unit_tests/document_loaders/parsers/__init__.py diff --git a/tests/unit_tests/document_loader/parsers/test_generic.py b/tests/unit_tests/document_loaders/parsers/test_generic.py similarity index 100% rename from tests/unit_tests/document_loader/parsers/test_generic.py rename to tests/unit_tests/document_loaders/parsers/test_generic.py diff --git a/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py b/tests/unit_tests/document_loaders/parsers/test_pdf_parsers.py similarity index 100% rename from tests/unit_tests/document_loader/parsers/test_pdf_parsers.py rename to tests/unit_tests/document_loaders/parsers/test_pdf_parsers.py diff --git a/tests/unit_tests/document_loader/parsers/test_public_api.py b/tests/unit_tests/document_loaders/parsers/test_public_api.py similarity index 100% rename from tests/unit_tests/document_loader/parsers/test_public_api.py rename to tests/unit_tests/document_loaders/parsers/test_public_api.py diff --git a/tests/unit_tests/document_loader/test_base.py b/tests/unit_tests/document_loaders/test_base.py similarity index 100% rename from tests/unit_tests/document_loader/test_base.py rename to tests/unit_tests/document_loaders/test_base.py diff --git a/tests/unit_tests/document_loader/test_csv_loader.py b/tests/unit_tests/document_loaders/test_csv_loader.py similarity index 100% rename from tests/unit_tests/document_loader/test_csv_loader.py rename to tests/unit_tests/document_loaders/test_csv_loader.py diff --git a/tests/unit_tests/document_loader/test_docs/csv/test_empty.csv b/tests/unit_tests/document_loaders/test_docs/csv/test_empty.csv similarity index 100% rename from tests/unit_tests/document_loader/test_docs/csv/test_empty.csv rename to tests/unit_tests/document_loaders/test_docs/csv/test_empty.csv diff --git a/tests/unit_tests/document_loader/test_docs/csv/test_nominal.csv b/tests/unit_tests/document_loaders/test_docs/csv/test_nominal.csv similarity index 100% rename from tests/unit_tests/document_loader/test_docs/csv/test_nominal.csv rename to tests/unit_tests/document_loaders/test_docs/csv/test_nominal.csv diff --git a/tests/unit_tests/document_loader/test_docs/csv/test_one_col.csv b/tests/unit_tests/document_loaders/test_docs/csv/test_one_col.csv similarity index 100% rename from tests/unit_tests/document_loader/test_docs/csv/test_one_col.csv rename to tests/unit_tests/document_loaders/test_docs/csv/test_one_col.csv diff --git a/tests/unit_tests/document_loader/test_docs/csv/test_one_row.csv b/tests/unit_tests/document_loaders/test_docs/csv/test_one_row.csv similarity index 100% rename from tests/unit_tests/document_loader/test_docs/csv/test_one_row.csv rename to tests/unit_tests/document_loaders/test_docs/csv/test_one_row.csv diff --git a/tests/integration_tests/examples/telegram.json b/tests/unit_tests/document_loaders/test_docs/telegram.json similarity index 100% rename from tests/integration_tests/examples/telegram.json rename to tests/unit_tests/document_loaders/test_docs/telegram.json diff --git a/tests/unit_tests/document_loaders/test_docs/telegram_channel.json b/tests/unit_tests/document_loaders/test_docs/telegram_channel.json new file mode 100644 index 00000000..5a37d052 --- /dev/null +++ b/tests/unit_tests/document_loaders/test_docs/telegram_channel.json @@ -0,0 +1,34 @@ +[ + { + "sender_id": -1111111, + "text": "Hello, world!", + "date": "2023-05-15T19:30:49+00:00", + "message.id": 1785, + "is_reply": false, + "reply_to_id": null + }, + { + "sender_id": -1111111, + "text": "Telegram is the best!", + "date": "2023-05-08T20:17:10+00:00", + "message.id": 1784, + "is_reply": true, + "reply_to_id": 1783 + }, + { + "sender_id": -1111111, + "text": "Langchain is great.", + "date": "2023-05-03T23:43:33+00:00", + "message.id": 1783, + "is_reply": true, + "reply_to_id": 1782 + }, + { + "sender_id": -1111111, + "text": "LLMs are awesome!", + "date": "2023-05-03T15:32:25+00:00", + "message.id": 1782, + "is_reply": false, + "reply_to_id": null + } +] diff --git a/tests/unit_tests/document_loader/test_json_loader.py b/tests/unit_tests/document_loaders/test_json_loader.py similarity index 100% rename from tests/unit_tests/document_loader/test_json_loader.py rename to tests/unit_tests/document_loaders/test_json_loader.py diff --git a/tests/unit_tests/document_loaders/test_telegram.py b/tests/unit_tests/document_loaders/test_telegram.py new file mode 100644 index 00000000..4fbe32a7 --- /dev/null +++ b/tests/unit_tests/document_loaders/test_telegram.py @@ -0,0 +1,36 @@ +from pathlib import Path + +import pytest + +from langchain.document_loaders import TelegramChatApiLoader, TelegramChatFileLoader + + +def test_telegram_chat_file_loader() -> None: + """Test TelegramChatFileLoader.""" + file_path = Path(__file__).parent / "test_docs" / "telegram.json" + loader = TelegramChatFileLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 1 + assert docs[0].metadata["source"] == str(file_path) + assert docs[0].page_content == ( + "Henry on 2020-01-01T00:00:02: It's 2020...\n\n" + "Henry on 2020-01-01T00:00:04: Fireworks!\n\n" + "Grace 🧤 ðŸ\x8d’ on 2020-01-01T00:00:05: You're a minute late!\n\n" + ) + + +@pytest.mark.requires("telethon", "pandas") +def test_telegram_channel_loader_parsing() -> None: + """Test TelegramChatApiLoader.""" + file_path = Path(__file__).parent / "test_docs" / "telegram_channel.json" + # if we don't provide any value, it will skip fetching from telegram + # and will check the parsing logic. + loader = TelegramChatApiLoader(file_path=str(file_path)) + docs = loader.load() + + assert len(docs) == 1 + print(docs[0].page_content) + assert docs[0].page_content == ( + "Hello, world!.\nLLMs are awesome! Langchain is great. Telegram is the best!." + ) diff --git a/tests/unit_tests/document_loader/test_web_base.py b/tests/unit_tests/document_loaders/test_web_base.py similarity index 100% rename from tests/unit_tests/document_loader/test_web_base.py rename to tests/unit_tests/document_loaders/test_web_base.py diff --git a/tests/unit_tests/document_loader/test_youtube.py b/tests/unit_tests/document_loaders/test_youtube.py similarity index 100% rename from tests/unit_tests/document_loader/test_youtube.py rename to tests/unit_tests/document_loaders/test_youtube.py