mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
iMessage loader: implement message content extraction from attributedBody (#13634)
- **Description:** Adds functionality to extract message content from the `attributedBody` field of the database when the content is not in the `text` field.
- **Issue:** Closes #13326 and #10680
- **Dependencies:** None.
- **Tag maintainer:** @eyurtsev, @hwchase17

---------

Co-authored-by: onotate <johnp.pham@mail.utoronto.ca>
This commit is contained in:
parent
e5256bcb69
commit
32d794f5a3
@ -46,6 +46,36 @@ class IMessageChatLoader(BaseChatLoader):
|
||||
"Please install it with `pip install pysqlite3`"
|
||||
) from e
|
||||
|
||||
def _parse_attributedBody(self, attributedBody: bytes) -> str:
    r"""
    Parse the attributedBody field of the message table
    for the text content of the message.

    The attributedBody field is a binary blob that contains
    the message content after the byte string b"NSString":

                          5 bytes      1-3 bytes    `len` bytes
    ... | b"NSString" |   preamble   |   `len`   |    contents    | ...

    The 5 preamble bytes are always b"\x01\x94\x84\x01+"

    The size of `len` is either 1 byte or 3 bytes:
    - If the first byte in `len` is b"\x81" then `len` is 3 bytes long.
      So the message length is the 2 bytes after, in little Endian.
    - Otherwise, the size of `len` is 1 byte, and the message length is
      that byte.

    Only the first occurrence of b"NSString" is treated as the marker, so
    a message whose own text contains "NSString" is returned in full.

    Args:
        attributedBody (bytes): attributedBody field of the message table.
    Return:
        str: Text content of the message. An empty string is returned when
            the blob has no b"NSString" marker or nothing follows the
            preamble.
    """
    preamble_size = 5
    # partition() splits on the first marker only; split()[1] would stop at
    # any later b"NSString" that happens to be part of the message text.
    _, marker, tail = attributedBody.partition(b"NSString")
    payload = tail[preamble_size:]
    if not marker or not payload:
        # Unrecognised or truncated blob: there is no text to extract.
        return ""

    if payload[0] == 0x81:
        # 3-byte length field: 0x81 followed by a 2-byte little-endian size.
        length, start = int.from_bytes(payload[1:3], "little"), 3
    else:
        # 1-byte length field: the byte itself is the size.
        length, start = payload[0], 1
    return payload[start : start + length].decode("utf-8", errors="ignore")
|
||||
|
||||
def _load_single_chat_session(
|
||||
self, cursor: "sqlite3.Cursor", chat_id: int
|
||||
) -> ChatSession:
|
||||
@ -62,7 +92,7 @@ class IMessageChatLoader(BaseChatLoader):
|
||||
results: List[HumanMessage] = []
|
||||
|
||||
query = """
|
||||
SELECT message.date, handle.id, message.text
|
||||
SELECT message.date, handle.id, message.text, message.attributedBody
|
||||
FROM message
|
||||
JOIN chat_message_join ON message.ROWID = chat_message_join.message_id
|
||||
JOIN handle ON message.handle_id = handle.ROWID
|
||||
@ -72,18 +102,24 @@ class IMessageChatLoader(BaseChatLoader):
|
||||
cursor.execute(query, (chat_id,))
|
||||
messages = cursor.fetchall()
|
||||
|
||||
for date, sender, text in messages:
|
||||
if text: # Skip empty messages
|
||||
results.append(
|
||||
HumanMessage(
|
||||
role=sender,
|
||||
content=text,
|
||||
additional_kwargs={
|
||||
"message_time": date,
|
||||
"sender": sender,
|
||||
},
|
||||
)
|
||||
for date, sender, text, attributedBody in messages:
|
||||
if text:
|
||||
content = text
|
||||
elif attributedBody:
|
||||
content = self._parse_attributedBody(attributedBody)
|
||||
else: # Skip messages with no content
|
||||
continue
|
||||
|
||||
results.append(
|
||||
HumanMessage(
|
||||
role=sender,
|
||||
content=content,
|
||||
additional_kwargs={
|
||||
"message_time": date,
|
||||
"sender": sender,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
return ChatSession(messages=results)
|
||||
|
||||
|
Binary file not shown.
@ -0,0 +1,28 @@
|
||||
import pathlib
|
||||
|
||||
from langchain.chat_loaders import imessage, utils
|
||||
|
||||
|
||||
def test_imessage_chat_loader() -> None:
    """Load the bundled iMessage fixture and check the extracted message text.

    Checks one message whose content is in the `text` column, one short
    message stored only in `attributedBody`, and one long message stored
    only in `attributedBody`.
    """
    chat_path = pathlib.Path(__file__).parent / "data" / "imessage_chat.db"
    loader = imessage.IMessageChatLoader(str(chat_path))

    chat_sessions = list(
        utils.map_ai_messages(loader.lazy_load(), sender="testemail@gmail.com")
    )
    assert chat_sessions, "Chat sessions should not be empty"

    assert chat_sessions[0]["messages"], "Chat messages should not be empty"

    # message content in text field
    assert "Yeh" in chat_sessions[0]["messages"][0].content, "Chat content mismatch"

    # short message content in attributedBody field
    assert (
        "John is the almighty" in chat_sessions[0]["messages"][16].content
    ), "Chat content mismatch"

    # long message content in attributedBody field
    # The parentheses make the two literals concatenate; without them the
    # second literal is a separate no-op statement and only the first 71
    # characters would be checked.
    # NOTE(review): assumes the fixture message holds the full 128-character
    # text — confirm against imessage_chat.db.
    long_msg = (
        "aaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbba"
        "aaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbb"
    )
    assert long_msg in chat_sessions[0]["messages"][18].content, "Chat content mismatch"
|
Loading…
Reference in New Issue
Block a user