iMessage loader: implement message content extraction from attributed… (#13634)

- **Description:** This change adds a fallback that extracts message
content from the `attributedBody` field of the database when the
content is not present in the `text` field.
  - **Issue:** Closes #13326 and #10680 
  - **Dependencies:** None.
  - **Tag maintainer:** @eyurtsev, @hwchase17

---------

Co-authored-by: onotate <johnp.pham@mail.utoronto.ca>
This commit is contained in:
Ali Orozgani 2023-11-28 15:45:43 -05:00 committed by GitHub
parent e5256bcb69
commit 32d794f5a3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 76 additions and 12 deletions

View File

@ -46,6 +46,36 @@ class IMessageChatLoader(BaseChatLoader):
"Please install it with `pip install pysqlite3`"
) from e
def _parse_attributedBody(self, attributedBody: bytes) -> str:
    r"""Extract the message text from an ``attributedBody`` blob.

    The attributedBody field of the message table is a binary blob that
    holds the message content right after the byte string b"NSString":

                            5 bytes    1-3 bytes   `len` bytes
        ... | b"NSString" | preamble |   `len`   |  contents  | ...

    The 5 preamble bytes are always b"\x01\x94\x84\x01+".

    The size of `len` is either 1 byte or 3 bytes:
    - If the first byte in `len` is b"\x81" then `len` is 3 bytes long,
      and the message length is the following 2 bytes, little-endian.
    - Otherwise `len` is 1 byte, and the message length is that byte.

    Args:
        attributedBody (bytes): attributedBody field of the message table.

    Returns:
        str: Text content of the message. An empty string is returned when
            the blob has no b"NSString" marker or ends before the length
            byte, instead of raising ``IndexError``.
    """
    # partition() keeps everything after the FIRST marker. split()[1] would
    # stop at a second b"NSString", truncating any message whose own text
    # contains that byte sequence.
    _, marker, tail = attributedBody.partition(b"NSString")
    content = tail[5:]  # skip the fixed 5-byte preamble
    if not marker or not content:
        return ""

    length, start = content[0], 1
    if length == 0x81:
        # 0x81 flags a 2-byte little-endian length in the next two bytes.
        length, start = int.from_bytes(content[1:3], "little"), 3

    # Undecodable bytes are dropped rather than raising.
    return content[start : start + length].decode("utf-8", errors="ignore")
def _load_single_chat_session(
self, cursor: "sqlite3.Cursor", chat_id: int
) -> ChatSession:
@ -62,7 +92,7 @@ class IMessageChatLoader(BaseChatLoader):
results: List[HumanMessage] = []
query = """
SELECT message.date, handle.id, message.text
SELECT message.date, handle.id, message.text, message.attributedBody
FROM message
JOIN chat_message_join ON message.ROWID = chat_message_join.message_id
JOIN handle ON message.handle_id = handle.ROWID
@ -72,18 +102,24 @@ class IMessageChatLoader(BaseChatLoader):
cursor.execute(query, (chat_id,))
messages = cursor.fetchall()
for date, sender, text in messages:
if text: # Skip empty messages
results.append(
HumanMessage(
role=sender,
content=text,
additional_kwargs={
"message_time": date,
"sender": sender,
},
)
for date, sender, text, attributedBody in messages:
if text:
content = text
elif attributedBody:
content = self._parse_attributedBody(attributedBody)
else: # Skip messages with no content
continue
results.append(
HumanMessage(
role=sender,
content=content,
additional_kwargs={
"message_time": date,
"sender": sender,
},
)
)
return ChatSession(messages=results)

View File

@ -0,0 +1,28 @@
import pathlib
from langchain.chat_loaders import imessage, utils
def test_imessage_chat_loader() -> None:
    """Check that the iMessage loader reads content from both DB fields.

    Loads the bundled ``imessage_chat.db`` fixture and verifies one message
    stored in the ``text`` column plus a short and a long message that are
    expected to come from the ``attributedBody`` column.
    """
    chat_path = pathlib.Path(__file__).parent / "data" / "imessage_chat.db"
    loader = imessage.IMessageChatLoader(str(chat_path))

    chat_sessions = list(
        utils.map_ai_messages(loader.lazy_load(), sender="testemail@gmail.com")
    )

    assert chat_sessions, "Chat sessions should not be empty"

    messages = chat_sessions[0]["messages"]
    assert messages, "Chat messages should not be empty"

    # message content in text field
    assert "Yeh" in messages[0].content, "Chat content mismatch"

    # short message content in attributedBody field
    assert "John is the almighty" in messages[16].content, "Chat content mismatch"

    # long message content in attributedBody field
    # The parentheses are required: without them the second literal is a
    # separate no-op statement and only the first half would be asserted.
    long_msg = (
        "aaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbba"
        "aaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbb"
    )
    assert long_msg in messages[18].content, "Chat content mismatch"