From 32d794f5a3dac334d258f11a1b4b7d727a3549a4 Mon Sep 17 00:00:00 2001 From: Ali Orozgani <40970649+YaySushi@users.noreply.github.com> Date: Tue, 28 Nov 2023 15:45:43 -0500 Subject: [PATCH] =?UTF-8?q?iMessage=20loader:=20implement=20message=20cont?= =?UTF-8?q?ent=20extraction=20from=20attributed=E2=80=A6=20(#13634)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - **Description:** We are adding functionality to extract message content from the `attributedBody` field of the database, in case the content is not in the `text` field. - **Issue:** Closes #13326 and #10680 - **Dependencies:** None. - **Tag maintainer:** @eyurtsev, @hwchase17 --------- Co-authored-by: onotate --- .../langchain/chat_loaders/imessage.py | 60 ++++++++++++++---- .../chat_loaders/data/imessage_chat.db | Bin 0 -> 49152 bytes .../unit_tests/chat_loaders/test_imessage.py | 28 ++++++++ 3 files changed, 76 insertions(+), 12 deletions(-) create mode 100644 libs/langchain/tests/unit_tests/chat_loaders/data/imessage_chat.db create mode 100644 libs/langchain/tests/unit_tests/chat_loaders/test_imessage.py diff --git a/libs/langchain/langchain/chat_loaders/imessage.py b/libs/langchain/langchain/chat_loaders/imessage.py index 093d9b15ed..53f32de92c 100644 --- a/libs/langchain/langchain/chat_loaders/imessage.py +++ b/libs/langchain/langchain/chat_loaders/imessage.py @@ -46,6 +46,36 @@ class IMessageChatLoader(BaseChatLoader): "Please install it with `pip install pysqlite3`" ) from e + def _parse_attributedBody(self, attributedBody: bytes) -> str: + """ + Parse the attributedBody field of the message table + for the text content of the message. + + The attributedBody field is a binary blob that contains + the message content after the byte string b"NSString": + + 5 bytes 1-3 bytes `len` bytes + ... | b"NSString" | preamble | `len` | contents | ... + + The 5 preamble bytes are always b"\x01\x94\x84\x01+" + + The size of `len` is either 1 byte or 3 bytes: + - If the first byte in `len` is b"\x81" then `len` is 3 bytes long. + So the message length is the 2 bytes after, in little Endian. + - Otherwise, the size of `len` is 1 byte, and the message length is + that byte. + + Args: + attributedBody (bytes): attributedBody field of the message table. + Return: + str: Text content of the message. + """ + content = attributedBody.split(b"NSString")[1][5:] + length, start = content[0], 1 + if content[0] == 129: + length, start = int.from_bytes(content[1:3], "little"), 3 + return content[start : start + length].decode("utf-8", errors="ignore") + def _load_single_chat_session( self, cursor: "sqlite3.Cursor", chat_id: int ) -> ChatSession: @@ -62,7 +92,7 @@ class IMessageChatLoader(BaseChatLoader): results: List[HumanMessage] = [] query = """ - SELECT message.date, handle.id, message.text + SELECT message.date, handle.id, message.text, message.attributedBody FROM message JOIN chat_message_join ON message.ROWID = chat_message_join.message_id JOIN handle ON message.handle_id = handle.ROWID @@ -72,18 +102,24 @@ class IMessageChatLoader(BaseChatLoader): cursor.execute(query, (chat_id,)) messages = cursor.fetchall() - for date, sender, text in messages: - if text: # Skip empty messages - results.append( - HumanMessage( - role=sender, - content=text, - additional_kwargs={ - "message_time": date, - "sender": sender, - }, - ) + for date, sender, text, attributedBody in messages: + if text: + content = text + elif attributedBody: + content = self._parse_attributedBody(attributedBody) + else: # Skip messages with no content + continue + + results.append( + HumanMessage( + role=sender, + content=content, + additional_kwargs={ + "message_time": date, + "sender": sender, + }, ) + ) return ChatSession(messages=results) diff --git a/libs/langchain/tests/unit_tests/chat_loaders/data/imessage_chat.db b/libs/langchain/tests/unit_tests/chat_loaders/data/imessage_chat.db new file mode 100644 index 0000000000000000000000000000000000000000..cd8ab0800feebe8e875aa051f7a99942ce4e3016 GIT binary patch literal 49152 zcmeHQ4Q$-jedmeN>GMR}Nv(OF8zya&I2IG~$m1Qee3)MsE3;3sB{j*C;!54oI@Whb zg~yAdEL~*l>=wfycS+VI>DIK{Fk}S=*&8r)S%+rBu4|)QKEC(+|K9(7@V!5$_wTh_lRjuWCBvn8ViOUG5W8ub zAc#@;9f9A*;le+?jS~nXE&q4=ca*sCt~+DYPlyF>hu+Wn1O*E)1v#ba_UnXJlosPG56g{5ACT zLxa~elCbjE3{GTb^gHweL*sSnmTNm7|9D^j*!Jy_KPA1#*Kex_(RGiiU!<=;FnQO6 zM%PoNrw&Z)lMmcO|C)XeEzizOPGsP&eR^hQoSv^*d3r{_dxoCPOzfZ4>CEH|otfRc zcbs<32VE~}oOaBGqLDM332J&;p4~e`v*UEdbRMvBW;SP*UDK({7mRYgXl5blMiyG{ zsLp-L%r%nE+10Y^EY@!{T-UMYs;-$=?EE6F?44A`=?6@wV%g<(X}OCF7s^#5)tvW~ z^=H{;(Z^}iacrku`gsG)YrPEPG&;*VrqL``{Zig6T98Sz<+@eLo_CdQ0RiF`K%~)9&<#&+ z#b!fPWwGL#DAQiO>5#h$kf>c$!2me(kh4m)=vLZE0GEdFegga19`i3-zOY z4m=?R=%N4D-PS%R!dAUCz8n z{~2xL>ukziB&Q9rid}Wu8%m>6v2zyESwkg;V$lZB=Bf=N>m#~gEEa7e zp9Sq1b;}8bgE~_IzC!IN=K;w@-z@ugw0Wpwl%Z8C+CZ)jfOfvgEYG`zb{M->$wXRM zmzRzFY9d%Il?-PwYn2b$^Wh;y2oJWMVN&*3(fbHZ_ljE0d?GiM**Vwz>(R=Jbr+yb)CS=+JZQSoi3^>OD%f}w9T(tv&**&9?>J%;!; zt7$y_4ufx->P=zx{;FlV?HiK3mCu%K*E+a})MhPM9n3ybI#icnPp>!5t^+l-y}N@ZWg*7)sYz9>-E0#^;W*wA>K*v+(kymZr|1kU%yP*xF7cJ>Iops z`NAX&yav$VD+a$2>N^Dd-~%Io5x@vw1TX>^0gM1f03(1AzzARjFaj8Xu0!CXYCpkm z-a!y9OjepD!z%8cM}Ke3*(FuB_W7?-hM-&#>Ma6(@PQG)2w(&-0vG{|07d{KfDyn5 zU<5D%7y*pH2M&QgG7?c$fBb)rpw4~Z3Wy7Y5x@vw1TX>^0gM1f03(1AzzARjFaj8X z%LM_tm){X-vjCv+`d>xs|9hw(5Y!v+2Ok&#i~vReBY+XW2w(&-0vG{|07d{KfDyn5 zbT0z29&$Tbcfvsa_rg$w+!pcu0Hph%R3!BpD%%M4GwMgwYrR+ZXx&>>obz8i0?T)= zoZWvfF$*3gz$fHUH!`q8njMHnql1G3g9C8XgMk6UJB1Kkg~Rhf0pXG8KrHq#a7f|B zKnZ*iBn_7oB`L9zE;Gr57-xjEC^3>Ak27pqWx1rLNPJ3DdxtA7IHxHgpGWy4uadRM z?pm$3B{MBI9l}k6%Z>6pQL7DPrYAiQa>TJmK?qL$sL12B$gaMr$;rt(j@NpuiAW!c z9?48=mggM`oC*;jVnrUUeg5-1ve`ou`|8_hrogdF^FbN#PIA1SaHd+CGaYzle`fmQ zMzLz51a7E3`6O}t`0=5}G@O2ObB->)wo50TNN8#@Aux$J6kZV08Y3%;#ORt9S6N;V z)2ZYT@#9;+wmd@&5pVvMd*{(+I0x>%DlW^u6zL&*r;So>x>|8J0sgVg9~$#10**fY z1KEJqEM~iX^qN8#pPSQnI z78oHRBpE>#Bu3^ql~LrR!tt^im-O`OPyf!+#^3N1e&xob(Ydexp{~%s{==t;(DYsF zT{%11tjx>3t(DnRYhBBr-_H83SLRv7f(@0Km6JM`7UB%gb4cl|%t%Qo!H5Zs)mTYQ z$)XaTZetnSrCr;m?Srma_LUmz2Bix68&5%H{_!o}czC%PhNk2MpNhx1xW@1b%Y#K# zIYvo}2`0^^MUhYF3dCio$w?Oe`49znss z)gtZFVtY5xg8$WfpKj8jL67ig!6nl|Lf4phnu7*MMP`^|IfheJp4WImg7$X#zLm54 znzT6ja4Ri5gV1aiE^Z%CH(;ZENCe13c(fS2%Qn2D;FDmm8%S{N=qLUfk>J|ozYRCT z&`7X+c;)N_D0qk8n|r9W!4KR3)EoSE4FJ!oA}hu@M&MYlHzzVFXbTu<>7kn==m{~Y zSyKhOY|>(QiXLbr>Far*8}xjYFhhpdu7qCr*;xj^EFO(gT_D1{?T{C(>NdJv=pZc3>Vi$MgU7Y zeBu=9><_JcPlR9)hHYs5|JcY!2AKB+S>i707`qDewZCeIv0qCv)Zxjp0}| zB_$O}lzD+X^I0US3OcXq36>F*B#6pmqV>zbBeX za_MwhX40|>#gftjqo@gq(L^B~XLThJPwM2E+mWa&NeU}TECV47pY=2#pvnmbUeL%~ zg4214Rmd~9AyJ7>Yw@(GGOU+ZO@@)S6z3UHjwdxKE-QkYi%1mSiOHg>vM|T4z`#$E(~OkXL`F)a6D-g3 zteoJuMZmmykgPpYcRhJ;h3Fn$a{3@mq&CV1mXmnw*sRbUKyh5=nCP1y3}<%M)^S(i3Gx zm}djl#Z?|Spm8kZRfDX@LA`v66}YrYuD*=c|Bq372(r~%_oy?}3)DYS&rx5c zPEyZMU!tC%K1Y3)TB07L7N`RCN$PHD3dHe&5x@vw1TX>^0gM1f03(1AzzARjFaj8X z%MyX;7)cJoTvc?d7k(!CAul{0z0wPhMX&I}-O(*xczblS7v2`#?1e8yhrIB`=%5$A5FPNs=cD~zxIY^8!l`JV7fwcdy)XlpQF{BL zJzh8wjd;IFm*+Y^<*lCm0WHu>C9IP+!$NyIovA-Q&9sFkhPoh+Br042JrvZnTJp#-3 z!e)`R{eF$@V4-d953&F$y!L)TH@&02DX1H^fxZ3mpMUqCiJ{(8FaKWZ=Sz(+wC#Si z!Ig6}P5my9w(jQ)yxs3Y`@I9KLkds7TQWBN`qPgt(Ph~8eb{!~LVb7drY_k2_2$>3 zFaP_}5b^fuZ-0`9jh%iNnx=y+bX(4?oSR-_!%!<5!r0v#BoE=SVN<058@%Z`*z&#L zZ7B7(0(J!#p8ft0J+B^L``7>33|x5A3q#|=@;xi(4uGCN;(PTVS~q&u-8wY9J2XPi z7{Dx_zhfJ@Y`MU!9v5JxWX62RrG34Gr}E*JRDEp^cSFs8_B+qag}3o@d3NO-a>#R) zzm2!CM?W;5EkTwdPtX<9q$`#ao^A)5Y5JP&hRynK{?)UuZC(nk-qvC(=a4rZsDMXvhVfJ!(c>^JcP#x)pOHQEgyugTeWPsCb(y*EIkSBN6}cM=fUAt#dGs>*en)(;~;kfeLgqKe!oeZ28}|aPmql;vHoun zQQ|Qo_7B5{2Y$Pc>iG*exwJ=ZJ9sGo8z*hBameFXU`AiS0ZEX(OnyrPgNOonjH3&t zW70-GA0E%Qwts+6-7c_vFdF^Rvsc5Y^ttr0|JoK_&n3v#soGs9PVR2DfiKl^es49#R7Q%;<` z{XSy$=&)yeYK5=|MoNGkhnI#0P4?`))gl9&F6U1NM1~gvw zM}Ws0UK~Rk0S4I(`;L`=?4u`6-Uf!?i{Vv>BmZ>C zjeWD4zsKgLKCcQT>W1od8o0Q~l6$yUy#m9c{k832U-Dh#w-kXxa2(qvFkT zhu0e;Cl8r^zp4v-FZZ7M-=F-)rtl}UgZM40-GAcL0C2#6saspO1na^R+MnowdNL3h zII`zL-_Hr+X03zTw`HdHRb6AQXg0rYlW%@TC(y@rsAdZFZ|t;wZ|8U6dppdQT*~sl$7t!2#-uvF$sMQ*St$ zANJ|@&zpaftsnLQxwnwz+2HFv`irzYU?DmXFA%rZf4B+T8~9vPAbZn);#3qg@6WRy zZN5MYOY=eUu-CMThS%tznO5)XS0Szx>eU{kZ}6=~@A+2a{q_m3-c#S2e)V+tZBG!B F^M4(2XAuAZ literal 0 HcmV?d00001 diff --git a/libs/langchain/tests/unit_tests/chat_loaders/test_imessage.py b/libs/langchain/tests/unit_tests/chat_loaders/test_imessage.py new file mode 100644 index 0000000000..03a6c9f66a --- /dev/null +++ b/libs/langchain/tests/unit_tests/chat_loaders/test_imessage.py @@ -0,0 +1,28 @@ +import pathlib + +from langchain.chat_loaders import imessage, utils + + +def test_imessage_chat_loader() -> None: + chat_path = pathlib.Path(__file__).parent / "data" / "imessage_chat.db" + loader = imessage.IMessageChatLoader(str(chat_path)) + + chat_sessions = list( + utils.map_ai_messages(loader.lazy_load(), sender="testemail@gmail.com") + ) + assert chat_sessions, "Chat sessions should not be empty" + + assert chat_sessions[0]["messages"], "Chat messages should not be empty" + + # message content in text field + assert "Yeh" in chat_sessions[0]["messages"][0].content, "Chat content mismatch" + + # short message content in attributedBody field + assert ( + "John is the almighty" in chat_sessions[0]["messages"][16].content + ), "Chat content mismatch" + + # long message content in attributedBody field + long_msg = "aaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbba" + "aaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbbbbaaaaabbb" + assert long_msg in chat_sessions[0]["messages"][18].content, "Chat content mismatch"