@ -1,5 +1,6 @@
""" Util that calls Arxiv. """
""" Util that calls Arxiv. """
import logging
import logging
import os
from typing import Any , Dict , List
from typing import Any , Dict , List
from pydantic import BaseModel , Extra , root_validator
from pydantic import BaseModel , Extra , root_validator
@ -71,21 +72,21 @@ class ArxivAPIWrapper(BaseModel):
It uses only the most informative fields of article meta information .
It uses only the most informative fields of article meta information .
"""
"""
try :
try :
results = self . arxiv_search ( # type: ignore
query [ : self . ARXIV_MAX_QUERY_LENGTH ] , max_results = self . top_k_results
) . results ( )
except self . arxiv_exceptions as ex :
return f " Arxiv exception: { ex } "
docs = [
docs = [
f " Published: { result . updated . date ( ) } \n Title: { result . title } \n "
f " Published: { result . updated . date ( ) } \n Title: { result . title } \n "
f " Authors: { ' , ' . join ( a . name for a in result . authors ) } \n "
f " Authors: { ' , ' . join ( a . name for a in result . authors ) } \n "
f " Summary: { result . summary } "
f " Summary: { result . summary } "
for result in self . arxiv_search ( # type: ignore
for result in results
query [ : self . ARXIV_MAX_QUERY_LENGTH ] , max_results = self . top_k_results
) . results ( )
]
]
return (
if docs :
" \n \n " . join ( docs ) [ : self . doc_content_chars_max ]
return " \n \n " . join ( docs ) [ : self . doc_content_chars_max ]
if docs
else :
else " No good Arxiv Result was found "
return " No good Arxiv Result was found "
)
except self . arxiv_exceptions as ex :
return f " Arxiv exception: { ex } "
def load ( self , query : str ) - > List [ Document ] :
def load ( self , query : str ) - > List [ Document ] :
"""
"""
@ -98,22 +99,30 @@ class ArxivAPIWrapper(BaseModel):
try :
try :
import fitz
import fitz
except ImportError :
except ImportError :
raise Value Error(
raise Import Error(
" PyMuPDF package not found, please install it with "
" PyMuPDF package not found, please install it with "
" `pip install pymupdf` "
" `pip install pymupdf` "
)
)
try :
try :
docs : List [ Document ] = [ ]
results = self . arxiv_search ( # type: ignore
for result in self . arxiv_search ( # type: ignore
query [ : self . ARXIV_MAX_QUERY_LENGTH ] , max_results = self . load_max_docs
query [ : self . ARXIV_MAX_QUERY_LENGTH ] , max_results = self . load_max_docs
) . results ( ) :
) . results ( )
except self . arxiv_exceptions as ex :
logger . debug ( " Error on arxiv: %s " , ex )
return [ ]
docs : List [ Document ] = [ ]
for result in results :
try :
try :
doc_file_name : str = result . download_pdf ( )
doc_file_name : str = result . download_pdf ( )
with fitz . open ( doc_file_name ) as doc_file :
with fitz . open ( doc_file_name ) as doc_file :
text : str = " " . join ( page . get_text ( ) for page in doc_file )
text : str = " " . join ( page . get_text ( ) for page in doc_file )
add_meta = (
except FileNotFoundError as f_ex :
{
logger . debug ( f_ex )
continue
if self . load_all_available_meta :
extra_metadata = {
" entry_id " : result . entry_id ,
" entry_id " : result . entry_id ,
" published_first_time " : str ( result . published . date ( ) ) ,
" published_first_time " : str ( result . published . date ( ) ) ,
" comment " : result . comment ,
" comment " : result . comment ,
@ -123,27 +132,18 @@ class ArxivAPIWrapper(BaseModel):
" categories " : result . categories ,
" categories " : result . categories ,
" links " : [ link . href for link in result . links ] ,
" links " : [ link . href for link in result . links ] ,
}
}
if self . load_all_available_meta
else :
else { }
extra_metadata = { }
)
metadata = {
doc = Document (
page_content = text [ : self . doc_content_chars_max ] ,
metadata = (
{
" Published " : str ( result . updated . date ( ) ) ,
" Published " : str ( result . updated . date ( ) ) ,
" Title " : result . title ,
" Title " : result . title ,
" Authors " : " , " . join (
" Authors " : " , " . join ( a . name for a in result . authors ) ,
a . name for a in result . authors
) ,
" Summary " : result . summary ,
" Summary " : result . summary ,
* * add _meta,
* * extra_metadata ,
}
}
) ,
doc = Document (
page_content = text [ : self . doc_content_chars_max ] , metadata = metadata
)
)
docs . append ( doc )
docs . append ( doc )
except FileNotFoundError as f_ex :
os . remove ( doc_file_name )
logger . debug ( f_ex )
return docs
return docs
except self . arxiv_exceptions as ex :
logger . debug ( " Error on arxiv: %s " , ex )
return [ ]