From debcf053ebd7c628550b37f6cba188ec63691b3d Mon Sep 17 00:00:00 2001 From: Predrag Gruevski <2348618+obi1kenobi@users.noreply.github.com> Date: Wed, 18 Oct 2023 10:55:17 -0400 Subject: [PATCH] Fix `invalid escape sequence` warnings by using raw strings for regexes. (#11943) This code also generates warnings when our users' apps hit it, which is annoying and doesn't look great. Let's fix it. --- libs/langchain/langchain/chains/query_constructor/parser.py | 2 +- libs/langchain/langchain/document_loaders/acreom.py | 6 +++--- .../langchain/document_transformers/openai_functions.py | 4 ++-- libs/langchain/langchain/retrievers/kendra.py | 4 ++-- libs/langchain/langchain/retrievers/self_query/myscale.py | 4 ++-- libs/langchain/langchain/utils/html.py | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/libs/langchain/langchain/chains/query_constructor/parser.py b/libs/langchain/langchain/chains/query_constructor/parser.py index b7af3ae6c7..1dec1058e2 100644 --- a/libs/langchain/langchain/chains/query_constructor/parser.py +++ b/libs/langchain/langchain/chains/query_constructor/parser.py @@ -23,7 +23,7 @@ from langchain.chains.query_constructor.ir import ( Operator, ) -GRAMMAR = """ +GRAMMAR = r""" ?program: func_call ?expr: func_call | value diff --git a/libs/langchain/langchain/document_loaders/acreom.py b/libs/langchain/langchain/document_loaders/acreom.py index c69ba37d0b..740f3797a9 100644 --- a/libs/langchain/langchain/document_loaders/acreom.py +++ b/libs/langchain/langchain/document_loaders/acreom.py @@ -49,9 +49,9 @@ class AcreomLoader(BaseLoader): def _process_acreom_content(self, content: str) -> str: # remove acreom specific elements from content that # do not contribute to the context of current document - content = re.sub("\s*-\s\[\s\]\s.*|\s*\[\s\]\s.*", "", content) # rm tasks - content = re.sub("#", "", content) # rm hashtags - content = re.sub("\[\[.*?\]\]", "", content) # rm doclinks + content = re.sub(r"\s*-\s\[\s\]\s.*|\s*\[\s\]\s.*", "", content) # rm tasks + content = re.sub(r"#", "", content) # rm hashtags + content = re.sub(r"\[\[.*?\]\]", "", content) # rm doclinks return content def lazy_load(self) -> Iterator[Document]: diff --git a/libs/langchain/langchain/document_transformers/openai_functions.py b/libs/langchain/langchain/document_transformers/openai_functions.py index f60586410c..1b85fe75c2 100644 --- a/libs/langchain/langchain/document_transformers/openai_functions.py +++ b/libs/langchain/langchain/document_transformers/openai_functions.py @@ -40,7 +40,7 @@ class OpenAIMetadataTagger(BaseDocumentTransformer, BaseModel): tagging_chain = create_tagging_chain(schema, llm) document_transformer = OpenAIMetadataTagger(tagging_chain=tagging_chain) original_documents = [ - Document(page_content="Review of The Bee Movie\nBy Roger Ebert\n\This is the greatest movie ever made. 4 out of 5 stars."), + Document(page_content="Review of The Bee Movie\nBy Roger Ebert\n\nThis is the greatest movie ever made. 4 out of 5 stars."), Document(page_content="Review of The Godfather\nBy Anonymous\n\nThis movie was super boring. 1 out of 5 stars.", metadata={"reliable": False}), ] @@ -123,7 +123,7 @@ def create_metadata_tagger( document_transformer = create_metadata_tagger(schema, llm) original_documents = [ - Document(page_content="Review of The Bee Movie\nBy Roger Ebert\n\This is the greatest movie ever made. 4 out of 5 stars."), + Document(page_content="Review of The Bee Movie\nBy Roger Ebert\n\nThis is the greatest movie ever made. 4 out of 5 stars."), Document(page_content="Review of The Godfather\nBy Anonymous\n\nThis movie was super boring. 1 out of 5 stars.", metadata={"reliable": False}), ] diff --git a/libs/langchain/langchain/retrievers/kendra.py b/libs/langchain/langchain/retrievers/kendra.py index 68d6444caa..55382993b3 100644 --- a/libs/langchain/langchain/retrievers/kendra.py +++ b/libs/langchain/langchain/retrievers/kendra.py @@ -20,7 +20,7 @@ def clean_excerpt(excerpt: str) -> str: """ if not excerpt: return excerpt - res = re.sub("\s+", " ", excerpt).replace("...", "") + res = re.sub(r"\s+", " ", excerpt).replace("...", "") return res @@ -45,7 +45,7 @@ def combined_text(item: "ResultItem") -> str: DocumentAttributeValueType = Union[str, int, List[str], None] -"""Possible types of a DocumentAttributeValue. +"""Possible types of a DocumentAttributeValue. Dates are also represented as str. """ diff --git a/libs/langchain/langchain/retrievers/self_query/myscale.py b/libs/langchain/langchain/retrievers/self_query/myscale.py index e4c5dde4b6..fde3a6adf2 100644 --- a/libs/langchain/langchain/retrievers/self_query/myscale.py +++ b/libs/langchain/langchain/retrievers/self_query/myscale.py @@ -88,8 +88,8 @@ class MyScaleTranslator(Visitor): return self.map_dict[func](*args) def visit_comparison(self, comparison: Comparison) -> Dict: - regex = "\((.*?)\)" - matched = re.search("\(\w+\)", comparison.attribute) + regex = r"\((.*?)\)" + matched = re.search(r"\(\w+\)", comparison.attribute) # If arbitrary function is applied to an attribute if matched: diff --git a/libs/langchain/langchain/utils/html.py b/libs/langchain/langchain/utils/html.py index ee830e246e..95e1c3c2f4 100644 --- a/libs/langchain/langchain/utils/html.py +++ b/libs/langchain/langchain/utils/html.py @@ -18,13 +18,13 @@ SUFFIXES_TO_IGNORE = ( ".epub", ) SUFFIXES_TO_IGNORE_REGEX = ( - "(?!" + "|".join([re.escape(s) + "[\#'\"]" for s in SUFFIXES_TO_IGNORE]) + ")" + "(?!" + "|".join([re.escape(s) + r"[\#'\"]" for s in SUFFIXES_TO_IGNORE]) + ")" ) PREFIXES_TO_IGNORE_REGEX = ( "(?!" + "|".join([re.escape(s) for s in PREFIXES_TO_IGNORE]) + ")" ) DEFAULT_LINK_REGEX = ( - f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)[\#'\"]" + rf"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)[\#'\"]" )