add ocr_languages param for ConfluenceLoader.load() (#5823)

@eyurtsev

当Confluence文档内容中包含附件,且附件内容为非英文时,提取出来的文本是乱码的。
When a Confluence document contains attachments whose content is not in
English, the text extracted from those attachments is garbled.

这主要是因为没有为pytesseract传递lang参数,默认情况下只支持英文。
This is mainly because the lang parameter is not passed to pytesseract,
which supports only English by default.

所以我给ConfluenceLoader.load()添加了ocr_languages参数,以便支持多种语言。
So I added the ocr_languages parameter to ConfluenceLoader.load() to
support multiple languages.
searx_updates
Vincent 1 year ago committed by GitHub
parent ac3e6e3944
commit 0b740c9baa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -180,6 +180,7 @@ class ConfluenceLoader(BaseLoader):
include_comments: bool = False, include_comments: bool = False,
limit: Optional[int] = 50, limit: Optional[int] = 50,
max_pages: Optional[int] = 1000, max_pages: Optional[int] = 1000,
ocr_languages: Optional[str] = None,
) -> List[Document]: ) -> List[Document]:
""" """
:param space_key: Space key retrieved from a confluence URL, defaults to None :param space_key: Space key retrieved from a confluence URL, defaults to None
@ -203,6 +204,10 @@ class ConfluenceLoader(BaseLoader):
:type limit: int, optional :type limit: int, optional
:param max_pages: Maximum number of pages to retrieve in total, defaults 1000 :param max_pages: Maximum number of pages to retrieve in total, defaults 1000
:type max_pages: int, optional :type max_pages: int, optional
:param ocr_languages: The languages to use for the Tesseract agent. To use a
language, you'll first need to install the appropriate
Tesseract language pack.
:type ocr_languages: str, optional
:raises ValueError: _description_ :raises ValueError: _description_
:raises ImportError: _description_ :raises ImportError: _description_
:return: _description_ :return: _description_
@ -226,7 +231,11 @@ class ConfluenceLoader(BaseLoader):
expand="body.storage.value", expand="body.storage.value",
) )
docs += self.process_pages( docs += self.process_pages(
pages, include_restricted_content, include_attachments, include_comments pages,
include_restricted_content,
include_attachments,
include_comments,
ocr_languages,
) )
if label: if label:
@ -252,7 +261,11 @@ class ConfluenceLoader(BaseLoader):
expand="body.storage.value", expand="body.storage.value",
) )
docs += self.process_pages( docs += self.process_pages(
pages, include_restricted_content, include_attachments, include_comments pages,
include_restricted_content,
include_attachments,
include_comments,
ocr_languages,
) )
if page_ids: if page_ids:
@ -272,7 +285,9 @@ class ConfluenceLoader(BaseLoader):
page = get_page(page_id=page_id, expand="body.storage.value") page = get_page(page_id=page_id, expand="body.storage.value")
if not include_restricted_content and not self.is_public_page(page): if not include_restricted_content and not self.is_public_page(page):
continue continue
doc = self.process_page(page, include_attachments, include_comments) doc = self.process_page(
page, include_attachments, include_comments, ocr_languages
)
docs.append(doc) docs.append(doc)
return docs return docs
@ -335,13 +350,16 @@ class ConfluenceLoader(BaseLoader):
include_restricted_content: bool, include_restricted_content: bool,
include_attachments: bool, include_attachments: bool,
include_comments: bool, include_comments: bool,
ocr_languages: Optional[str] = None,
) -> List[Document]: ) -> List[Document]:
"""Process a list of pages into a list of documents.""" """Process a list of pages into a list of documents."""
docs = [] docs = []
for page in pages: for page in pages:
if not include_restricted_content and not self.is_public_page(page): if not include_restricted_content and not self.is_public_page(page):
continue continue
doc = self.process_page(page, include_attachments, include_comments) doc = self.process_page(
page, include_attachments, include_comments, ocr_languages
)
docs.append(doc) docs.append(doc)
return docs return docs
@ -351,6 +369,7 @@ class ConfluenceLoader(BaseLoader):
page: dict, page: dict,
include_attachments: bool, include_attachments: bool,
include_comments: bool, include_comments: bool,
ocr_languages: Optional[str] = None,
) -> Document: ) -> Document:
try: try:
from bs4 import BeautifulSoup # type: ignore from bs4 import BeautifulSoup # type: ignore
@ -361,7 +380,7 @@ class ConfluenceLoader(BaseLoader):
) )
if include_attachments: if include_attachments:
attachment_texts = self.process_attachment(page["id"]) attachment_texts = self.process_attachment(page["id"], ocr_languages)
else: else:
attachment_texts = [] attachment_texts = []
text = BeautifulSoup(page["body"]["storage"]["value"], "lxml").get_text( text = BeautifulSoup(page["body"]["storage"]["value"], "lxml").get_text(
@ -388,7 +407,11 @@ class ConfluenceLoader(BaseLoader):
}, },
) )
def process_attachment(self, page_id: str) -> List[str]: def process_attachment(
self,
page_id: str,
ocr_languages: Optional[str] = None,
) -> List[str]:
try: try:
from PIL import Image # noqa: F401 from PIL import Image # noqa: F401
except ImportError: except ImportError:
@ -405,13 +428,13 @@ class ConfluenceLoader(BaseLoader):
absolute_url = self.base_url + attachment["_links"]["download"] absolute_url = self.base_url + attachment["_links"]["download"]
title = attachment["title"] title = attachment["title"]
if media_type == "application/pdf": if media_type == "application/pdf":
text = title + self.process_pdf(absolute_url) text = title + self.process_pdf(absolute_url, ocr_languages)
elif ( elif (
media_type == "image/png" media_type == "image/png"
or media_type == "image/jpg" or media_type == "image/jpg"
or media_type == "image/jpeg" or media_type == "image/jpeg"
): ):
text = title + self.process_image(absolute_url) text = title + self.process_image(absolute_url, ocr_languages)
elif ( elif (
media_type == "application/vnd.openxmlformats-officedocument" media_type == "application/vnd.openxmlformats-officedocument"
".wordprocessingml.document" ".wordprocessingml.document"
@ -420,14 +443,18 @@ class ConfluenceLoader(BaseLoader):
elif media_type == "application/vnd.ms-excel": elif media_type == "application/vnd.ms-excel":
text = title + self.process_xls(absolute_url) text = title + self.process_xls(absolute_url)
elif media_type == "image/svg+xml": elif media_type == "image/svg+xml":
text = title + self.process_svg(absolute_url) text = title + self.process_svg(absolute_url, ocr_languages)
else: else:
continue continue
texts.append(text) texts.append(text)
return texts return texts
def process_pdf(self, link: str) -> str: def process_pdf(
self,
link: str,
ocr_languages: Optional[str] = None,
) -> str:
try: try:
import pytesseract # noqa: F401 import pytesseract # noqa: F401
from pdf2image import convert_from_bytes # noqa: F401 from pdf2image import convert_from_bytes # noqa: F401
@ -452,12 +479,16 @@ class ConfluenceLoader(BaseLoader):
return text return text
for i, image in enumerate(images): for i, image in enumerate(images):
image_text = pytesseract.image_to_string(image) image_text = pytesseract.image_to_string(image, lang=ocr_languages)
text += f"Page {i + 1}:\n{image_text}\n\n" text += f"Page {i + 1}:\n{image_text}\n\n"
return text return text
def process_image(self, link: str) -> str: def process_image(
self,
link: str,
ocr_languages: Optional[str] = None,
) -> str:
try: try:
import pytesseract # noqa: F401 import pytesseract # noqa: F401
from PIL import Image # noqa: F401 from PIL import Image # noqa: F401
@ -481,7 +512,7 @@ class ConfluenceLoader(BaseLoader):
except OSError: except OSError:
return text return text
return pytesseract.image_to_string(image) return pytesseract.image_to_string(image, lang=ocr_languages)
def process_doc(self, link: str) -> str: def process_doc(self, link: str) -> str:
try: try:
@ -531,7 +562,11 @@ class ConfluenceLoader(BaseLoader):
return text return text
def process_svg(self, link: str) -> str: def process_svg(
self,
link: str,
ocr_languages: Optional[str] = None,
) -> str:
try: try:
import pytesseract # noqa: F401 import pytesseract # noqa: F401
from PIL import Image # noqa: F401 from PIL import Image # noqa: F401
@ -560,4 +595,4 @@ class ConfluenceLoader(BaseLoader):
img_data.seek(0) img_data.seek(0)
image = Image.open(img_data) image = Image.open(img_data)
return pytesseract.image_to_string(image) return pytesseract.image_to_string(image, lang=ocr_languages)

Loading…
Cancel
Save