mirror of
https://github.com/arc53/DocsGPT
synced 2024-10-31 09:20:25 +00:00
98a97f34f5
still issues with celery worker.
60 lines
1.5 KiB
Python
60 lines
1.5 KiB
Python
"""Docs parser.
|
|
|
|
Contains parsers for docx and pdf files.
|
|
|
|
"""
|
|
from pathlib import Path
|
|
from typing import Dict
|
|
|
|
from application.parser.file.base_parser import BaseParser
|
|
|
|
|
|
class PDFParser(BaseParser):
    """Parser that extracts text from PDF files using PyPDF2."""

    def _init_parser(self) -> Dict:
        """Init parser.

        Returns:
            An empty dict.
        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text of every page of a PDF file.

        Args:
            file: Path of the PDF file to read.
            errors: Unused by this parser.

        Returns:
            The extracted text of all pages, in page order, joined with
            newline characters.

        Raises:
            ValueError: If PyPDF2 cannot be imported.
        """
        try:
            # Imported inside the method, so PyPDF2 is only needed when a
            # PDF is actually parsed.
            import PyPDF2
        except ImportError as err:
            # Keep ValueError for callers, but chain the original failure.
            raise ValueError("PyPDF2 is required to read PDF files.") from err

        with open(file, "rb") as fp:
            # Create a PDF object and pull the text while ``fp`` is open.
            pdf = PyPDF2.PdfReader(fp)
            page_texts = [page.extract_text() for page in pdf.pages]

        return "\n".join(page_texts)
|
|
|
|
|
|
class DocxParser(BaseParser):
    """Parser that extracts text from Microsoft Word files using docx2txt."""

    def _init_parser(self) -> Dict:
        """Init parser.

        Returns:
            An empty dict.
        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text of a Microsoft Word file.

        Args:
            file: Path of the Word file to read.
            errors: Unused by this parser.

        Returns:
            The value returned by ``docx2txt.process`` for ``file``.

        Raises:
            ValueError: If docx2txt cannot be imported.
        """
        try:
            # Imported inside the method, so docx2txt is only needed when a
            # Word file is actually parsed.
            import docx2txt
        except ImportError as err:
            # Keep ValueError for callers, but chain the original failure.
            raise ValueError(
                "docx2txt is required to read Microsoft Word files."
            ) from err

        return docx2txt.process(file)
|