mirror of
https://github.com/arc53/DocsGPT
synced 2024-11-09 19:10:53 +00:00
60 lines
1.5 KiB
Python
60 lines
1.5 KiB
Python
|
"""Docs parser.

Contains parsers for docx, pdf files.

"""
|
||
|
from pathlib import Path
|
||
|
from typing import Dict
|
||
|
|
||
|
from parser.file.base_parser import BaseParser
|
||
|
|
||
|
|
||
|
class PDFParser(BaseParser):
    """Parser that extracts plain text from PDF files using PyPDF2."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration, which is an empty dict."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text of every page of a PDF file.

        Args:
            file: Path of the PDF file to read.
            errors: Not used by this parser; the file is opened in binary mode.

        Returns:
            The extracted text of all pages, in page order, joined with
            newline characters.

        Raises:
            ValueError: If PyPDF2 is not installed.
        """
        # Imported inside the method so PyPDF2 is only required when a PDF
        # is actually parsed.
        try:
            import PyPDF2
        except ImportError as err:
            # Keep ValueError for existing callers, but chain the real cause.
            raise ValueError("PyPDF2 is required to read PDF files.") from err

        with open(file, "rb") as fp:
            pdf = PyPDF2.PdfReader(fp)
            # Extract inside the ``with`` block, as before, so the stream is
            # still open while the pages are read.
            text_list = [page.extract_text() for page in pdf.pages]

        return "\n".join(text_list)
|
||
|
|
||
|
|
||
|
class DocxParser(BaseParser):
    """Parser that extracts plain text from Microsoft Word files via docx2txt."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration, which is an empty dict."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text of a Microsoft Word document.

        Args:
            file: Path of the document to read; passed to ``docx2txt.process``.
            errors: Not used by this parser.

        Returns:
            The text returned by ``docx2txt.process`` for ``file``.

        Raises:
            ValueError: If docx2txt is not installed.
        """
        # Imported inside the method so docx2txt is only required when a
        # Word document is actually parsed.
        try:
            import docx2txt
        except ImportError as err:
            # Keep ValueError for existing callers, but chain the real cause.
            raise ValueError(
                "docx2txt is required to read Microsoft Word files."
            ) from err

        return docx2txt.process(file)
|