"""Docs parser. Contains parsers for docx, pdf files. """ from pathlib import Path from typing import Dict from application.parser.file.base_parser import BaseParser class PDFParser(BaseParser): """PDF parser.""" def _init_parser(self) -> Dict: """Init parser.""" return {} def parse_file(self, file: Path, errors: str = "ignore") -> str: """Parse file.""" try: import PyPDF2 except ImportError: raise ValueError("PyPDF2 is required to read PDF files.") text_list = [] with open(file, "rb") as fp: # Create a PDF object pdf = PyPDF2.PdfReader(fp) # Get the number of pages in the PDF document num_pages = len(pdf.pages) # Iterate over every page for page in range(num_pages): # Extract the text from the page page_text = pdf.pages[page].extract_text() text_list.append(page_text) text = "\n".join(text_list) return text class DocxParser(BaseParser): """Docx parser.""" def _init_parser(self) -> Dict: """Init parser.""" return {} def parse_file(self, file: Path, errors: str = "ignore") -> str: """Parse file.""" try: import docx2txt except ImportError: raise ValueError("docx2txt is required to read Microsoft Word files.") text = docx2txt.process(file) return text