"""Tabular parser. Contains parsers for tabular data files. """ from pathlib import Path from typing import Any, Dict, List, Union from parser.file.base_parser import BaseParser class CSVParser(BaseParser): """CSV parser. Args: concat_rows (bool): whether to concatenate all rows into one document. If set to False, a Document will be created for each row. True by default. """ def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None: """Init params.""" super().__init__(*args, **kwargs) self._concat_rows = concat_rows def _init_parser(self) -> Dict: """Init parser.""" return {} def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: """Parse file. Returns: Union[str, List[str]]: a string or a List of strings. """ try: import csv except ImportError: raise ValueError("csv module is required to read CSV files.") text_list = [] with open(file, "r") as fp: csv_reader = csv.reader(fp) for row in csv_reader: text_list.append(", ".join(row)) if self._concat_rows: return "\n".join(text_list) else: return text_list class PandasCSVParser(BaseParser): r"""Pandas-based CSV parser. Parses CSVs using the separator detection from Pandas `read_csv`function. If special parameters are required, use the `pandas_config` dict. Args: concat_rows (bool): whether to concatenate all rows into one document. If set to False, a Document will be created for each row. True by default. col_joiner (str): Separator to use for joining cols per row. Set to ", " by default. row_joiner (str): Separator to use for joining each row. Only used when `concat_rows=True`. Set to "\n" by default. pandas_config (dict): Options for the `pandas.read_csv` function call. Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html for more information. Set to empty dict by default, this means pandas will try to figure out the separators, table head, etc. on its own. """ def __init__( self, *args: Any, concat_rows: bool = True, col_joiner: str = ", ", row_joiner: str = "\n", pandas_config: dict = {}, **kwargs: Any ) -> None: """Init params.""" super().__init__(*args, **kwargs) self._concat_rows = concat_rows self._col_joiner = col_joiner self._row_joiner = row_joiner self._pandas_config = pandas_config def _init_parser(self) -> Dict: """Init parser.""" return {} def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: """Parse file.""" try: import pandas as pd except ImportError: raise ValueError("pandas module is required to read CSV files.") df = pd.read_csv(file, **self._pandas_config) text_list = df.apply( lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 ).tolist() if self._concat_rows: return (self._row_joiner).join(text_list) else: return text_list