Fix packaging and imports, and introduce tests with pytest.

There are still issues with the Celery worker.
pull/300/head
Anton Larin 9 months ago
parent 9a393b4f74
commit 98a97f34f5

@@ -0,0 +1,28 @@
name: Run python tests with pytest

on: [push, pull_request]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11"]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest
          cd application
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        run: |
          cd application
          pytest
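Note: the workflow runs pytest from inside application/ while the new tests import from the application package. One way to make those absolute imports resolve regardless of where pytest is started is a small conftest.py that puts the repository root on sys.path; the sketch below is an assumption for illustration, not part of this commit.

# application/conftest.py -- hypothetical helper, not included in this commit.
# Prepends the repository root to sys.path so `from application.app import ...`
# resolves when pytest is invoked from inside the application directory.
import os
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))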

@@ -37,9 +37,9 @@ from langchain.schema import HumanMessage, AIMessage
from pymongo import MongoClient
from werkzeug.utils import secure_filename
from core.settings import settings
from error import bad_request
from worker import ingest_worker
from application.core.settings import settings
from application.error import bad_request
from application.worker import ingest_worker
from bson.objectid import ObjectId
# os.environ["LANGCHAIN_HANDLER"] = "langchain"
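For context, the import changes in this and the following hunks switch from top-level imports to absolute imports rooted at the application package. That assumes application/ is importable as a package from the repository root, roughly like the illustrative layout below (the __init__.py is an assumption, not shown in this diff):

# Assumed layout (illustration only, not taken from the commit):
#   repo-root/
#     application/
#       __init__.py        # makes `application` a package
#       app.py, worker.py, wsgi.py, error.py
#       core/settings.py
#       parser/...
# With the repository root on sys.path, absolute imports such as these work
# from any entry point:
from application.core.settings import settings
from application.error import bad_request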

@@ -3,7 +3,7 @@ from abc import abstractmethod
from typing import Any, List
from langchain.docstore.document import Document as LCDocument
from parser.schema.base import Document
from application.parser.schema.base import Document
class BaseReader:

@@ -3,15 +3,15 @@ import logging
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
from parser.file.base import BaseReader
from parser.file.base_parser import BaseParser
from parser.file.docs_parser import DocxParser, PDFParser
from parser.file.epub_parser import EpubParser
from parser.file.html_parser import HTMLParser
from parser.file.markdown_parser import MarkdownParser
from parser.file.rst_parser import RstParser
from parser.file.tabular_parser import PandasCSVParser
from parser.schema.base import Document
from application.parser.file.base import BaseReader
from application.parser.file.base_parser import BaseParser
from application.parser.file.docs_parser import DocxParser, PDFParser
from application.parser.file.epub_parser import EpubParser
from application.parser.file.html_parser import HTMLParser
from application.parser.file.markdown_parser import MarkdownParser
from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser
from application.parser.schema.base import Document
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
    ".pdf": PDFParser(),

@@ -6,7 +6,7 @@ Contains parsers for docx, pdf files.
from pathlib import Path
from typing import Dict
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class PDFParser(BaseParser):

@@ -6,7 +6,7 @@ Contains parsers for epub files.
from pathlib import Path
from typing import Dict
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class EpubParser(BaseParser):

@@ -7,7 +7,7 @@ import re
from pathlib import Path
from typing import Dict, Union
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class HTMLParser(BaseParser):

@@ -8,7 +8,7 @@ from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
import tiktoken
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class MarkdownParser(BaseParser):

@@ -7,7 +7,7 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class RstParser(BaseParser):

@@ -6,7 +6,7 @@ Contains parsers for tabular data files.
from pathlib import Path
from typing import Any, Dict, List, Union
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class CSVParser(BaseParser):

@@ -2,7 +2,7 @@
from dataclasses import dataclass
from langchain.docstore.document import Document as LCDocument
from parser.schema.schema import BaseDocument
from application.parser.schema.schema import BaseDocument
@dataclass

@@ -3,7 +3,7 @@ from math import ceil
from typing import List
import tiktoken
from parser.schema.base import Document
from application.parser.schema.base import Document
def separate_header_and_body(text):

@@ -73,6 +73,7 @@ pymongo==4.3.3
pyowm==3.3.0
PyPDF2==3.0.1
PySocks==1.7.1
pytest
python-dateutil==2.8.2
python-dotenv==1.0.0
python-jose==3.3.0

@@ -0,0 +1,37 @@
from application.app import get_vectorstore


# Test cases for get_vectorstore function
def test_no_active_docs():
    data = {}
    assert get_vectorstore(data) == ""


def test_default_active_docs():
    data = {"active_docs": "default"}
    assert get_vectorstore(data) == ""


def test_local_default_active_docs():
    data = {"active_docs": "local/default"}
    assert get_vectorstore(data) == ""


def test_local_custom_active_docs():
    data = {"active_docs": "local/custom_index"}
    assert get_vectorstore(data) == "indexes/local/custom_index"


def test_remote_active_docs():
    data = {"active_docs": "remote_index"}
    assert get_vectorstore(data) == "vectors/remote_index"


def test_active_docs_not_in_data():
    data = {"other_key": "value"}
    assert get_vectorstore(data) == ""


def test_multiple_slashes_in_active_docs():
    data = {"active_docs": "local/some/other/index"}
    assert get_vectorstore(data) == "indexes/local/some/other/index"
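For reference, here is a sketch of the branching these tests imply; it is a hand-written illustration of the expected behaviour, not the actual implementation in application/app.py:

# Illustrative implementation that satisfies the tests above (not the project's code).
def get_vectorstore(data):
    vectorstore = ""
    if "active_docs" in data:
        active = data["active_docs"]
        if active.split("/")[0] == "local":
            # "local/default" means the built-in default store; anything else
            # under local/ is looked up under indexes/.
            vectorstore = "" if active.split("/")[1] == "default" else "indexes/" + active
        elif active == "default":
            vectorstore = ""
        else:
            vectorstore = "vectors/" + active
    return vectorstore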

@@ -7,11 +7,11 @@ from urllib.parse import urljoin
import nltk
import requests
from core.settings import settings
from parser.file.bulk import SimpleDirectoryReader
from parser.open_ai_func import call_openai_api
from parser.schema.base import Document
from parser.token_func import group_split
from application.core.settings import settings
from application.parser.file.bulk import SimpleDirectoryReader
from application.parser.open_ai_func import call_openai_api
from application.parser.schema.base import Document
from application.parser.token_func import group_split
try:
    nltk.download('punkt', quiet=True)

@@ -1,4 +1,4 @@
from app import app
from application.app import app
if __name__ == "__main__":
    app.run(debug=True, port=7091)

@@ -13,6 +13,7 @@ services:
  backend:
    build: ./application
    working_dir: /application
    environment:
      - API_KEY=$OPENAI_API_KEY
      - EMBEDDINGS_KEY=$OPENAI_API_KEY
@@ -27,16 +28,17 @@ services:
    ports:
      - "7091:7091"
    volumes:
      - ./application/indexes:/app/indexes
      - ./application/inputs:/app/inputs
      - ./application/vectors:/app/vectors
      - ./application/indexes:/application/indexes
      - ./application/inputs:/application/inputs
      - ./application/vectors:/application/vectors
    depends_on:
      - redis
      - mongo
  worker:
    build: ./application
    command: celery -A app.celery worker -l INFO
    working_dir: /application
    command: celery -A application.app.celery worker -l INFO
    environment:
      - API_KEY=$OPENAI_API_KEY
      - EMBEDDINGS_KEY=$OPENAI_API_KEY

@@ -13,6 +13,7 @@ services:
  backend:
    build: ./application
    working_dir: /application
    environment:
      - API_KEY=$OPENAI_API_KEY
      - EMBEDDINGS_KEY=$OPENAI_API_KEY
@@ -22,16 +23,17 @@ services:
    ports:
      - "7091:7091"
    volumes:
      - ./application/indexes:/app/indexes
      - ./application/inputs:/app/inputs
      - ./application/vectors:/app/vectors
      - ./application/indexes:/application/indexes
      - ./application/inputs:/application/inputs
      - ./application/vectors:/application/vectors
    depends_on:
      - redis
      - mongo
  worker:
    build: ./application
    command: celery -A app.celery worker -l INFO
    working_dir: /application
    command: celery -A application.app.celery worker -l INFO
    environment:
      - API_KEY=$OPENAI_API_KEY
      - EMBEDDINGS_KEY=$OPENAI_API_KEY
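A note on the worker command: `celery -A application.app.celery` only resolves if the application package is importable from the worker's working directory (/application) or from PYTHONPATH, which may be related to the Celery worker issue mentioned in the commit message. For illustration, the -A path is expected to point at a Celery instance named `celery`, roughly like the hypothetical sketch below (an assumption about app.py, not shown in this diff):

# Hypothetical shape of what `-A application.app.celery` must import:
# a Celery instance named `celery` defined in application/app.py.
from celery import Celery

celery = Celery(__name__,
                broker="redis://redis:6379/0",
                backend="redis://redis:6379/0")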

@@ -110,8 +110,6 @@ tenacity==8.2.2
threadpoolctl==3.2.0
tiktoken==0.4.0
tokenizers==0.13.3
torch==2.0.1
torchvision==0.15.2
tqdm==4.65.0
transformers==4.31.0
typer==0.9.0
