langchain/libs/experimental/langchain_experimental/synthetic_data/__init__.py

"""Generate **synthetic data** using LLM and few-shot template."""

from typing import Any, Dict, List, Optional

from langchain.chains.base import Chain
from langchain.chains.llm import LLMChain
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import PromptTemplate

from langchain_experimental.synthetic_data.prompts import SENTENCE_PROMPT


def create_data_generation_chain(
    llm: BaseLanguageModel,
    prompt: Optional[PromptTemplate] = None,
) -> Chain:
    """Build an ``LLMChain`` that generates synthetic sentences from fields.

    Args:
        llm: Language model that the returned chain runs.
        prompt: Prompt template handed to the chain. A falsy value
            (including the default ``None``) selects ``SENTENCE_PROMPT``.

    Returns:
        An ``LLMChain`` constructed with ``llm`` and the selected prompt.
    """
    # Fall back to the module's default sentence prompt when none is supplied.
    selected_prompt = prompt if prompt else SENTENCE_PROMPT
    chain = LLMChain(llm=llm, prompt=selected_prompt)
    return chain


class DatasetGenerator:
    """Produce synthetic dataset entries by running a generation chain.

    The chain is built once from the given language model and is then called
    one time for every group of fields passed to the instance.
    """

    def __init__(
        self,
        llm: BaseLanguageModel,
        sentence_preferences: Optional[Dict[str, Any]] = None,
    ):
        """Create the generation chain and store the sentence preferences.

        Args:
            llm: Language model passed to ``create_data_generation_chain``.
            sentence_preferences: Preferences sent along with every chain
                call; a falsy value is replaced by an empty dict.
        """
        self.generator = create_data_generation_chain(llm)
        if sentence_preferences:
            self.sentence_preferences = sentence_preferences
        else:
            self.sentence_preferences = {}

    def __call__(self, fields_collection: List[List[Any]]) -> List[Dict[str, Any]]:
        """Call the chain once per entry of ``fields_collection``.

        Args:
            fields_collection: Groups of fields; each group becomes the
                ``"fields"`` input of one chain call, paired with the stored
                preferences under ``"preferences"``.

        Returns:
            The chain results, in the same order as ``fields_collection``.
        """
        return [
            self.generator(
                {"fields": entry, "preferences": self.sentence_preferences}
            )
            for entry in fields_collection
        ]
experimental[patch]: update module doc strings (#19539) Added missed module descriptions. Fixed format. 2024-03-26 14:38:10 +00:00			`"""Generate synthetic data using LLM and few-shot template."""`
infra: update mypy 1.10, ruff 0.5 (#23721) ```python """python scripts/update_mypy_ruff.py""" import glob import tomllib from pathlib import Path import toml import subprocess import re ROOT_DIR = Path(__file__).parents[1] def main(): for path in glob.glob(str(ROOT_DIR / "libs/*/pyproject.toml"), recursive=True): print(path) with open(path, "rb") as f: pyproject = tomllib.load(f) try: pyproject["tool"]["poetry"]["group"]["typing"]["dependencies"]["mypy"] = ( "^1.10" ) pyproject["tool"]["poetry"]["group"]["lint"]["dependencies"]["ruff"] = ( "^0.5" ) except KeyError: continue with open(path, "w") as f: toml.dump(pyproject, f) cwd = "/".join(path.split("/")[:-1]) completed = subprocess.run( "poetry lock --no-update; poetry install --with typing; poetry run mypy . --no-color", cwd=cwd, shell=True, capture_output=True, text=True, ) logs = completed.stdout.split("\n") to_ignore = {} for l in logs: if re.match("^(.)\:(\d+)\: error:.\[(.)\]", l): path, line_no, error_type = re.match( "^(.)\:(\d+)\: error:.\[(.*)\]", l ).groups() if (path, line_no) in to_ignore: to_ignore[(path, line_no)].append(error_type) else: to_ignore[(path, line_no)] = [error_type] print(len(to_ignore)) for (error_path, line_no), error_types in to_ignore.items(): all_errors = ", ".join(error_types) full_path = f"{cwd}/{error_path}" try: with open(full_path, "r") as f: file_lines = f.readlines() except FileNotFoundError: continue file_lines[int(line_no) - 1] = ( file_lines[int(line_no) - 1][:-1] + f" # type: ignore[{all_errors}]\n" ) with open(full_path, "w") as f: f.write("".join(file_lines)) subprocess.run( "poetry run ruff format .; poetry run ruff --select I --fix .", cwd=cwd, shell=True, capture_output=True, text=True, ) if __name__ == "__main__": main() ``` 2024-07-03 17:33:27 +00:00
fix experimental imports (#10875) 2023-09-21 06:44:17 +00:00			`from typing import Any, Dict, List, Optional`
Synthetic data generation (#9759) ### Description Implements synthetic data generation with the fields and preferences given by the user. Adds showcase notebook. Corresponding prompt was proposed for langchain-hub. ### Example ``` output = chain({"fields": {"colors": ["blue", "yellow"]}, "preferences": {"style": "Make it in a style of a weather forecast."}}) print(output) # {'fields': {'colors': ['blue', 'yellow']}, 'preferences': {'style': 'Make it in a style of a weather forecast.'}, 'text': "Good morning! Today's weather forecast brings a beautiful combination of colors to the sky, with hues of blue and yellow gently blending together like a mesmerizing painting."} ``` ### Twitter handle @deepsense_ai @matt_wosinski --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-09-19 23:29:50 +00:00
fix experimental imports (#10875) 2023-09-21 06:44:17 +00:00			`from langchain.chains.base import Chain`
Synthetic data generation (#9759) ### Description Implements synthetic data generation with the fields and preferences given by the user. Adds showcase notebook. Corresponding prompt was proposed for langchain-hub. ### Example ``` output = chain({"fields": {"colors": ["blue", "yellow"]}, "preferences": {"style": "Make it in a style of a weather forecast."}}) print(output) # {'fields': {'colors': ['blue', 'yellow']}, 'preferences': {'style': 'Make it in a style of a weather forecast.'}, 'text': "Good morning! Today's weather forecast brings a beautiful combination of colors to the sky, with hues of blue and yellow gently blending together like a mesmerizing painting."} ``` ### Twitter handle @deepsense_ai @matt_wosinski --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-09-19 23:29:50 +00:00			`from langchain.chains.llm import LLMChain`
langchain[patch], experimental[patch]: replace langchain.schema imports (#15410) Import from core instead. Ran: ```bash git grep -l 'from langchain.schema\.output_parser' \| xargs -L 1 sed -i '' "s/from\ langchain\.schema\.output_parser/from\ langchain_core.output_parsers/g" git grep -l 'from langchain.schema\.messages' \| xargs -L 1 sed -i '' "s/from\ langchain\.schema\.messages/from\ langchain_core.messages/g" git grep -l 'from langchain.schema\.document' \| xargs -L 1 sed -i '' "s/from\ langchain\.schema\.document/from\ langchain_core.documents/g" git grep -l 'from langchain.schema\.runnable' \| xargs -L 1 sed -i '' "s/from\ langchain\.schema\.runnable/from\ langchain_core.runnables/g" git grep -l 'from langchain.schema\.vectorstore' \| xargs -L 1 sed -i '' "s/from\ langchain\.schema\.vectorstore/from\ langchain_core.vectorstores/g" git grep -l 'from langchain.schema\.language_model' \| xargs -L 1 sed -i '' "s/from\ langchain\.schema\.language_model/from\ langchain_core.language_models/g" git grep -l 'from langchain.schema\.embeddings' \| xargs -L 1 sed -i '' "s/from\ langchain\.schema\.embeddings/from\ langchain_core.embeddings/g" git grep -l 'from langchain.schema\.storage' \| xargs -L 1 sed -i '' "s/from\ langchain\.schema\.storage/from\ langchain_core.stores/g" git checkout master libs/langchain/tests/unit_tests/schema/ make format cd libs/experimental make format cd ../langchain make format ``` 2024-01-02 20:09:45 +00:00			`from langchain_core.language_models import BaseLanguageModel`
experimental[patch]: `PromptTemplate` import fix (#19617) Changed import of `PromptTemplate` from `langchain` to `langchain_core` in `langchain_experimental` 2024-03-27 00:03:13 +00:00			`from langchain_core.prompts import PromptTemplate`
Synthetic data generation (#9759) ### Description Implements synthetic data generation with the fields and preferences given by the user. Adds showcase notebook. Corresponding prompt was proposed for langchain-hub. ### Example ``` output = chain({"fields": {"colors": ["blue", "yellow"]}, "preferences": {"style": "Make it in a style of a weather forecast."}}) print(output) # {'fields': {'colors': ['blue', 'yellow']}, 'preferences': {'style': 'Make it in a style of a weather forecast.'}, 'text': "Good morning! Today's weather forecast brings a beautiful combination of colors to the sky, with hues of blue and yellow gently blending together like a mesmerizing painting."} ``` ### Twitter handle @deepsense_ai @matt_wosinski --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-09-19 23:29:50 +00:00
			`from langchain_experimental.synthetic_data.prompts import SENTENCE_PROMPT`


			`def create_data_generation_chain(`
			`llm: BaseLanguageModel,`
			`prompt: Optional[PromptTemplate] = None,`
			`) -> Chain:`
experimental: docstrings update (#18048) Added missed docstrings. Formatted docstrings to the consistent format. 2024-02-24 02:24:16 +00:00			`"""Create a chain that generates synthetic sentences with`
Synthetic data generation (#9759) ### Description Implements synthetic data generation with the fields and preferences given by the user. Adds showcase notebook. Corresponding prompt was proposed for langchain-hub. ### Example ``` output = chain({"fields": {"colors": ["blue", "yellow"]}, "preferences": {"style": "Make it in a style of a weather forecast."}}) print(output) # {'fields': {'colors': ['blue', 'yellow']}, 'preferences': {'style': 'Make it in a style of a weather forecast.'}, 'text': "Good morning! Today's weather forecast brings a beautiful combination of colors to the sky, with hues of blue and yellow gently blending together like a mesmerizing painting."} ``` ### Twitter handle @deepsense_ai @matt_wosinski --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-09-19 23:29:50 +00:00			`provided fields.`

			`Args:`
			`llm: The language model to use.`
			`prompt: Prompt to feed the language model with.`
			`If not provided, the default one will be used.`
			`"""`
			`prompt = prompt or SENTENCE_PROMPT`
			`return LLMChain(`
			`llm=llm,`
			`prompt=prompt,`
			`)`


			`class DatasetGenerator:`
experimental: docstrings update (#18048) Added missed docstrings. Formatted docstrings to the consistent format. 2024-02-24 02:24:16 +00:00			`"""Generate synthetic dataset with a given language model."""`
Synthetic data generation (#9759) ### Description Implements synthetic data generation with the fields and preferences given by the user. Adds showcase notebook. Corresponding prompt was proposed for langchain-hub. ### Example ``` output = chain({"fields": {"colors": ["blue", "yellow"]}, "preferences": {"style": "Make it in a style of a weather forecast."}}) print(output) # {'fields': {'colors': ['blue', 'yellow']}, 'preferences': {'style': 'Make it in a style of a weather forecast.'}, 'text': "Good morning! Today's weather forecast brings a beautiful combination of colors to the sky, with hues of blue and yellow gently blending together like a mesmerizing painting."} ``` ### Twitter handle @deepsense_ai @matt_wosinski --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-09-19 23:29:50 +00:00
			`def __init__(`
			`self,`
			`llm: BaseLanguageModel,`
			`sentence_preferences: Optional[Dict[str, Any]] = None,`
			`):`
			`self.generator = create_data_generation_chain(llm)`
			`self.sentence_preferences = sentence_preferences or {}`

			`def __call__(self, fields_collection: List[List[Any]]) -> List[Dict[str, Any]]:`
			`results: List[Dict[str, Any]] = []`
			`for fields in fields_collection:`
			`results.append(`
			`self.generator(`
			`{"fields": fields, "preferences": self.sentence_preferences}`
			`)`
			`)`
			`return results`