Evaluate

Evaluation pipeline for RAG model performance using LangSmith metrics.

evaluate(project_name, config_path, dataset_name)

Run evaluation on a RAG setup using LangSmith metrics.

Loads a configuration file to initialize a RAG chain and evaluates its performance on a dataset using a set of standard metrics.

Parameters:

Name           Type   Description                                                                    Default
project_name   str    The name of the LangChain project used in LangSmith.                           required
config_path    str    Path to the JSON configuration file defining the RAG setup.                    required
dataset_name   str    The name of the dataset to be used for evaluation, registered in LangSmith.    required

Raises:

Type      Description
IOError   If the configuration file at config_path does not exist.
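
A minimal usage sketch; the import path ragbot.evaluate is assumed from the source location below, and the project, config file, and dataset names are hypothetical placeholders that must already exist in your LangSmith setup:

from ragbot.evaluate import evaluate  # assumed import path

# Hypothetical names; substitute your own project, config file, and dataset.
evaluate(
    project_name="my-rag-project",
    config_path="configs/rag_config.json",
    dataset_name="rag-eval-dataset",
)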

Source code in ragbot\evaluate.py
def evaluate(project_name: str, config_path: str, dataset_name: str):
    """Run evaluation on a RAG setup using LangSmith metrics.

    Loads a configuration file to initialize a RAG chain and evaluates its
    performance on a dataset using a set of standard metrics.

    Args:
        project_name: The name of the LangChain project used in LangSmith.
        config_path: Path to the JSON configuration file defining the RAG setup.
        dataset_name: The name of the dataset to be used for evaluation, registered in LangSmith.

    Raises:
        IOError: If the configuration file at `config_path` does not exist.
    """

    # Define the LLM and embedding model used by the evaluation metrics
    llm = get_model("google", "gemini-2.0-flash", temperature=0.0)
    embeddings = get_embeddings("google", "models/embedding-001")

    # Wrap the metrics for evaluation with LangSmith
    evaluators = [
        EvaluatorChain(metric=metric, llm=llm, embeddings=embeddings)
        for metric in [
            BLEU(),
            ROUGE(),
            SemanticSimilarity(),
            Faithfulness(),
            AnswerRelevance(),
            ContextRelevance(),
        ]
    ]

    # Load the RAG configuration from the JSON config file
    if os.path.exists(config_path):
        with open(config_path, "r") as f:
            config = json.load(f)
            metadata = config
    else:
        raise IOError(f"Configuration file {config_path} not found.")

    # Set up the RAG chain from the loaded configuration
    rag_chain = setup(
        project_name=project_name,
        llm_provider=config["llm_provider"],
        llm=config["llm"],
        llm_temperature=config["llm_temperature"],
        llm_top_p=config["llm_top_p"],
        llm_top_k=config["llm_top_k"],
        embeddings_provider=config["embeddings_provider"],
        embedding_model=config["embedding_model"],
        chunk_size=config["chunk_size"],
        chunk_overlap=config["chunk_overlap"],
        search_type=config["search_type"],
        k_docs=config["k_docs"],
    )

    # Add delay for quota management
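    # (a 12-second sleep per call caps throughput at roughly five chain invocations per minute)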
    def invoke_with_delay(*args, **kwargs):
        time.sleep(12)
        return rag_chain.invoke(*args, **kwargs)

    # Run evaluation
    langsmith.evaluate(
        invoke_with_delay,
        data=dataset_name,
        evaluators=evaluators,
        experiment_prefix="base",
        metadata=metadata,
        max_concurrency=1,
    )
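
The configuration file must supply every key read by setup() above. Below is a hypothetical example config written out from Python; the provider and model names mirror the evaluation models used above, while the remaining values are illustrative placeholders only:

import json

# Illustrative configuration only; each key below is read by setup() in the listing above.
config = {
    "llm_provider": "google",
    "llm": "gemini-2.0-flash",
    "llm_temperature": 0.0,
    "llm_top_p": 0.95,
    "llm_top_k": 40,
    "embeddings_provider": "google",
    "embedding_model": "models/embedding-001",
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "search_type": "similarity",
    "k_docs": 4,
}

with open("configs/rag_config.json", "w") as f:
    json.dump(config, f, indent=2)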