diff --git a/libs/langchain/langchain/evaluation/comparison/eval_chain.py b/libs/langchain/langchain/evaluation/comparison/eval_chain.py
index 21f623bf0f..9a63fff38b 100644
--- a/libs/langchain/langchain/evaluation/comparison/eval_chain.py
+++ b/libs/langchain/langchain/evaluation/comparison/eval_chain.py
@@ -1,12 +1,20 @@
 """Base classes for comparing the output of two models."""
 from __future__ import annotations
 
+import logging
+import re
 from typing import Any, Dict, List, Optional, Union
 
 from langchain.callbacks.manager import Callbacks
 from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
 from langchain.chains.llm import LLMChain
-from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE
+from langchain.chat_models.azure_openai import AzureChatOpenAI
+from langchain.chat_models.openai import ChatOpenAI
+from langchain.evaluation.comparison.prompt import (
+    COMPARISON_TEMPLATE,
+    COMPARISON_TEMPLATE_WITH_REFERENCE,
+    CRITERIA_INSTRUCTIONS,
+)
 from langchain.evaluation.criteria.eval_chain import (
     CRITERIA_TYPE,
     Criteria,
@@ -17,6 +25,10 @@ from langchain.pydantic_v1 import Extra, Field
 from langchain.schema import RUN_KEY, BaseOutputParser
 from langchain.schema.language_model import BaseLanguageModel
 
+logger = logging.getLogger(__name__)
+
+_FIND_DOUBLE_BRACKETS = re.compile(r"\[\[(.*?)\]\]")
+
 _SUPPORTED_CRITERIA = {
     Criteria.CONCISENESS: "Is the submission concise and to the point?",
     Criteria.RELEVANCE: "Is the submission referring to a real quote from the text?",
@@ -112,27 +124,26 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
             ValueError: If the verdict is invalid.
 
         """
-        parsed = text.strip().rsplit("\n", maxsplit=1)
-        if len(parsed) == 1:
-            reasoning = ""
-            verdict = parsed[0]
-        else:
-            reasoning, verdict = parsed
-        verdict = verdict.strip("[").strip("]")
-        if verdict not in {"A", "B", "C"}:
+        match = _FIND_DOUBLE_BRACKETS.search(text)
+
+        if match:
+            verdict = match.group(1)
+
+        if not match or verdict not in {"A", "B", "C"}:
             raise ValueError(
-                f"Invalid verdict: {verdict}. "
-                "Verdict must be one of 'A', 'B', or 'C'."
+                f"Invalid output: {text}. "
+                "Output must contain a double bracketed string\
+ with the verdict 'A', 'B', or 'C'."
             )
         # C means the models are tied. Return 'None' meaning no preference
         verdict_ = None if verdict == "C" else verdict
         score = {
             "A": 1,
             "B": 0,
-            None: 0.5,
-        }.get(verdict_)
+            "C": 0.5,
+        }[verdict]
         return {
-            "reasoning": reasoning,
+            "reasoning": text,
             "value": verdict_,
             "score": score,
         }
@@ -225,7 +236,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
         """Initialize the PairwiseStringEvalChain from an LLM.
 
         Args:
-            llm (BaseLanguageModel): The LLM to use.
+            llm (BaseChatModel): The LLM to use (GPT-4 recommended).
             prompt (PromptTemplate, optional): The prompt to use.
             **kwargs (Any): Additional keyword arguments.
 
@@ -236,8 +247,17 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
             ValueError: If the input variables are not as expected.
 
         """
+        if not (
+            isinstance(llm, (ChatOpenAI, AzureChatOpenAI))
+            and llm.model_name.startswith("gpt-4")
+        ):
+            logger.warning(
+                "This chain was only tested with GPT-4. \
+Performance may be significantly worse with other models."
+            )
+
         expected_input_vars = {"prediction", "prediction_b", "input", "criteria"}
-        prompt_ = prompt or PROMPT
+        prompt_ = prompt or COMPARISON_TEMPLATE.partial(reference="")
         if expected_input_vars != set(prompt_.input_variables):
             raise ValueError(
                 f"Input variables should be {expected_input_vars}, "
@@ -245,6 +265,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
             )
         criteria_ = resolve_pairwise_criteria(criteria)
         criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
+        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
         return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
 
     def _prepare_input(
@@ -418,7 +439,7 @@ class LabeledPairwiseStringEvalChain(PairwiseStringEvalChain):
             "reference",
             "criteria",
         }
-        prompt_ = prompt or PROMPT_WITH_REFERENCE
+        prompt_ = prompt or COMPARISON_TEMPLATE_WITH_REFERENCE
         if expected_input_vars != set(prompt_.input_variables):
             raise ValueError(
                 f"Input variables should be {expected_input_vars}, "
@@ -426,4 +447,5 @@ class LabeledPairwiseStringEvalChain(PairwiseStringEvalChain):
             )
         criteria_ = resolve_pairwise_criteria(criteria)
         criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
+        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
         return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
diff --git a/libs/langchain/langchain/evaluation/comparison/prompt.py b/libs/langchain/langchain/evaluation/comparison/prompt.py
index a29e8b4130..bed64a4dc6 100644
--- a/libs/langchain/langchain/evaluation/comparison/prompt.py
+++ b/libs/langchain/langchain/evaluation/comparison/prompt.py
@@ -5,64 +5,55 @@ and answers the question. The prompt is based on the paper from Zheng, et. al.
 https://arxiv.org/abs/2306.05685
 """
 # flake8: noqa
-from langchain.prompts import PromptTemplate
-
-template = """Act as a fair judge and rate the two responses to the question below.\
- Choose the response that best followed the instructions and answered the question.\
- Your assessment should weigh the following criteria:
-{criteria}\
- Start by comparing both responses and give a brief rationale.\
- Avoid bias from the order of presentation or response length.
-After giving your rationale, make your final decision using this format:\
- "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,\
- and "[[C]]" for a tie. Finally, repeat the decision again on its own on a new line.
-
-[QUESTION]
-{input}
-[/QUESTION]
-
-[RESPONSE A]
-{prediction}
-[/RESPONSE A]
-
-[RESPONSE B]
-{prediction_b}
-[/RESPONSE B]"""
-PROMPT = PromptTemplate(
-    input_variables=["input", "prediction", "prediction_b", "criteria"],
-    template=template,
+from langchain.prompts.chat import ChatPromptTemplate
+
+SYSTEM_MESSAGE = 'Please act as an impartial judge and evaluate the quality \
+of the responses provided by two AI assistants to the user question displayed below. \
+You should choose the assistant that follows the user\'s instructions \
+and answers \the user\'s question better. \
+Your evaluation should consider factors such as the \
+helpfulness, relevance, accuracy, depth, creativity, \
+and level of detail of their responses. \
+Begin your evaluation by comparing the two responses and provide a short explanation. \
+Avoid any position biases and ensure that the order in which \
+the responses were presented does not influence your decision. \
+Do not allow the length of the responses to influence your evaluation. \
+Do not favor certain names of the assistants. Be as objective as possible. \
+After providing your explanation, output your final verdict by strictly following \
+this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better, \
+and "[[C]]" for a tie.'
+
+CRITERIA_INSTRUCTIONS = (
+    "For this evaluation, you should primarily consider the following criteria:\n"
 )
 
-template = """Act as a fair judge and rate the two responses to the question below.\
- Choose the response that best followed the instructions and answered the question.\
- Your assessment should weigh the following criteria:
-{criteria}\
- Start by comparing both responses and give a brief rationale.\
- Avoid bias from the order of presentation or response length.\
- Weigh accuracy based on the following ground truth reference\
- answer to the question:
-
-[REFERENCE]
-{reference}
-[/REFERENCE]
-
-After giving your rationale, make your final decision using this format:\
- "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,\
- and "[[C]]" for a tie. Finally, repeat the decision again on its own on a new line.
-
-[QUESTION]
-{input}
-[/QUESTION]
-
-[RESPONSE A]
-{prediction}
-[/RESPONSE A]
-
-[RESPONSE B]
-{prediction_b}
-[/RESPONSE B]"""
+COMPARISON_TEMPLATE = ChatPromptTemplate.from_messages(
+    [
+        ("system", SYSTEM_MESSAGE),
+        (
+            "human",
+            "{criteria}[User Question]\n{input}\n\n\
+[The Start of Assistant A's Answer]\n{prediction}\n\
+[The End of Assistant A's Answer]\
+\n\n[The Start of Assistant B's Answer]\n{prediction_b}\n\
+[The End of Assistant B's Answer]",
+        ),
+    ]
+)
 
-PROMPT_WITH_REFERENCE = PromptTemplate(
-    input_variables=["input", "prediction", "prediction_b", "reference", "criteria"],
-    template=template,
+COMPARISON_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
+    [
+        ("system", SYSTEM_MESSAGE),
+        (
+            "human",
+            "{criteria}\n\nTo help you evaluate the responses, \
+here is a reference answer to the user's question:\n\
+{reference}\
+[User Question]\n{input}\n\n\
+[The Start of Assistant A's Answer]\n{prediction}\n\
+[The End of Assistant A's Answer]\
+\n\n[The Start of Assistant B's Answer]\n{prediction_b}\n\
+[The End of Assistant B's Answer]",
+        ),
+    ]
 )
diff --git a/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py b/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py
index 269d3caf2b..0bbfb43b23 100644
--- a/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py
+++ b/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py
@@ -34,7 +34,7 @@ def test_PairwiseStringResultOutputParser_parse() -> None:
 [[A]]"""
     got = output_parser.parse(text)
     want = {
-        "reasoning": "I like pie better than cake.",
+        "reasoning": text,
         "value": "A",
         "score": 1,
     }
@@ -46,7 +46,7 @@
 [[B]]"""
     got = output_parser.parse(text)
     want = {
-        "reasoning": "I like cake better than pie.",
+        "reasoning": text,
         "value": "B",
         "score": 0,
     }
@@ -58,7 +58,7 @@
 [[C]]"""
     got = output_parser.parse(text)
     want = {
-        "reasoning": "I like cake and pie.",
+        "reasoning": text,
         "value": None,
         "score": 0.5,
     }
@@ -84,7 +84,7 @@ def test_pairwise_string_comparison_chain() -> None:
     )
     assert res["value"] is None
     assert res["score"] == 0.5
-    assert res["reasoning"] == "The values are the same."
+ assert res["reasoning"] == "The values are the same.\n[[C]]" res = chain.evaluate_string_pairs( prediction="I like pie.", prediction_b="I like pie.",