import caribou
from caribou import Message, ToolExecution, Grade
from datetime import datetime, UTC
kestrel = caribou.get_kestrel_client()
if kestrel.enabled:
project = kestrel.get_or_create_project(name="my-project")
run = kestrel.create_run(
project_id=project["id"],
version_hash="abc123",
env_name="my-env",
dataloader_name="my-dataloader",
grader_name="my-grader",
model_name="gpt-5",
)
run_id = run["id"]
kestrel.update_run(run_id, status="running")
kestrel.create_run_step(run_id, step_number=0, phase="eval")
else:
run_id = "local-run"
caribou.configure(run_id=run_id, phase="eval")
for datapoint in datapoints:
if kestrel.enabled:
dp = kestrel.get_or_create_datapoint(
name=datapoint["name"],
problem_input=datapoint["input"],
grader_input=datapoint["expected"],
)
datapoint_id = dp["id"]
else:
datapoint_id = datapoint["name"]
with caribou.trace_run({
"datapoint_id": datapoint_id,
"phase": "eval",
"env_name": "my-env",
"model_name": "gpt-5",
"grader_name": "my-grader",
}) as trace_id:
caribou.log(Message(role="user", content=datapoint["prompt"]))
completion = await get_completion(datapoint["prompt"])
caribou.log(Message(
role="assistant",
content=completion["content"],
model="gpt-5",
input_tokens=completion["usage"]["input"],
output_tokens=completion["usage"]["output"],
))
score = await grade(completion, datapoint["expected"])
caribou.log(Grade(score=score, grader_type="MyGrader"))
caribou.set_status(caribou.RolloutStatus.COMPLETED)
caribou.flush()
if kestrel.enabled:
kestrel.update_run(run_id, status="completed", completed_at=datetime.now(UTC))
caribou.shutdown()