inspect_ai.analysis
Evals
evals_df
Read a dataframe containing evals.
def evals_df(
= list_eval_logs(),
logs: LogPaths = EvalColumns,
columns: Sequence[Column] bool = True,
strict: bool | None = None,
quiet: -> "pd.DataFrame" | tuple["pd.DataFrame", Sequence[ColumnError]] )
logs
LogPaths-
One or more paths to log files or log directories. Defaults to the contents of the currently active log directory (e.g. ./logs or INSPECT_LOG_DIR).
columns
Sequence[Column]-
Specification for what columns to read from log files.
strict
bool-
Raise import errors immediately. Defaults to
True
. IfFalse
then a tuple ofDataFrame
and errors is returned. quiet
bool | None-
If
True
, do not show any output or progress. Defaults toFalse
for terminal environments, andTrue
for notebooks.
EvalColumn
Column which maps to EvalLog.
class EvalColumn(Column)
EvalColumns
Default columns to import for evals_df().
list[Column] = (
EvalColumns:
EvalInfo+ EvalTask
+ EvalModel
+ EvalDataset
+ EvalConfiguration
+ EvalResults
+ EvalScores
)
EvalInfo
Eval basic information columns.
list[Column] = [
EvalInfo: "run_id", path="eval.run_id", required=True),
EvalColumn("task_id", path="eval.task_id", required=True),
EvalColumn(*EvalLogPath,
"created", path="eval.created", type=datetime, required=True),
EvalColumn("tags", path="eval.tags", default="", value=list_as_str),
EvalColumn("git_origin", path="eval.revision.origin"),
EvalColumn("git_commit", path="eval.revision.commit"),
EvalColumn("packages", path="eval.packages"),
EvalColumn("metadata", path="eval.metadata"),
EvalColumn( ]
EvalTask
Eval task configuration columns.
list[Column] = [
EvalTask: "task_name", path="eval.task", required=True, value=remove_namespace),
EvalColumn("task_display_name", path=eval_log_task_display_name),
EvalColumn("task_version", path="eval.task_version", required=True),
EvalColumn("task_file", path="eval.task_file"),
EvalColumn("task_attribs", path="eval.task_attribs"),
EvalColumn("task_arg_*", path="eval.task_args"),
EvalColumn("solver", path="eval.solver"),
EvalColumn("solver_args", path="eval.solver_args"),
EvalColumn("sandbox_type", path="eval.sandbox.type"),
EvalColumn("sandbox_config", path="eval.sandbox.config"),
EvalColumn( ]
EvalModel
Eval model columns.
list[Column] = [
EvalModel: "model", path="eval.model", required=True),
EvalColumn("model_base_url", path="eval.model_base_url"),
EvalColumn("model_args", path="eval.model_base_url"),
EvalColumn("model_generate_config", path="eval.model_generate_config"),
EvalColumn("model_roles", path="eval.model_roles"),
EvalColumn( ]
EvalConfiguration
Eval configuration columns.
list[Column] = [
EvalConfiguration: "epochs", path="eval.config.epochs"),
EvalColumn("epochs_reducer", path="eval.config.epochs_reducer"),
EvalColumn("approval", path="eval.config.approval"),
EvalColumn("message_limit", path="eval.config.message_limit"),
EvalColumn("token_limit", path="eval.config.token_limit"),
EvalColumn("time_limit", path="eval.config.time_limit"),
EvalColumn("working_limit", path="eval.config.working_limit"),
EvalColumn( ]
EvalResults
Eval results columns.
list[Column] = [
EvalResults: "status", path="status", required=True),
EvalColumn("error_message", path="error.message"),
EvalColumn("error_traceback", path="error.traceback"),
EvalColumn("total_samples", path="results.total_samples"),
EvalColumn("completed_samples", path="results.completed_samples"),
EvalColumn("score_headline_name", path="results.scores[0].scorer"),
EvalColumn("score_headline_metric", path="results.scores[0].metrics.*.name"),
EvalColumn("score_headline_value", path="results.scores[0].metrics.*.value"),
EvalColumn("score_headline_stderr", path=eval_log_headline_stderr),
EvalColumn( ]
EvalScores
Eval scores (one score/metric per-columns).
list[Column] = [
EvalScores: "score_*_*", path=eval_log_scores_dict),
EvalColumn( ]
Samples
samples_df
Read a dataframe containing samples from a set of evals.
def samples_df(
= list_eval_logs(),
logs: LogPaths = SampleSummary,
columns: Sequence[Column] bool = False,
full: bool = True,
strict: bool | int = False,
parallel: bool | None = None,
quiet: -> "pd.DataFrame" | tuple["pd.DataFrame", list[ColumnError]] )
logs
LogPaths-
One or more paths to log files or log directories. Defaults to the contents of the currently active log directory (e.g. ./logs or INSPECT_LOG_DIR).
columns
Sequence[Column]-
Specification for what columns to read from log files.
full
bool-
Read full sample
metadata
. This will be much slower, but will include the unfiltered values of samplemetadata
rather than the abbrevivated metadata from sample summaries (which includes only scalar values and limits string values to 1k). strict
bool-
Raise import errors immediately. Defaults to
True
. IfFalse
then a tuple ofDataFrame
and errors is returned. parallel
bool | int-
If
True
, useProcessPoolExecutor
to read logs in parallel (with workers based onmp.cpu_count()
, capped at 8). Ifint
, read in parallel with the specified number of workers. IfFalse
(the default) do not read in parallel. quiet
bool | None-
If
True
, do not show any output or progress. Defaults toFalse
for terminal environments, andTrue
for notebooks.
SampleColumn
Column which maps to EvalSample or EvalSampleSummary.
class SampleColumn(Column)
SampleSummary
Sample summary columns.
list[Column] = [
SampleSummary: "id", path="id", required=True, type=str),
SampleColumn("epoch", path="epoch", required=True),
SampleColumn("input", path=sample_input_as_str, required=True),
SampleColumn("target", path="target", required=True, value=list_as_str),
SampleColumn("metadata_*", path="metadata"),
SampleColumn("score_*", path="scores", value=score_values),
SampleColumn("model_usage", path="model_usage"),
SampleColumn("total_time", path="total_time"),
SampleColumn("working_time", path="total_time"),
SampleColumn("error", path="error", default=""),
SampleColumn("limit", path="limit"),
SampleColumn("retries", path="retries"),
SampleColumn( ]
SampleMessages
Sample messages as a string.
list[Column] = [
SampleMessages: "messages", path=sample_messages_as_str, required=True, full=True)
SampleColumn( ]
Messages
messages_df
Read a dataframe containing messages from a set of evals.
def messages_df(
= list_eval_logs(),
logs: LogPaths = MessageColumns,
columns: Sequence[Column] filter: MessageFilter | None = None,
bool = True,
strict: bool | int = False,
parallel: bool | None = None,
quiet: -> "pd.DataFrame" | tuple["pd.DataFrame", list[ColumnError]] )
logs
LogPaths-
One or more paths to log files or log directories. Defaults to the contents of the currently active log directory (e.g. ./logs or INSPECT_LOG_DIR).
columns
Sequence[Column]-
Specification for what columns to read from log files.
filter
MessageFilter | None-
Callable that filters messages
strict
bool-
Raise import errors immediately. Defaults to
True
. IfFalse
then a tuple ofDataFrame
and errors is returned. parallel
bool | int-
If
True
, useProcessPoolExecutor
to read logs in parallel (with workers based onmp.cpu_count()
, capped at 8). Ifint
, read in parallel with the specified number of workers. IfFalse
(the default) do not read in parallel. quiet
bool | None-
If
True
, do not show any output or progress. Defaults toFalse
for terminal environments, andTrue
for notebooks.
MessageFilter
Filter for messages_df() rows.
= Callable[[ChatMessage], bool] MessageFilter: TypeAlias
MessageColumn
Column which maps to ChatMessage.
class MessageColumn(Column)
MessageContent
Message content columns.
list[Column] = [
MessageContent: "message_id", path="id"),
MessageColumn("role", path="role", required=True),
MessageColumn("source", path="source"),
MessageColumn("content", path=message_text),
MessageColumn( ]
MessageToolCalls
Message tool call columns.
list[Column] = [
MessageToolCalls: "tool_calls", path=message_tool_calls),
MessageColumn("tool_call_id", path="tool_call_id"),
MessageColumn("tool_call_function", path="function"),
MessageColumn("tool_call_error", path="error.message"),
MessageColumn( ]
MessageColumns
Chat message columns.
list[Column] = MessageContent + MessageToolCalls MessageColumns:
Events
events_df
Read a dataframe containing events from a set of evals.
def events_df(
= list_eval_logs(),
logs: LogPaths = EventInfo,
columns: Sequence[Column] filter: EventFilter | None = None,
bool = True,
strict: bool | int = False,
parallel: bool | None = None,
quiet: -> "pd.DataFrame" | tuple["pd.DataFrame", list[ColumnError]] )
logs
LogPaths-
One or more paths to log files or log directories. Defaults to the contents of the currently active log directory (e.g. ./logs or INSPECT_LOG_DIR).
columns
Sequence[Column]-
Specification for what columns to read from log files.
filter
EventFilter | None-
Callable that filters event types.
strict
bool-
Raise import errors immediately. Defaults to
True
. IfFalse
then a tuple ofDataFrame
and errors is returned. parallel
bool | int-
If
True
, useProcessPoolExecutor
to read logs in parallel (with workers based onmp.cpu_count()
, capped at 8). Ifint
, read in parallel with the specified number of workers. IfFalse
(the default) do not read in parallel. quiet
bool | None-
If
True
, do not show any output or progress. Defaults toFalse
for terminal environments, andTrue
for notebooks.
EventColumn
Column which maps to Event.
class EventColumn(Column)
EventInfo
Event basic information columns.
list[Column] = [
EventInfo: "event_id", path="uuid"),
EventColumn("event", path="event"),
EventColumn("span_id", path="span_id"),
EventColumn( ]
EventTiming
Event timing columns.
list[Column] = [
EventTiming: "timestamp", path="timestamp", type=datetime),
EventColumn("completed", path="completed", type=datetime),
EventColumn("working_start", path="working_start"),
EventColumn("working_time", path="working_time"),
EventColumn( ]
ModelEventColumns
Model event columns.
list[Column] = [
ModelEventColumns: "model_event_model", path="model"),
EventColumn("model_event_role", path="role"),
EventColumn("model_event_input", path=model_event_input_as_str),
EventColumn("model_event_tools", path="tools"),
EventColumn("model_event_tool_choice", path=tool_choice_as_str),
EventColumn("model_event_config", path="config"),
EventColumn("model_event_usage", path="output.usage"),
EventColumn("model_event_time", path="output.time"),
EventColumn("model_event_completion", path=completion_as_str),
EventColumn("model_event_retries", path="retries"),
EventColumn("model_event_error", path="error"),
EventColumn("model_event_cache", path="cache"),
EventColumn("model_event_call", path="call"),
EventColumn( ]
ToolEventColumns
Tool event columns.
list[Column] = [
ToolEventColumns: "tool_event_function", path="function"),
EventColumn("tool_event_arguments", path="arguments"),
EventColumn("tool_event_view", path=tool_view_as_str),
EventColumn("tool_event_result", path="result"),
EventColumn("tool_event_truncated", path="truncated"),
EventColumn("tool_event_error_type", path="error.type"),
EventColumn("tool_event_error_message", path="error.message"),
EventColumn( ]
Prepare
prepare
Prepare a data frame for analysis using one or more transform operations.
def prepare(
"pd.DataFrame", operation: Operation | Sequence[Operation]
df: -> "pd.DataFrame" )
log_viewer
Add a log viewer column to an eval data frame.
Tranform operation to add a log_viewer column to a data frame based on one more more url_mappings
.
URL mappings define the relationship between log file paths (either fileystem or S3) and URLs where logs are published. The URL target should be the location where the output of the inspect view bundle
command was published.
def log_viewer(
"eval", "sample", "event", "message"],
target: Literal[dict[str, str],
url_mappings: str = "log",
log_column: str = "log_viewer",
log_viewer_column: -> Operation )
target
Literal['eval', 'sample', 'event', 'message']-
Target for log viewer (“eval”, “sample”, “event”, or “message”).
url_mappings
dict[str, str]-
Map log file paths (either filesystem or S3) to URLs where logs are published.
log_column
str-
Column in the data frame containing log file path (defaults to “log”).
log_viewer_column
str-
Column to create with log viewer URL (defaults to “log_viewer”)
model_info
Amend data frame with model metadata.
Fields added (when available) include:
model_organization_name
- Displayable model organization (e.g. OpenAI, Anthropic, etc.)
model_display_name
- Displayable model name (e.g. Gemini Flash 2.5)
model_snapshot
- A snapshot (version) string, if available (e.g. “latest” or “20240229”)
model_release_date
- The model’s release date
model_knowledge_cutoff_date
- The model’s knowledge cutoff date
Inspect includes built in support for many models (based upon the model
string in the dataframe). If you are using models for which Inspect does not include model metadata, you may include your own model metadata via the model_info
argument.
def model_info(
str, ModelInfo] | None = None,
model_info: Dict[-> Operation )
model_info
Dict[str, ModelInfo] | None-
Additional model info for models not supported directly by Inspect’s internal database.
task_info
Amend data frame with task display name.
Maps task names to task display names for plotting (e.g. “gpqa_diamond” -> “GPQA Diamond”)
If no mapping is provided for a task then name will come from the display_name
attribute of the Task (or failing that from the registered name of the Task).
def task_info(
dict[str, str],
display_names: str = "task_name",
task_name_column: str = "task_display_name",
task_display_name_column: -> Operation )
display_names
dict[str, str]-
Mapping of task log names (e.g. “gpqa_diamond”) to task display names (e.g. “GPQA Diamond”).
task_name_column
str-
Column to draw the task name from (defaults to “task_name”).
task_display_name_column
str-
Column to populate with the task display name (defaults to “task_display_name”)
frontier
Add a frontier column to an eval data frame.
Tranform operation to add a frontier column to a data frame based using a task, release date, and score.
The frontier column will be True if the model was the top-scoring model on the task among all models available at the moment the model was released; otherwise it will be False.
def frontier(
str = "task_name",
task_column: str = "model_release_date",
date_column: str = "score_headline_value",
score_column: str = "frontier",
frontier_column: -> Operation )
task_column
str-
The column in the data frame containing the task name (defaults to “task_name”).
date_column
str-
The column in the data frame containing the model release date (defaults to “model_release_date”).
score_column
str-
The column in the data frame containing the score (defaults to “score_headline_value”).
frontier_column
str-
The column to create with the frontier value (defaults to “frontier”).
score_to_float
Converts score columns to float values.
For each column specified, this operation will convert the values to floats using the provided value_to_float
function. The column value will be replaced with the float value.
def score_to_float(
str | Sequence[str], *, value_to_float: ValueToFloat = value_to_float()
columns: -> Operation )
columns
str | Sequence[str]-
The name of the score column(s) to convert to float. This can be a single column name or a sequence of column names.
value_to_float
ValueToFloat-
Function to convert values to float. Defaults to the built-in
value_to_float
function.
Operation
Operation to transform a data frame for analysis.
class Operation(Protocol):
def __call__(self, df: "pd.DataFrame") -> "pd.DataFrame"
df
pd.DataFrame-
Input data frame.
ModelInfo
Model information and metadata
class ModelInfo(BaseModel)
Attributes
organization
str | None-
Model organization (e.g. Anthropic, OpenAI).
model
str | None-
Model name (e.g. Gemini 2.5 Flash).
snapshot
str | None-
A snapshot (version) string, if available (e.g. “latest” or “20240229”)..
release_date
date | None-
The mode’s release date.
Columns
Column
Specification for importing a column into a dataframe.
Extract columns from an EvalLog path either using JSONPath expressions or a function that takes EvalLog and returns a value.
By default, columns are not required, pass required=True
to make them required. Non-required columns are extracted as None
, provide a default
to yield an alternate value.
The type
option serves as both a validation check and a directive to attempt to coerce the data into the specified type
. Coercion from str
to other types is done after interpreting the string using YAML (e.g. "true"
-> True
).
The value
function provides an additional hook for transformation of the value read from the log before it is realized as a column (e.g. list to a comma-separated string).
The root
option indicates which root eval log context the columns select from.
class Column(abc.ABC)
Attributes
name
str-
Column name.
path
JSONPath | None-
Path to column in EvalLog
required
bool-
Is the column required? (error is raised if required columns aren’t found).
default
JsonValue | None-
Default value for column when it is read from the log as
None
. type
Type[ColumnType] | None-
Column type (import will attempt to coerce to the specified type).
Methods
- value
-
Convert extracted value into a column value (defaults to identity function).
def value(self, x: JsonValue) -> JsonValue
x
JsonValue-
Value to convert.
ColumnType
Valid types for columns.
Values of list
and dict
are converted into column values as JSON str
.
= int | float | bool | str | date | time | datetime | None ColumnType: TypeAlias
ColumnError
Error which occurred parsing a column.
@dataclass
class ColumnError
Attributes
column
str-
Target column name.
path
str | None-
Path to select column value.
error
Exception-
Underlying error.
log
EvalLog-
Eval log where the error occurred.
Use log.location to determine the path where the log was read from.