inspect_ai.analysis.beta

Note

Analysis functions are currently in beta and are exported from the inspect_ai.analysis.beta module. The beta module will be preserved after final release so that code written against it now will continue to work after the beta.

Evals

evals_df

Read a dataframe containing evals.

def evals_df(
    logs: LogPaths = list_eval_logs(),
    columns: list[Column] = EvalColumns,
    strict: bool = True,
) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]
logs LogPaths

One or more paths to log files or log directories. Defaults to the contents of the currently active log directory (e.g. ./logs or INSPECT_LOG_DIR).

columns list[Column]

Specification for what columns to read from log files.

strict bool

Raise import errors immediately. Defaults to True. If False then a tuple of DataFrame and errors is returned.

EvalColumn

Column which maps to EvalLog.

class EvalColumn(Column)

EvalColumns

Default columns to import for evals_df().

EvalColumns: list[Column] = (
    EvalInfo
    + EvalTask
    + EvalModel
    + EvalDataset
    + EvalConfig
    + EvalResults
    + EvalScores
)

EvalInfo

Eval basic information columns.

EvalInfo: list[Column] = [
    EvalColumn("run_id", path="eval.run_id", required=True),
    EvalColumn("task_id", path="eval.task_id", required=True),
    EvalColumn("log", path=eval_log_location),
    EvalColumn("created", path="eval.created", type=datetime, required=True),
    EvalColumn("tags", path="eval.tags", default="", value=list_as_str),
    EvalColumn("git_origin", path="eval.revision.origin"),
    EvalColumn("git_commit", path="eval.revision.commit"),
    EvalColumn("packages", path="eval.packages"),
    EvalColumn("metadata", path="eval.metadata"),
]

EvalTask

Eval task configuration columns.

EvalTask: list[Column] = [
    EvalColumn("task_name", path="eval.task", required=True),
    EvalColumn("task_version", path="eval.task_version", required=True),
    EvalColumn("task_file", path="eval.task_file"),
    EvalColumn("task_attribs", path="eval.task_attribs"),
    EvalColumn("task_arg_*", path="eval.task_args"),
    EvalColumn("solver", path="eval.solver"),
    EvalColumn("solver_args", path="eval.solver_args"),
    EvalColumn("sandbox_type", path="eval.sandbox.type"),
    EvalColumn("sandbox_config", path="eval.sandbox.config"),
]

EvalModel

Eval model columns.

EvalModel: list[Column] = [
    EvalColumn("model", path="eval.model", required=True),
    EvalColumn("model_base_url", path="eval.model_base_url"),
    EvalColumn("model_args", path="eval.model_args"),
    EvalColumn("model_generate_config", path="eval.model_generate_config"),
    EvalColumn("model_roles", path="eval.model_roles"),
]

EvalConfig

Eval configuration columns.

EvalConfig: list[Column] = [
    EvalColumn("epochs", path="eval.config.epochs"),
    EvalColumn("epochs_reducer", path="eval.config.epochs_reducer"),
    EvalColumn("approval", path="eval.config.approval"),
    EvalColumn("message_limit", path="eval.config.message_limit"),
    EvalColumn("token_limit", path="eval.config.token_limit"),
    EvalColumn("time_limit", path="eval.config.time_limit"),
    EvalColumn("working_limit", path="eval.config.working_limit"),
]

EvalResults

Eval results columns.

EvalResults: list[Column] = [
    EvalColumn("status", path="status", required=True),
    EvalColumn("error_message", path="error.message"),
    EvalColumn("error_traceback", path="error.traceback"),
    EvalColumn("total_samples", path="results.total_samples"),
    EvalColumn("completed_samples", path="results.completed_samples"),
    EvalColumn("score_headline_name", path="results.scores[0].scorer"),
    EvalColumn("score_headline_metric", path="results.scores[0].metrics.*.name"),
    EvalColumn("score_headline_value", path="results.scores[0].metrics.*.value"),
]

EvalScores

Eval scores (one column per score/metric).

EvalScores: list[Column] = [
    EvalColumn("score_*_*", path=eval_log_scores_dict),
]

Samples

samples_df

Read a dataframe containing samples from a set of evals.

def samples_df(
    logs: LogPaths = list_eval_logs(),
    columns: list[Column] = SampleSummary,
    strict: bool = True,
) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]
logs LogPaths

One or more paths to log files or log directories. Defaults to the contents of the currently active log directory (e.g. ./logs or INSPECT_LOG_DIR).

columns list[Column]

Specification for what columns to read from log files.

strict bool

Raise import errors immediately. Defaults to True. If False then a tuple of DataFrame and errors is returned.

SampleColumn

Column which maps to EvalSample or EvalSampleSummary.

class SampleColumn(Column)

SampleSummary

Sample summary columns.

SampleSummary: list[Column] = [
    SampleColumn("id", path="id", required=True, type=str),
    SampleColumn("epoch", path="epoch", required=True),
    SampleColumn("input", path=sample_input_as_str, required=True),
    SampleColumn("target", path="target", required=True, value=list_as_str),
    SampleColumn("metadata_*", path="metadata"),
    SampleColumn("score_*", path="scores", value=score_values),
    SampleColumn("model_usage", path="model_usage"),
    SampleColumn("total_time", path="total_time"),
    SampleColumn("working_time", path="working_time"),
    SampleColumn("error", path="error"),
    SampleColumn("limit", path="limit"),
    SampleColumn("retries", path="retries"),
]

SampleMessages

Sample messages as a string.

SampleMessages: list[Column] = [
    SampleColumn("messages", path=sample_messages_as_str, required=True, full=True)
]

Messages

messages_df

Read a dataframe containing messages from a set of evals.

def messages_df(
    logs: LogPaths = list_eval_logs(),
    columns: list[Column] = MessageColumns,
    filter: MessageFilter | None = None,
    strict: bool = True,
) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]
logs LogPaths

One or more paths to log files or log directories. Defaults to the contents of the currently active log directory (e.g. ./logs or INSPECT_LOG_DIR).

columns list[Column]

Specification for what columns to read from log files.

filter MessageFilter | None

List of message role types to include or callable that performs the filter.

strict bool

Raise import errors immediately. Defaults to True. If False then a tuple of DataFrame and errors is returned.

MessageFilter

Filter for messages_df() rows.

MessageFilter: TypeAlias = (
    list[Literal["system", "user", "assistant", "tool"]] | Callable[[ChatMessage], bool]
)

MessageColumn

Column which maps to ChatMessage.

class MessageColumn(Column)

MessageContent

Message content columns.

MessageContent: list[Column] = [
    MessageColumn("role", path="role", required=True),
    MessageColumn("source", path="source"),
    MessageColumn("content", path=message_text),
]

MessageToolCalls

Message tool call columns.

MessageToolCalls: list[Column] = [
    MessageColumn("tool_calls", path=message_tool_calls),
    MessageColumn("tool_call_id", path="tool_call_id"),
    MessageColumn("tool_call_function", path="function"),
    MessageColumn("tool_call_error", path="error.message"),
]

MessageColumns

Chat message columns.

MessageColumns: list[Column] = MessageContent + MessageToolCalls

Events

events_df

Read a dataframe containing events from a set of evals.

def events_df(
    logs: LogPaths = list_eval_logs(),
    columns: list[Column] = EventInfo,
    filter: EventFilter | None = None,
    strict: bool = True,
) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]
logs LogPaths

One or more paths to log files or log directories. Defaults to the contents of the currently active log directory (e.g. ./logs or INSPECT_LOG_DIR).

columns list[Column]

Specification for what columns to read from log files.

filter EventFilter | None

List of event types to include or callable that performs the filter.

strict bool

Raise import errors immediately. Defaults to True. If False then a tuple of DataFrame and errors is returned.

EventColumn

Column which maps to Event.

class EventColumn(Column)

EventInfo

Event basic information columns.

EventInfo: list[Column] = [
    EventColumn("event", path="event"),
    EventColumn("span_id", path="span_id"),
]

EventTiming

Event timing columns.

EventTiming: list[Column] = [
    EventColumn("timestamp", path="timestamp", type=datetime),
    EventColumn("completed", path="completed", type=datetime),
    EventColumn("working_start", path="working_start"),
    EventColumn("working_time", path="working_time"),
]

ModelEventColumns

Model event columns.

ModelEventColumns: list[Column] = [
    EventColumn("model_event_model", path="model"),
    EventColumn("model_event_role", path="role"),
    EventColumn("model_event_input", path=model_event_input_as_str),
    EventColumn("model_event_tools", path="tools"),
    EventColumn("model_event_tool_choice", path=tool_choice_as_str),
    EventColumn("model_event_config", path="config"),
    EventColumn("model_event_usage", path="output.usage"),
    EventColumn("model_event_time", path="output.time"),
    EventColumn("model_event_completion", path=completion_as_str),
    EventColumn("model_event_retries", path="retries"),
    EventColumn("model_event_error", path="error"),
    EventColumn("model_event_cache", path="cache"),
    EventColumn("model_event_call", path="call"),
]

ToolEventColumns

Tool event columns.

ToolEventColumns: list[Column] = [
    EventColumn("tool_event_function", path="function"),
    EventColumn("tool_event_arguments", path="arguments"),
    EventColumn("tool_event_view", path=tool_view_as_str),
    EventColumn("tool_event_result", path="result"),
    EventColumn("tool_event_truncated", path="truncated"),
    EventColumn("tool_event_error_type", path="error.type"),
    EventColumn("tool_event_error_message", path="error.message"),
]

Columns

Column

Specification for importing a column into a dataframe.

Extract columns from an EvalLog path either using JSONPath expressions or a function that takes EvalLog and returns a value.

By default, columns are not required, pass required=True to make them required. Non-required columns are extracted as None, provide a default to yield an alternate value.

The type option serves as both a validation check and a directive to attempt to coerce the data into the specified type. Coercion from str to other types is done after interpreting the string using YAML (e.g. "true" -> True).

The value function provides an additional hook for transformation of the value read from the log before it is realized as a column (e.g. list to a comma-separated string).

The root option indicates which root eval log context the columns select from.

class Column(abc.ABC)

Attributes

name str

Column name.

path JSONPath | None

Path to column in EvalLog

required bool

Is the column required? (error is raised if required columns aren’t found).

default JsonValue | None

Default value for column when it is read from the log as None.

type Type[ColumnType] | None

Column type (import will attempt to coerce to the specified type).

Methods

value

Convert extracted value into a column value (defaults to identity function).

def value(self, x: JsonValue) -> JsonValue
x JsonValue

Value to convert.

ColumnType

Valid types for columns.

Values of list and dict are converted into column values as JSON str.

ColumnType: TypeAlias = int | float | bool | str | date | time | datetime | None

ColumnError

Error which occurred parsing a column.

@dataclass
class ColumnError

Attributes

column str

Target column name.

path str | None

Path to select column value.

message str

Error message.

ColumnErrors

Dictionary of column errors keyed by log file.

class ColumnErrors(dict[str, list[ColumnError]])