inspect_ai.log
Eval Log Files
list_eval_logs
List all eval logs in a directory.
def list_eval_logs(
    log_dir: str = os.environ.get("INSPECT_LOG_DIR", "./logs"),
    formats: list[Literal["eval", "json"]] | None = None,
    filter: Callable[[EvalLog], bool] | None = None,
    recursive: bool = True,
    descending: bool = True,
    fs_options: dict[str, Any] = {},
) -> list[EvalLogInfo]
- log_dir (str): Log directory (defaults to INSPECT_LOG_DIR)
- formats (list[Literal['eval', 'json']] | None): Formats to list (defaults to listing all formats)
- filter (Callable[[EvalLog], bool] | None): Filter to limit logs returned. Note that the EvalLog instance passed to the filter has only the EvalLog header (i.e. does not have the samples or logging output).
- recursive (bool): List log files recursively (defaults to True).
- descending (bool): List in descending order.
- fs_options (dict[str, Any]): Optional. Additional arguments to pass through to the filesystem provider (e.g. S3FileSystem).
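For example, a minimal sketch of listing logs and filtering to those that errored (the "./logs" directory is illustrative):

from inspect_ai.log import list_eval_logs

# all logs in ./logs, newest first (descending=True is the default)
logs = list_eval_logs("./logs")

# only logs whose header reports an error status
# (the filter receives a header-only EvalLog)
errored = list_eval_logs("./logs", filter=lambda log: log.status == "error")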
write_eval_log
Write an evaluation log.
def write_eval_log(
    log: EvalLog,
    location: str | Path | FileInfo | None = None,
    format: Literal["eval", "json", "auto"] = "auto",
    if_match_etag: str | None = None,
) -> None
- log (EvalLog): Evaluation log to write.
- location (str | Path | FileInfo | None): Location to write log to.
- format (Literal['eval', 'json', 'auto']): Write to format (defaults to 'auto' based on log_file extension)
- if_match_etag (str | None): ETag for conditional write. If provided and writing to S3, will only write if the current ETag matches.
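A minimal sketch of reading a log and writing it back out to a new location (paths illustrative; with format="auto" the output format follows the file extension):

from inspect_ai.log import read_eval_log, write_eval_log

log = read_eval_log("./logs/example.eval")

# the .json extension selects the JSON format under format="auto"
write_eval_log(log, "./logs/example-copy.json")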
write_eval_log_async
Write an evaluation log.
async def write_eval_log_async(
    log: EvalLog,
    location: str | Path | FileInfo | None = None,
    format: Literal["eval", "json", "auto"] = "auto",
    if_match_etag: str | None = None,
) -> None
- log (EvalLog): Evaluation log to write.
- location (str | Path | FileInfo | None): Location to write log to.
- format (Literal['eval', 'json', 'auto']): Write to format (defaults to 'auto' based on log_file extension)
- if_match_etag (str | None): ETag for conditional write. If provided and writing to S3, will only write if the current ETag matches.
read_eval_log
Read an evaluation log.
def read_eval_log(
    log_file: str | Path | EvalLogInfo,
    header_only: bool = False,
    resolve_attachments: bool = False,
    format: Literal["eval", "json", "auto"] = "auto",
) -> EvalLog
- log_file (str | Path | EvalLogInfo): Log file to read.
- header_only (bool): Read only the header (i.e. exclude the "samples" and "logging" fields). Defaults to False.
- resolve_attachments (bool): Resolve attachments (duplicated content blocks) to their full content.
- format (Literal['eval', 'json', 'auto']): Read from format (defaults to 'auto' based on log_file extension)
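For example, reading only the header when you just need status and configuration, and the full log when you need samples (path illustrative):

from inspect_ai.log import read_eval_log

# header_only=True excludes samples, so this stays fast even for large logs
header = read_eval_log("./logs/example.eval", header_only=True)
print(header.status, header.eval.task, header.eval.model)

# read the full log (including samples) when per-sample detail is needed
full = read_eval_log("./logs/example.eval")
print(len(full.samples or []))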
read_eval_log_async
Read an evaluation log.
async def read_eval_log_async(
    log_file: str | Path | EvalLogInfo,
    header_only: bool = False,
    resolve_attachments: bool = False,
    format: Literal["eval", "json", "auto"] = "auto",
) -> EvalLog
- log_file (str | Path | EvalLogInfo): Log file to read.
- header_only (bool): Read only the header (i.e. exclude the "samples" and "logging" fields). Defaults to False.
- resolve_attachments (bool): Resolve attachments (duplicated content blocks) to their full content.
- format (Literal['eval', 'json', 'auto']): Read from format (defaults to 'auto' based on log_file extension)
read_eval_log_sample
Read a sample from an evaluation log.
def read_eval_log_sample(
    log_file: str | Path | EvalLogInfo,
    id: int | str | None = None,
    epoch: int = 1,
    uuid: str | None = None,
    resolve_attachments: bool = False,
    format: Literal["eval", "json", "auto"] = "auto",
) -> EvalSample
- log_file (str | Path | EvalLogInfo): Log file to read.
- id (int | str | None): Sample id to read. Optional, alternatively specify uuid (you must specify id or uuid)
- epoch (int): Epoch for sample id (defaults to 1)
- uuid (str | None): Sample uuid to read. Optional, alternatively specify id and epoch (you must specify either uuid or id)
- resolve_attachments (bool): Resolve attachments (duplicated content blocks) to their full content.
- format (Literal['eval', 'json', 'auto']): Read from format (defaults to 'auto' based on log_file extension)
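A sketch of reading a single sample by id (the path and sample id are illustrative):

from inspect_ai.log import read_eval_log_sample

sample = read_eval_log_sample("./logs/example.eval", id=1, epoch=1)
print(sample.id, sample.scores, sample.total_time)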
read_eval_log_samples
Read all samples from an evaluation log incrementally.
Generator for samples in a log file. Only one sample at a time will be read into memory and yielded to the caller.
def read_eval_log_samples(
    log_file: str | Path | EvalLogInfo,
    all_samples_required: bool = True,
    resolve_attachments: bool = False,
    format: Literal["eval", "json", "auto"] = "auto",
) -> Generator[EvalSample, None, None]
- log_file (str | Path | EvalLogInfo): Log file to read.
- all_samples_required (bool): All samples must be included in the file or an IndexError is thrown.
- resolve_attachments (bool): Resolve attachments (duplicated content blocks) to their full content.
- format (Literal['eval', 'json', 'auto']): Read from format (defaults to 'auto' based on log_file extension)
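A sketch of iterating a large log without reading it all into memory (path illustrative):

from inspect_ai.log import read_eval_log_samples

total = 0.0
for sample in read_eval_log_samples("./logs/example.eval"):
    # only one EvalSample is held in memory at a time
    total += sample.total_time or 0.0
print(f"cumulative sample time: {total:.1f}s")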
read_eval_log_sample_summaries
Read sample summaries from an eval log.
def read_eval_log_sample_summaries(
    log_file: str | Path | EvalLogInfo,
    format: Literal["eval", "json", "auto"] = "auto",
) -> list[EvalSampleSummary]
- log_file (str | Path | EvalLogInfo): Log file to read.
- format (Literal['eval', 'json', 'auto']): Read from format (defaults to 'auto' based on log_file extension)
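For example, using summaries to count errored samples without loading full sample records (path illustrative):

from inspect_ai.log import read_eval_log_sample_summaries

summaries = read_eval_log_sample_summaries("./logs/example.eval")
errored = [s for s in summaries if s.error is not None]
print(f"{len(errored)} of {len(summaries)} samples errored")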
edit_score
Edit a score in-place.
def edit_score(
    log: EvalLog,
    sample_id: int | str,
    score_name: str,
    edit: ScoreEdit,
    recompute_metrics: bool = True,
    epoch: int | None = None,
) -> None
- log (EvalLog): The evaluation log containing the samples and scores
- sample_id (int | str): ID of the sample containing the score to edit
- score_name (str): Name of the score to edit
- edit (ScoreEdit): The edit to apply to the score
- recompute_metrics (bool): Whether to recompute aggregate metrics after editing
- epoch (int | None): Epoch number of the sample to edit (required when there are multiple epochs)
recompute_metrics
Recompute aggregate metrics after score edits.
def recompute_metrics(log: EvalLog) -> None
- log (EvalLog): The evaluation log to recompute metrics for
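A sketch of applying a score correction and persisting it. The ScoreEdit import path and the fields shown (value, explanation) are assumptions, and the ids, score name, and path are illustrative:

from inspect_ai.log import edit_score, read_eval_log, write_eval_log
from inspect_ai.scorer import ScoreEdit  # import path assumed

log = read_eval_log("./logs/example.eval")

# recompute_metrics=True (the default) refreshes aggregate metrics after the edit,
# so a separate recompute_metrics(log) call is not needed here
edit_score(
    log,
    sample_id=1,
    score_name="match",
    edit=ScoreEdit(value=1, explanation="corrected after manual review"),  # fields assumed
)

write_eval_log(log, "./logs/example.eval")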
convert_eval_logs
Convert between log file formats.
Convert log file(s) to a target format. If a file is already in the target format it will just be copied to the output dir.
def convert_eval_logs(
    path: str,
    to: Literal["eval", "json"],
    output_dir: str,
    overwrite: bool = False,
    resolve_attachments: bool = False,
    stream: int | bool = False,
) -> None
- path (str): Path to source log file(s). Should be either a single log file or a directory containing log files.
- to (Literal['eval', 'json']): Format to convert to. If a file is already in the target format it will just be copied to the output dir.
- output_dir (str): Output directory to write converted log file(s) to.
- overwrite (bool): Overwrite existing log files (defaults to False, raising an error if the output file path already exists).
- resolve_attachments (bool): Resolve attachments (duplicated content blocks) to their full content.
- stream (int | bool): Stream samples through the conversion process instead of reading the entire log into memory. Useful for large logs.
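For example, converting a directory of JSON logs to the compact eval format while streaming samples (paths illustrative):

from inspect_ai.log import convert_eval_logs

convert_eval_logs("./logs", to="eval", output_dir="./logs-eval", stream=True)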
bundle_log_dir
Bundle a log_dir into a statically deployable viewer
def bundle_log_dir(
    log_dir: str | None = None,
    output_dir: str | None = None,
    overwrite: bool = False,
    fs_options: dict[str, Any] = {},
) -> None
- log_dir (str | None): The log_dir to bundle
- output_dir (str | None): The directory to place bundled output. If no directory is specified, the env variable INSPECT_VIEW_BUNDLE_OUTPUT_DIR will be used.
- overwrite (bool): Optional. Whether to overwrite files in the output directory. Defaults to False.
- fs_options (dict[str, Any]): Optional. Additional arguments to pass through to the filesystem provider (e.g. S3FileSystem).
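A sketch of bundling a log directory into a static viewer that can be served from any web host (paths illustrative):

from inspect_ai.log import bundle_log_dir

bundle_log_dir(log_dir="./logs", output_dir="./logs-www", overwrite=True)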
write_log_dir_manifest
Write a manifest for a log directory.
A log directory manifest is a dictionary of EvalLog headers (EvalLog w/o samples) keyed by log file names (names are relative to the log directory)
def write_log_dir_manifest(
    log_dir: str,
    *,
    filename: str = "logs.json",
    output_dir: str | None = None,
    fs_options: dict[str, Any] = {},
) -> None
- log_dir (str): Log directory to write manifest for.
- filename (str): Manifest filename (defaults to "logs.json")
- output_dir (str | None): Output directory for manifest (defaults to log_dir)
- fs_options (dict[str, Any]): Optional. Additional arguments to pass through to the filesystem provider (e.g. S3FileSystem).
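For example, writing the manifest next to the logs with the default filename (path illustrative):

from inspect_ai.log import write_log_dir_manifest

write_log_dir_manifest("./logs", filename="logs.json")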
retryable_eval_logs
Extract the list of retryable logs from a list of logs.
Retryable logs are logs with status “error” or “cancelled” that do not have a corresponding log with status “success” (indicating they were subsequently retried and completed)
def retryable_eval_logs(logs: list[EvalLogInfo]) -> list[EvalLogInfo]
- logs (list[EvalLogInfo]): List of logs to examine.
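A sketch of finding logs that still need to be retried (log directory illustrative):

from inspect_ai.log import list_eval_logs, retryable_eval_logs

# logs with status "error" or "cancelled" and no subsequent "success" log
to_retry = retryable_eval_logs(list_eval_logs("./logs"))
for info in to_retry:
    print(info.name, info.task)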
EvalLogInfo
File info and task identifiers for eval log.
class EvalLogInfo(BaseModel)
Attributes
- name (str): Name of file.
- type (str): Type of file (file or directory)
- size (int): File size in bytes.
- mtime (float | None): File modification time (None if the file is a directory on S3).
- task (str): Task name.
- task_id (str): Task id.
- suffix (str | None): Log file suffix (e.g. "-scored")
Eval Log API
EvalLog
Evaluation log.
class EvalLog(BaseModel)
Attributes
- version (int): Eval log file format version.
- status (Literal['started', 'success', 'cancelled', 'error']): Status of evaluation (did it succeed or fail).
- eval (EvalSpec): Eval identity and configuration.
- plan (EvalPlan): Eval plan (solvers and config)
- results (EvalResults | None): Eval results (scores and metrics).
- stats (EvalStats): Eval stats (runtime, model usage)
- error (EvalError | None): Error that halted eval (if status=="error")
- samples (list[EvalSample] | None): Samples processed by eval.
- reductions (list[EvalSampleReductions] | None): Reduced sample values
- location (str): Location that the log file was read from.
- etag (str | None): ETag from S3 for conditional writes.
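A sketch of navigating an EvalLog after reading it (path illustrative):

from inspect_ai.log import read_eval_log

log = read_eval_log("./logs/example.eval")
if log.status == "success" and log.results is not None:
    for score in log.results.scores:
        for name, metric in score.metrics.items():
            print(f"{score.name}/{name}: {metric.value}")
elif log.error is not None:
    print(f"eval failed: {log.error.message}")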
EvalSpec
Eval target and configuration.
class EvalSpec(BaseModel)
Attributes
- eval_set_id (str | None): Globally unique id for eval set (if any).
- eval_id (str): Globally unique id for eval.
- run_id (str): Unique run id
- created (str): Time created.
- task (str): Task name.
- task_id (str): Unique task id.
- task_version (int | str): Task version.
- task_file (str | None): Task source file.
- task_display_name (str | None): Task display name.
- task_registry_name (str | None): Task registry name.
- task_attribs (dict[str, Any]): Attributes of the @task decorator.
- task_args (dict[str, Any]): Arguments used for invoking the task (including defaults).
- task_args_passed (dict[str, Any]): Arguments explicitly passed by caller for invoking the task.
- solver (str | None): Solver name.
- solver_args (dict[str, Any] | None): Arguments used for invoking the solver.
- tags (list[str] | None): Tags associated with evaluation run.
- dataset (EvalDataset): Dataset used for eval.
- sandbox (SandboxEnvironmentSpec | None): Sandbox environment type and optional config file.
- model (str): Model used for eval.
- model_generate_config (GenerateConfig): Generate config specified for model instance.
- model_base_url (str | None): Optional override of model base url
- model_args (dict[str, Any]): Model specific arguments.
- model_roles (dict[str, ModelConfig] | None): Model roles.
- config (EvalConfig): Configuration values for eval.
- revision (EvalRevision | None): Source revision of eval.
- packages (dict[str, str]): Package versions for eval.
- metadata (dict[str, Any] | None): Additional eval metadata.
- scorers (list[EvalScorer] | None): Scorers and args for this eval
- metrics (list[EvalMetricDefinition | dict[str, list[EvalMetricDefinition]]] | dict[str, list[EvalMetricDefinition]] | None): Metrics and args for this eval
EvalDataset
Dataset used for evaluation.
class EvalDataset(BaseModel)
Attributes
- name (str | None): Dataset name.
- location (str | None): Dataset location (file path or remote URL)
- samples (int | None): Number of samples in the dataset.
- sample_ids (list[str] | list[int] | list[str | int] | None): IDs of samples in the dataset.
- shuffled (bool | None): Was the dataset shuffled after reading.
EvalConfig
Configuration used for evaluation.
class EvalConfig(BaseModel)
Attributes
- limit (int | tuple[int, int] | None): Sample limit (number of samples or range of samples).
- sample_id (str | int | list[str] | list[int] | list[str | int] | None): Evaluate specific sample(s).
- sample_shuffle (bool | int | None): Shuffle order of samples.
- epochs (int | None): Number of epochs to run samples over.
- epochs_reducer (list[str] | None): Reducers for aggregating per-sample scores.
- approval (ApprovalPolicyConfig | None): Approval policy for tool use.
- fail_on_error (bool | float | None): Fail eval when sample errors occur. True to fail on first sample error (default); False to never fail on sample errors; a value between 0 and 1 to fail if a proportion of total samples fails; a value greater than 1 to fail eval if a count of samples fails.
- continue_on_fail (bool | None): Continue eval even if the fail_on_error condition is met. True to continue running and only fail at the end if the fail_on_error condition is met; False to fail eval immediately when the fail_on_error condition is met (default).
- retry_on_error (int | None): Number of times to retry samples if they encounter errors.
- message_limit (int | None): Maximum messages to allow per sample.
- token_limit (int | None): Maximum token usage per sample.
- time_limit (int | None): Maximum clock time per sample.
- working_limit (int | None): Maximum working time per sample.
- max_samples (int | None): Maximum number of samples to run in parallel.
- max_tasks (int | None): Maximum number of tasks to run in parallel.
- max_subprocesses (int | None): Maximum number of subprocesses to run concurrently.
- max_sandboxes (int | None): Maximum number of sandboxes to run concurrently.
- sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes.
- log_samples (bool | None): Log detailed information on each sample.
- log_realtime (bool | None): Log events in realtime (enables live viewing of samples in inspect view).
- log_images (bool | None): Log base64 encoded versions of images.
- log_buffer (int | None): Number of samples to buffer before writing log file.
- log_shared (int | None): Interval (in seconds) for syncing sample events to log directory.
- score_display (bool | None): Display scoring metrics realtime.
EvalRevision
Git revision for evaluation.
class EvalRevision(BaseModel)
Attributes
- type (Literal['git']): Type of revision (currently only "git")
- origin (str): Revision origin server
- commit (str): Revision commit.
- dirty (bool | None): Working tree has uncommitted changes or untracked files.
EvalPlan
Plan (solvers) used in evaluation.
class EvalPlan(BaseModel)
Attributes
- name (str): Plan name.
- steps (list[EvalPlanStep]): Steps in plan.
- finish (EvalPlanStep | None): Step to always run at the end.
- config (GenerateConfig): Generation config.
EvalPlanStep
Solver step.
class EvalPlanStep(BaseModel)
Attributes
- solver (str): Name of solver.
- params (dict[str, Any]): Parameters used to instantiate solver.
EvalResults
Scoring results from evaluation.
class EvalResults(BaseModel)
Attributes
- total_samples (int): Total samples in eval (dataset samples * epochs)
- completed_samples (int): Samples completed without error. Will be equal to total_samples except when --fail-on-error is enabled.
- scores (list[EvalScore]): Scorers used to compute results
- metadata (dict[str, Any] | None): Additional results metadata.
- sample_reductions (list[EvalSampleReductions] | None): List of per sample scores reduced across epochs
EvalScore
Score for evaluation task.
class EvalScore(BaseModel)
Attributes
- name (str): Score name.
- scorer (str): Scorer name.
- reducer (str | None): Reducer name.
- scored_samples (int | None): Number of samples scored by this scorer.
- unscored_samples (int | None): Number of samples not scored by this scorer.
- params (dict[str, Any]): Parameters specified when creating scorer.
- metrics (dict[str, EvalMetric]): Metrics computed for this scorer.
- metadata (dict[str, Any] | None): Additional scorer metadata.
EvalMetric
Metric for evaluation score.
class EvalMetric(BaseModel)
Attributes
- name (str): Metric name.
- value (int | float): Metric value.
- params (dict[str, Any]): Params specified when creating metric.
- metadata (dict[str, Any] | None): Additional metadata associated with metric.
EvalSampleReductions
Score reductions.
class EvalSampleReductions(BaseModel)
Attributes
- scorer (str): Name of the scorer
- reducer (str | None): Name of the reducer
- samples (list[EvalSampleScore]): List of reduced scores
EvalStats
Timing and usage statistics.
class EvalStats(BaseModel)
Attributes
- started_at (str): Evaluation start time.
- completed_at (str): Evaluation completion time.
- model_usage (dict[str, ModelUsage]): Model token usage for evaluation.
EvalError
Eval error details.
class EvalError(BaseModel)
Attributes
- message (str): Error message.
- traceback (str): Error traceback.
- traceback_ansi (str): Error traceback with ANSI color codes.
EvalSample
Sample from evaluation task.
class EvalSample(BaseModel)
Attributes
- id (int | str): Unique id for sample.
- epoch (int): Epoch number for sample.
- input (str | list[ChatMessage]): Sample input.
- choices (list[str] | None): Sample choices.
- target (str | list[str]): Sample target value(s)
- sandbox (SandboxEnvironmentSpec | None): Sandbox environment type and optional config file.
- files (list[str] | None): Files that go along with the sample (copied to SandboxEnvironment)
- setup (str | None): Setup script to run for sample (run within default SandboxEnvironment).
- messages (list[ChatMessage]): Chat conversation history for sample.
- output (ModelOutput): Model output from sample.
- scores (dict[str, Score] | None): Scores for sample.
- metadata (dict[str, Any]): Additional sample metadata.
- store (dict[str, Any]): State at end of sample execution.
- events (list[Event]): Events that occurred during sample execution.
- model_usage (dict[str, ModelUsage]): Model token usage for sample.
- total_time (float | None): Total time that the sample was running.
- working_time (float | None): Time spent working (model generation, sandbox calls, etc.)
- uuid (str | None): Globally unique identifier for sample run (exists for samples created in Inspect >= 0.3.70)
- error (EvalError | None): Error that halted sample.
- error_retries (list[EvalError] | None): Errors that were retried for this sample.
- attachments (dict[str, str]): Attachments referenced from messages and events. Resolve attachments for a sample (replacing attachment://* references with attachment content) by passing resolve_attachments=True to log reading functions.
- limit (EvalSampleLimit | None): The limit that halted the sample
Methods
- metadata_as
Pydantic model interface to metadata.
def metadata_as(self, metadata_cls: Type[MT]) -> MT
  - metadata_cls (Type[MT]): Pydantic model type
- store_as
Pydantic model interface to the store.
def store_as(self, model_cls: Type[SMT], instance: str | None = None) -> SMT
  - model_cls (Type[SMT]): Pydantic model type (must derive from StoreModel)
  - instance (str | None): Optional instance name for store (enables multiple instances of a given StoreModel type within a single sample)
- summary
Summary of sample. The summary excludes potentially large fields like messages, output, events, store, and metadata so that it is always fast to load. If there are images, audio, or video in the input, they are replaced with a placeholder.
def summary(self) -> EvalSampleSummary
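A sketch of typed access to sample metadata and store. The TaskMetadata and AgentState models are hypothetical schemas defined here for illustration, and the StoreModel import path is assumed:

from pydantic import BaseModel

from inspect_ai.log import read_eval_log
from inspect_ai.util import StoreModel  # import path assumed

class TaskMetadata(BaseModel):
    # hypothetical schema for the sample's metadata dict
    difficulty: str = "unknown"

class AgentState(StoreModel):
    # hypothetical store schema (StoreModel-derived, as store_as requires)
    steps: int = 0

log = read_eval_log("./logs/example.eval")
for sample in log.samples or []:
    meta = sample.metadata_as(TaskMetadata)
    state = sample.store_as(AgentState)
    print(sample.id, meta.difficulty, state.steps)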
EvalSampleSummary
Summary information (including scoring) for a sample.
class EvalSampleSummary(BaseModel)
Attributes
- id (int | str): Unique id for sample.
- epoch (int): Epoch number for sample.
- input (str | list[ChatMessage]): Sample input (text inputs only).
- target (str | list[str]): Sample target value(s)
- metadata (dict[str, Any]): Sample metadata (only fields < 1k; strings truncated to 1k).
- scores (dict[str, Score] | None): Scores for sample (only metadata fields < 1k; strings truncated to 1k).
- model_usage (dict[str, ModelUsage]): Model token usage for sample.
- total_time (float | None): Total time that the sample was running.
- working_time (float | None): Time spent working (model generation, sandbox calls, etc.)
- uuid (str | None): Globally unique identifier for sample run (exists for samples created in Inspect >= 0.3.70)
- error (str | None): Error that halted sample.
- limit (str | None): Limit that halted the sample
- retries (int | None): Number of retries for the sample.
- completed (bool): Is the sample complete.
- message_count (int | None): Number of messages in the sample conversation.
EvalSampleLimit
Limit encountered by sample.
class EvalSampleLimit(BaseModel)
Attributes
- type (Literal['context', 'time', 'working', 'message', 'token', 'operator', 'custom']): The type of limit
- limit (float): The limit value
EvalSampleScore
Score and sample_id scored.
class EvalSampleScore(Score)
Attributes
- sample_id (str | int | None): Sample ID.
WriteConflictError
Exception raised when a conditional write fails due to concurrent modification.
This error occurs when attempting to write to a log file that has been modified by another process since it was last read, indicating a race condition between concurrent evaluation runs.
class WriteConflictError(Exception)
Transcript API
transcript
Get the current Transcript.
def transcript() -> Transcript
Transcript
Transcript of events.
class Transcript
Methods
- info
Add an InfoEvent to the transcript.
def info(self, data: JsonValue, *, source: str | None = None) -> None
  - data (JsonValue): Data associated with the event.
  - source (str | None): Optional event source.
- step
Context manager for recording StepEvent. The step() context manager is deprecated and will be removed in a future version. Please use the span() context manager instead.
@contextlib.contextmanager
def step(self, name: str, type: str | None = None) -> Iterator[None]
  - name (str): Step name.
  - type (str | None): Optional step type.
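A minimal sketch of recording custom information on the transcript from inside a solver (the solver itself is illustrative):

from inspect_ai.log import transcript
from inspect_ai.solver import Generate, TaskState, solver

@solver
def noted_solver():
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # recorded as an InfoEvent in the sample's transcript
        transcript().info({"note": "about to generate"}, source="noted_solver")
        return await generate(state)
    return solve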