Source code for braid.metrics

"""Performance metrics module for BRAID-DSPy.

This module implements the Performance-per-Dollar (PPD) metrics from the
BRAID paper, allowing users to measure and compare the cost-effectiveness
of different model configurations.
"""

import time
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
from datetime import datetime


@dataclass
class TokenUsage:
    """Token usage for a single operation."""

    prompt_tokens: int
    completion_tokens: int
    total_tokens: int = 0

    def __post_init__(self):
        if self.total_tokens == 0:
            self.total_tokens = self.prompt_tokens + self.completion_tokens


@dataclass
class StepMetrics:
    """Metrics for a single reasoning step."""

    step_id: str
    phase: str  # "planning" or "execution"
    token_usage: TokenUsage
    cost_usd: float
    latency_ms: float


@dataclass
class CostAnalysis:
    """Complete cost analysis for an execution."""

    total_cost_usd: float
    planning_cost_usd: float
    execution_cost_usd: float
    total_tokens: int
    prompt_tokens: int
    completion_tokens: int
    step_costs: List[StepMetrics] = field(default_factory=list)


@dataclass
class PPDReport:
    """Performance-per-Dollar report."""

    accuracy: float  # 0.0 to 1.0
    total_cost_usd: float
    ppd_score: float  # Performance per Dollar
    efficiency_multiplier: float  # vs baseline
    baseline_model: Optional[str] = None
    baseline_cost_usd: Optional[float] = None
    breakdown: Optional[Dict[str, Any]] = None


@dataclass
class ModelConfig:
    """Configuration for a model."""

    model_id: str
    input_cost_per_1m: float  # USD per 1M input tokens
    output_cost_per_1m: float  # USD per 1M output tokens
    provider: str = "openai"


[docs] class PPDAnalyzer: """ Performance-per-Dollar analyzer for BRAID executions. This class tracks token usage and costs across the planning and execution phases, and provides metrics for comparing with baseline models. Example: >>> analyzer = PPDAnalyzer( ... architect_model="gpt-4", ... solver_model="gpt-3.5-turbo" ... ) >>> analyzer.track_usage(TokenUsage(100, 50), "planning") >>> report = analyzer.generate_report(accuracy=0.95) >>> print(f"PPD Score: {report.ppd_score}") """ # Model pricing (USD per 1M tokens) - Updated December 2025 MODEL_CONFIGS: Dict[str, ModelConfig] = { # OpenAI Models "gpt-4": ModelConfig("gpt-4", 30.0, 60.0, "openai"), "gpt-4-turbo": ModelConfig("gpt-4-turbo", 10.0, 30.0, "openai"), "gpt-4-turbo-preview": ModelConfig("gpt-4-turbo-preview", 10.0, 30.0, "openai"), "gpt-4o": ModelConfig("gpt-4o", 2.50, 10.0, "openai"), "gpt-4o-mini": ModelConfig("gpt-4o-mini", 0.15, 0.60, "openai"), "gpt-3.5-turbo": ModelConfig("gpt-3.5-turbo", 0.50, 1.50, "openai"), "o1-preview": ModelConfig("o1-preview", 15.0, 60.0, "openai"), "o1-mini": ModelConfig("o1-mini", 0.15, 0.60, "openai"), "o1": ModelConfig("o1", 15.0, 60.0, "openai"), "o3": ModelConfig("o3", 2.0, 8.0, "openai"), "o3-mini": ModelConfig("o3-mini", 1.1, 4.4, "openai"), # Anthropic Models "claude-3-opus": ModelConfig("claude-3-opus", 15.0, 75.0, "anthropic"), "claude-3-sonnet": ModelConfig("claude-3-sonnet", 3.0, 15.0, "anthropic"), "claude-3-haiku": ModelConfig("claude-3-haiku", 0.25, 1.25, "anthropic"), "claude-3.5-sonnet": ModelConfig("claude-3.5-sonnet", 3.0, 15.0, "anthropic"), "claude-3.5-haiku": ModelConfig("claude-3.5-haiku", 0.80, 4.0, "anthropic"), "claude-3.7-sonnet": ModelConfig("claude-3.7-sonnet", 3.0, 15.0, "anthropic"), "claude-4.5-sonnet": ModelConfig("claude-4.5-sonnet", 3.0, 15.0, "anthropic"), "claude-4.5-opus": ModelConfig("claude-4.5-opus", 5.0, 25.0, "anthropic"), "claude-4.5-haiku": ModelConfig("claude-4.5-haiku", 1.0, 5.0, "anthropic"), # Google Models "gemini-1.5-pro": ModelConfig("gemini-1.5-pro", 1.25, 5.00, "google"), "gemini-1.5-flash": ModelConfig("gemini-1.5-flash", 0.075, 0.30, "google"), "gemini-2.0-flash": ModelConfig("gemini-2.0-flash", 0.10, 0.40, "google"), "gemini-2.0-flash-lite": ModelConfig("gemini-2.0-flash-lite", 0.075, 0.30, "google"), "gemini-2.5-pro": ModelConfig("gemini-2.5-pro", 1.25, 10.00, "google"), "gemini-2.5-flash": ModelConfig("gemini-2.5-flash", 0.30, 2.50, "google"), "gemini-3.0-pro": ModelConfig("gemini-3.0-pro", 2.00, 12.00, "google"), "gemini-3.0-flash": ModelConfig("gemini-3.0-flash", 0.50, 3.00, "google"), "gemini-2.0-pro-exp": ModelConfig("gemini-2.0-pro-exp", 0.00, 0.00, "google"), # Local/Open Models (estimated inference costs via providers like Together/Groq) "llama-3.3-70b": ModelConfig("llama-3.3-70b", 0.10, 0.40, "local"), "llama-4-scout": ModelConfig("llama-4-scout", 0.10, 0.34, "local"), "llama-4-maverick": ModelConfig("llama-4-maverick", 0.22, 0.85, "local"), "llama-4-behemoth": ModelConfig("llama-4-behemoth", 3.50, 3.50, "local"), "deepseek-v3": ModelConfig("deepseek-v3", 0.28, 0.42, "local"), "deepseek-r1": ModelConfig("deepseek-r1", 0.55, 2.19, "local"), }
[docs] def __init__( self, architect_model: str = "gpt-4", solver_model: str = "gpt-3.5-turbo", custom_configs: Optional[Dict[str, ModelConfig]] = None, ): """ Initialize the PPD Analyzer. Args: architect_model: Model used for GRD planning phase solver_model: Model used for GRD execution phase custom_configs: Optional custom model configurations """ self.architect_model = architect_model self.solver_model = solver_model # Merge custom configs with defaults self.model_configs = dict(self.MODEL_CONFIGS) if custom_configs: self.model_configs.update(custom_configs) # Usage tracking self.usage_log: List[StepMetrics] = [] self._session_start = datetime.now()
[docs] def get_model_config(self, model_id: str) -> ModelConfig: """Get configuration for a model.""" if model_id in self.model_configs: return self.model_configs[model_id] # Default config for unknown models return ModelConfig(model_id, 1.0, 2.0, "unknown")
[docs] def calculate_cost( self, usage: TokenUsage, model_id: str, ) -> float: """ Calculate cost for given token usage. Args: usage: Token usage to calculate cost for model_id: Model ID to use for pricing Returns: Cost in USD """ config = self.get_model_config(model_id) input_cost = (usage.prompt_tokens / 1_000_000) * config.input_cost_per_1m output_cost = (usage.completion_tokens / 1_000_000) * config.output_cost_per_1m return input_cost + output_cost
[docs] def track_usage( self, usage: TokenUsage, phase: str, step_id: Optional[str] = None, latency_ms: float = 0.0, ) -> StepMetrics: """ Track token usage for a step. Args: usage: Token usage for this step phase: "planning" or "execution" step_id: Optional step identifier latency_ms: Latency in milliseconds Returns: StepMetrics for this step """ model_id = self.architect_model if phase == "planning" else self.solver_model cost = self.calculate_cost(usage, model_id) metrics = StepMetrics( step_id=step_id or f"step_{len(self.usage_log) + 1}", phase=phase, token_usage=usage, cost_usd=cost, latency_ms=latency_ms, ) self.usage_log.append(metrics) return metrics
[docs] def get_cost_analysis(self) -> CostAnalysis: """ Get complete cost analysis for all tracked usage. Returns: CostAnalysis with complete breakdown """ total_cost = sum(m.cost_usd for m in self.usage_log) planning_cost = sum(m.cost_usd for m in self.usage_log if m.phase == "planning") execution_cost = sum(m.cost_usd for m in self.usage_log if m.phase == "execution") total_tokens = sum(m.token_usage.total_tokens for m in self.usage_log) prompt_tokens = sum(m.token_usage.prompt_tokens for m in self.usage_log) completion_tokens = sum(m.token_usage.completion_tokens for m in self.usage_log) return CostAnalysis( total_cost_usd=total_cost, planning_cost_usd=planning_cost, execution_cost_usd=execution_cost, total_tokens=total_tokens, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, step_costs=list(self.usage_log), )
[docs] def estimate_baseline_cost( self, baseline_model: str, problem_complexity_tokens: int = 500, response_tokens: int = 200, ) -> float: """ Estimate cost for solving with a single baseline model. This estimates what it would cost to solve the problem using a single model without BRAID's split architecture. Args: baseline_model: Model to use as baseline problem_complexity_tokens: Estimated input tokens response_tokens: Estimated response tokens Returns: Estimated cost in USD """ usage = TokenUsage( prompt_tokens=problem_complexity_tokens, completion_tokens=response_tokens, ) return self.calculate_cost(usage, baseline_model)
[docs] def calculate_ppd_score( self, accuracy: float, total_cost: Optional[float] = None, ) -> float: """ Calculate Performance-per-Dollar score. PPD = Accuracy / Cost Higher is better. A score of 100 means 100% accuracy at $0.01 cost. Args: accuracy: Accuracy between 0.0 and 1.0 total_cost: Optional override for total cost Returns: PPD score """ if total_cost is None: analysis = self.get_cost_analysis() total_cost = analysis.total_cost_usd if total_cost <= 0: return float("inf") if accuracy > 0 else 0.0 # Scale PPD for readability (per $0.01) return (accuracy / total_cost) * 0.01
[docs] def compare_with_baseline( self, accuracy: float, baseline_model: str, baseline_accuracy: Optional[float] = None, ) -> PPDReport: """ Compare BRAID execution with a baseline model. Args: accuracy: BRAID accuracy baseline_model: Model to compare against baseline_accuracy: Baseline model accuracy (if known) Returns: PPDReport with comparison metrics """ analysis = self.get_cost_analysis() braid_cost = analysis.total_cost_usd # Estimate baseline cost avg_tokens = analysis.total_tokens / max(len(self.usage_log), 1) baseline_cost = self.estimate_baseline_cost( baseline_model, problem_complexity_tokens=int(avg_tokens * 0.7), response_tokens=int(avg_tokens * 0.3), ) # Calculate PPD scores braid_ppd = self.calculate_ppd_score(accuracy, braid_cost) if baseline_accuracy is not None and baseline_cost > 0: baseline_ppd = (baseline_accuracy / baseline_cost) * 0.01 efficiency_multiplier = braid_ppd / baseline_ppd if baseline_ppd > 0 else float("inf") else: efficiency_multiplier = 1.0 return PPDReport( accuracy=accuracy, total_cost_usd=braid_cost, ppd_score=braid_ppd, efficiency_multiplier=efficiency_multiplier, baseline_model=baseline_model, baseline_cost_usd=baseline_cost, breakdown={ "architect_model": self.architect_model, "solver_model": self.solver_model, "planning_cost": analysis.planning_cost_usd, "execution_cost": analysis.execution_cost_usd, "total_tokens": analysis.total_tokens, "num_steps": len(self.usage_log), }, )
[docs] def generate_report( self, accuracy: float, baseline_model: Optional[str] = None, format: str = "markdown", ) -> str: """ Generate a human-readable performance report. Args: accuracy: Achieved accuracy baseline_model: Optional model for comparison format: Output format ("markdown" or "text") Returns: Formatted report string """ analysis = self.get_cost_analysis() if format == "markdown": return self._format_markdown_report(analysis, accuracy, baseline_model) else: return self._format_text_report(analysis, accuracy, baseline_model)
def _format_markdown_report( self, analysis: CostAnalysis, accuracy: float, baseline_model: Optional[str], ) -> str: """Format report as Markdown.""" lines = [ "# BRAID Performance Report", "", "## Configuration", f"- **Architect Model:** {self.architect_model}", f"- **Solver Model:** {self.solver_model}", "", "## Cost Breakdown", f"| Phase | Cost (USD) | Tokens |", f"|-------|------------|--------|", f"| Planning | ${analysis.planning_cost_usd:.6f} | {sum(m.token_usage.total_tokens for m in self.usage_log if m.phase == 'planning')} |", f"| Execution | ${analysis.execution_cost_usd:.6f} | {sum(m.token_usage.total_tokens for m in self.usage_log if m.phase == 'execution')} |", f"| **Total** | **${analysis.total_cost_usd:.6f}** | **{analysis.total_tokens}** |", "", "## Performance Metrics", f"- **Accuracy:** {accuracy:.1%}", f"- **PPD Score:** {self.calculate_ppd_score(accuracy):.2f}", ] if baseline_model: report = self.compare_with_baseline(accuracy, baseline_model) lines.extend( [ "", "## Baseline Comparison", f"- **Baseline Model:** {baseline_model}", f"- **Estimated Baseline Cost:** ${report.baseline_cost_usd:.6f}", f"- **Efficiency Multiplier:** {report.efficiency_multiplier:.2f}x", ] ) return "\n".join(lines) def _format_text_report( self, analysis: CostAnalysis, accuracy: float, baseline_model: Optional[str], ) -> str: """Format report as plain text.""" lines = [ "BRAID Performance Report", "=" * 40, f"Architect: {self.architect_model}", f"Solver: {self.solver_model}", "", "Cost Breakdown:", f" Planning: ${analysis.planning_cost_usd:.6f}", f" Execution: ${analysis.execution_cost_usd:.6f}", f" Total: ${analysis.total_cost_usd:.6f}", "", f"Total Tokens: {analysis.total_tokens}", f"Accuracy: {accuracy:.1%}", f"PPD Score: {self.calculate_ppd_score(accuracy):.2f}", ] if baseline_model: report = self.compare_with_baseline(accuracy, baseline_model) lines.extend( [ "", f"vs {baseline_model}:", f" Efficiency: {report.efficiency_multiplier:.2f}x", ] ) return "\n".join(lines)
[docs] def reset(self) -> None: """Reset all tracking data.""" self.usage_log = [] self._session_start = datetime.now()
class LatencyTracker: """Context manager for tracking operation latency.""" def __init__(self): self.start_time: float = 0.0 self.end_time: float = 0.0 def __enter__(self) -> "LatencyTracker": self.start_time = time.perf_counter() return self def __exit__(self, *args) -> None: self.end_time = time.perf_counter() @property def elapsed_ms(self) -> float: """Get elapsed time in milliseconds.""" return (self.end_time - self.start_time) * 1000