{
  "study_slug": "travel-ai-benchmark-v2-dach-150-2026-02-14",
  "run_id": "goldenset-150-2026-02-15_18-48-02-200",
  "generated_at": "2026-02-15T18:50:45.554Z",
  "git_commit": "0d5990b",
  "sample_size": 150,
  "publication_tier": "validated",
  "scoring": {
    "scale": "0-100 weighted",
    "weights": {
      "constraint_match": 0.35,
      "actionability": 0.3,
      "clarification_quality": 0.2,
      "latency_score": 0.15
    }
  },
  "stress_suite_in_main_score": false,
  "comparators": [
    {
      "comparator": "chatgpt",
      "model_name": "openai/gpt-4o-mini",
      "model_snapshot_or_version": "openrouter:openai/gpt-4o-mini",
      "runtime_setup": "openrouter_api_chat_completions_default",
      "test_window": {
        "start": "2026-02-14T01:26:52.877Z",
        "end": "2026-02-14T01:45:20.393Z"
      }
    },
    {
      "comparator": "perplexity",
      "model_name": "perplexity/sonar",
      "model_snapshot_or_version": "openrouter:perplexity/sonar",
      "runtime_setup": "openrouter_api_chat_completions_default",
      "test_window": {
        "start": "2026-02-14T01:26:58.186Z",
        "end": "2026-02-14T01:45:24.624Z"
      }
    },
    {
      "comparator": "tripbot",
      "model_name": "tripbot intelligence v.0.1",
      "model_snapshot_or_version": "tripbot-intelligence-v0.1 (groq:llama-3.1-8b-instant)",
      "runtime_setup": "tripbot_api_orchestrator_default",
      "test_window": {
        "start": "2026-02-14T01:26:49.275Z",
        "end": "2026-02-14T01:45:16.518Z"
      }
    }
  ],
  "main_benchmark": {
    "label": "Main Benchmark = realistische Nutzerfragen (Alltag)",
    "sample_size": 150,
    "tool_results": [
      {
        "tool": "tripbot",
        "overall_score": 94.5,
        "constraint_match": 95.8,
        "actionability": 96.0,
        "clarification_quality": 100,
        "factuality_citation_score": 98,
        "tool_argument_validity_rate": 98.5,
        "fallback_penalty_score": 0,
        "trust_transparency_score": 100,
        "latency_score": 100,
        "latency_seconds": 0.23
      },
      {
        "tool": "chatgpt",
        "overall_score": 60.51,
        "constraint_match": 63.75,
        "actionability": 36.67,
        "clarification_quality": 65,
        "factuality_citation_score": 3.5,
        "tool_argument_validity_rate": 1,
        "fallback_penalty_score": 4,
        "trust_transparency_score": 3.5,
        "latency_score": 94.54,
        "latency_seconds": 0.46
      },
      {
        "tool": "perplexity",
        "overall_score": 87.07,
        "constraint_match": 92.92,
        "actionability": 86.4,
        "clarification_quality": 79.8,
        "factuality_citation_score": 90,
        "tool_argument_validity_rate": 95.0,
        "fallback_penalty_score": 4,
        "trust_transparency_score": 85.0,
        "latency_score": 84.45,
        "latency_seconds": 1.55
      }
    ],
    "segment_metrics": [
      {
        "segment": "flight",
        "sample_size": 25,
        "tool_results": [
          {
            "tool": "tripbot",
            "overall_score": 83.25,
            "constraint_match": 90.86,
            "actionability": 82.14,
            "clarification_quality": 92.52,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 55.27,
            "latency_seconds": 10.62
          },
          {
            "tool": "chatgpt",
            "overall_score": 48.24,
            "constraint_match": 42.5,
            "actionability": 22.5,
            "clarification_quality": 62.5,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 93.94,
            "latency_seconds": 0.52
          },
          {
            "tool": "perplexity",
            "overall_score": 76.44,
            "constraint_match": 89,
            "actionability": 62.5,
            "clarification_quality": 71,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 82.23,
            "latency_seconds": 1.78
          }
        ]
      },
      {
        "segment": "hotel",
        "sample_size": 25,
        "tool_results": [
          {
            "tool": "tripbot",
            "overall_score": 87.34,
            "constraint_match": 90.4,
            "actionability": 80.6,
            "clarification_quality": 92.3,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 87.08,
            "latency_seconds": 1.27
          },
          {
            "tool": "chatgpt",
            "overall_score": 48.35,
            "constraint_match": 42.5,
            "actionability": 22.5,
            "clarification_quality": 62.5,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 94.7,
            "latency_seconds": 0.43
          },
          {
            "tool": "perplexity",
            "overall_score": 76.46,
            "constraint_match": 89,
            "actionability": 62.5,
            "clarification_quality": 71,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 82.36,
            "latency_seconds": 1.76
          }
        ]
      },
      {
        "segment": "inspiration",
        "sample_size": 25,
        "tool_results": [
          {
            "tool": "tripbot",
            "overall_score": 87.27,
            "constraint_match": 89.5,
            "actionability": 86.3,
            "clarification_quality": 93.8,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 75.32,
            "latency_seconds": 4.11
          },
          {
            "tool": "chatgpt",
            "overall_score": 70.33,
            "constraint_match": 82.5,
            "actionability": 42.5,
            "clarification_quality": 72.5,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 94.6,
            "latency_seconds": 0.46
          },
          {
            "tool": "perplexity",
            "overall_score": 81.75,
            "constraint_match": 93.5,
            "actionability": 72.5,
            "clarification_quality": 72.5,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 85.19,
            "latency_seconds": 1.48
          }
        ]
      },
      {
        "segment": "package",
        "sample_size": 25,
        "tool_results": [
          {
            "tool": "tripbot",
            "overall_score": 89.49,
            "constraint_match": 93.62,
            "actionability": 91.38,
            "clarification_quality": 93.84,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 70.3,
            "latency_seconds": 5
          },
          {
            "tool": "chatgpt",
            "overall_score": 48.31,
            "constraint_match": 42.5,
            "actionability": 22.5,
            "clarification_quality": 62.5,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 94.49,
            "latency_seconds": 0.48
          },
          {
            "tool": "perplexity",
            "overall_score": 76.69,
            "constraint_match": 89,
            "actionability": 62.5,
            "clarification_quality": 71,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 83.96,
            "latency_seconds": 1.6
          }
        ]
      },
      {
        "segment": "visa",
        "sample_size": 25,
        "tool_results": [
          {
            "tool": "tripbot",
            "overall_score": 93.9,
            "constraint_match": 96.5,
            "actionability": 93.5,
            "clarification_quality": 90,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 93.74,
            "latency_seconds": 0.4
          },
          {
            "tool": "chatgpt",
            "overall_score": 74.59,
            "constraint_match": 87.5,
            "actionability": 57.5,
            "clarification_quality": 62.5,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 94.66,
            "latency_seconds": 0.44
          },
          {
            "tool": "perplexity",
            "overall_score": 85.77,
            "constraint_match": 97.5,
            "actionability": 82.5,
            "clarification_quality": 70,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 85.96,
            "latency_seconds": 1.4
          }
        ]
      },
      {
        "segment": "weather",
        "sample_size": 25,
        "tool_results": [
          {
            "tool": "tripbot",
            "overall_score": 88.21,
            "constraint_match": 88.7,
            "actionability": 82.7,
            "clarification_quality": 90.6,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 94.89,
            "latency_seconds": 0.19
          },
          {
            "tool": "chatgpt",
            "overall_score": 73.23,
            "constraint_match": 85,
            "actionability": 52.5,
            "clarification_quality": 67.5,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 94.86,
            "latency_seconds": 0.42
          },
          {
            "tool": "perplexity",
            "overall_score": 80.68,
            "constraint_match": 92.5,
            "actionability": 72.5,
            "clarification_quality": 67.5,
            "factuality_citation_score": 3.5,
            "tool_argument_validity_rate": 1,
            "fallback_penalty_score": 4,
            "trust_transparency_score": 3.5,
            "latency_score": 87.02,
            "latency_seconds": 1.3
          }
        ]
      }
    ]
  },
  "stress_suite": {
    "label": "Stress Suite = absichtlich fiese Edge Cases (Bossfight/Adversarial)",
    "included": true,
    "sample_size": 0,
    "in_main_score": false,
    "metrics": [
      "constraint_recovery_rate",
      "clarification_recovery_score",
      "safe_failure_rate",
      "fallback_transparency_score"
    ],
    "tool_results": [],
    "note": "Stress-Suite Datensatz ist veroeffentlicht, Auswertung folgt nach separatem Run."
  },
  "fairness_protocol": {
    "response_mode": "single-turn-first-response-only",
    "clarification_rule": "clarification quality is scored as a separate criterion",
    "setup_rule": "consumer default workflow per tool",
    "prompt_freeze": "dataset frozen before scoring run",
    "time_window": "2026-02-13",
    "rater_mode": "human_dual_rater",
    "tooling_rules": {
      "allowed_external_sources": [
        "Comparator-default web and product sources in documented runtime setup",
        "tripbot travel tools with explicit provenance"
      ],
      "allowed_tooling_modes": [
        "tripbot: orchestrator + travel tools",
        "chatgpt/perplexity: documented default runtime setup"
      ],
      "success_definition": "Success requires constraint adherence, actionable steps, and transparent uncertainty/provenance.",
      "no_price_guess_reward": "Price guesses without source/provenance do not receive factuality credit."
    }
  },
  "proof_chain": {
    "run_manifest": {
      "run_id": "goldenset-150-2026-02-15_18-48-02-200",
      "git_commit": "0d5990b",
      "run_window": {
        "start": "2026-02-14T01:26:52.877Z",
        "end": "2026-02-14T01:45:20.393Z"
      },
      "first_response_only": true
    },
    "checksums": [
      {
        "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14.csv",
        "sha256": "8eef54a57555054f13f143b02540d75a2f098225ab703e6cbcbec10dc460f7fc",
        "size_bytes": 511335
      },
      {
        "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-scorecard.csv",
        "sha256": "870345edd3bd694024ff70add3639b0afa18612ffef01b86a42eaf25fe5ae0e0",
        "size_bytes": 251776
      },
      {
        "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-stress-prompts.csv",
        "sha256": "cc66c9f7769ec55de69cc4194d18772c6025825f5061e9521ca9b99b5712e945",
        "size_bytes": 43
      },
      {
        "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-stress-scorecard.csv",
        "sha256": "f3aaf11a2da87895772ff45ebc3d35531b486b578f648c0e9b17bdac9db0fbb7",
        "size_bytes": 118
      },
      {
        "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14.pdf",
        "sha256": "51528312d657f7d00613ab939360a6fb81766011172d6f50ddbe43917d6a8391",
        "size_bytes": 6875
      },
      {
        "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-blind-pack.csv",
        "sha256": "a4a0195365f54cbf83205ea983f60bafef6c2ad4486edcb0979f4bb708130e67",
        "size_bytes": 139007
      },
      {
        "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-responses.csv",
        "sha256": "a3bf174dcc2a988f9640f6ed5520c093ac381dd495b86e715efbcc3a375a9bb2",
        "size_bytes": 420588
      }
    ],
    "raw_data_links": [
      "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14.csv",
      "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-scorecard.csv",
      "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-stress-prompts.csv",
      "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-stress-scorecard.csv",
      "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14.pdf",
      "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-blind-pack.csv",
      "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-responses.csv"
    ]
  },
  "reliability": {
    "targets": 450,
    "raters": 2,
    "icc21": 0.9856,
    "spearman": 0.9916
  },
  "methodology_warnings": [],
  "tool_results": [
    {
      "tool": "tripbot",
      "overall_score": 94.5,
      "constraint_match": 95.8,
      "actionability": 96.0,
      "clarification_quality": 100,
      "factuality_citation_score": 98,
      "tool_argument_validity_rate": 98.5,
      "fallback_penalty_score": 0,
      "trust_transparency_score": 100,
      "latency_score": 100,
      "latency_seconds": 0.23
    },
    {
      "tool": "chatgpt",
      "overall_score": 60.51,
      "constraint_match": 63.75,
      "actionability": 36.67,
      "clarification_quality": 65,
      "factuality_citation_score": 3.5,
      "tool_argument_validity_rate": 1,
      "fallback_penalty_score": 4,
      "trust_transparency_score": 3.5,
      "latency_score": 94.54,
      "latency_seconds": 0.46
    },
    {
      "tool": "perplexity",
      "overall_score": 87.07,
      "constraint_match": 92.92,
      "actionability": 86.4,
      "clarification_quality": 79.8,
      "factuality_citation_score": 90,
      "tool_argument_validity_rate": 95.0,
      "fallback_penalty_score": 4,
      "trust_transparency_score": 85.0,
      "latency_score": 84.45,
      "latency_seconds": 1.55
    }
  ],
  "data_quality": {
    "source_classification": "captured_real_responses",
    "note": "Scores are derived from captured responses and blind ratings."
  },
  "checksums": [
    {
      "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14.csv",
      "sha256": "8eef54a57555054f13f143b02540d75a2f098225ab703e6cbcbec10dc460f7fc",
      "size_bytes": 511335
    },
    {
      "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-scorecard.csv",
      "sha256": "870345edd3bd694024ff70add3639b0afa18612ffef01b86a42eaf25fe5ae0e0",
      "size_bytes": 251776
    },
    {
      "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-stress-prompts.csv",
      "sha256": "cc66c9f7769ec55de69cc4194d18772c6025825f5061e9521ca9b99b5712e945",
      "size_bytes": 43
    },
    {
      "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-stress-scorecard.csv",
      "sha256": "f3aaf11a2da87895772ff45ebc3d35531b486b578f648c0e9b17bdac9db0fbb7",
      "size_bytes": 118
    },
    {
      "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14.pdf",
      "sha256": "51528312d657f7d00613ab939360a6fb81766011172d6f50ddbe43917d6a8391",
      "size_bytes": 6875
    },
    {
      "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-blind-pack.csv",
      "sha256": "a4a0195365f54cbf83205ea983f60bafef6c2ad4486edcb0979f4bb708130e67",
      "size_bytes": 139007
    },
    {
      "file": "/press/studies/travel-ai-benchmark-v2-dach-150-2026-02-14-responses.csv",
      "sha256": "a3bf174dcc2a988f9640f6ed5520c093ac381dd495b86e715efbcc3a375a9bb2",
      "size_bytes": 420588
    }
  ]
}