{
  "intro": "Arc is a vendor-neutral technology radar for applied AI: a catalog of models, tools, techniques, and infrastructure for teams building production systems on LLMs and agents. We aren't tied to any single vendor — proprietary and open solutions sit side by side, managed services next to self-hosted, mature defaults next to promising newcomers. Every entry is placed in one of four quadrants (Models, Tools, Techniques, Infrastructure) and one of four maturity rings. The goal is to give an engineer a fast, honest bearing: what you can already build on, what's worth piloting, what to keep watching, and what to steer clear of.",
  "methodology": "A ring reflects not 'quality' in the abstract but how ready a technology is for responsible adoption: how well proven it is under real load, how predictable it is to operate, how stable its API is, and how well it fits the standards. We weigh maturity and support, availability (open-weight/open-source vs managed), results on reproducible benchmarks, operational cost, and the presence of a living ecosystem. The radar is deliberately vendor-neutral: neighboring entries in the same ring are alternatives to compare, not a better/worse ranking. Placement reflects the state of the industry on the revision date and is revisited as releases ship, statuses change (GA, maintenance, archival), and production experience accumulates; stale or wound-down projects move to Hold, and matured ones are promoted up the rings.",
  "title": "Arc",
  "tagline": "Tracing the maturity arc of applied AI.",
  "ring_defs": [
    {
      "ring": "Adopt",
      "def": "A mature default. The technology is proven under real load, predictable to operate, and has a settled API — safe to build production and new projects on with confidence."
    },
    {
      "ring": "Trial",
      "def": "Ready for pilots. Production-viable and delivers tangible value, but takes real adoption effort, tuning to your profile, or is still maturing — trial it on bounded tasks with good evals before making it a default."
    },
    {
      "ring": "Assess",
      "def": "Worth watching. A promising direction with a fast-maturing ecosystem or a fresh release; governance, security, and operational patterns are still forming — explore it in isolated scenarios and behind opt-in, not in critical production."
    },
    {
      "ring": "Hold",
      "def": "Hold off. A stalled, wound-down, or successor-superseded option. Not a sensible pick for new systems; maintain existing deployments and migrate off in an orderly way."
    }
  ],
  "quadrants": [
    {
      "quadrant": "Models",
      "entries": [
        {
          "name": "Claude Opus 4.8 (Anthropic)",
          "ring": "Adopt",
          "description": "Anthropic's flagship proprietary model (released 2026-05-28), leading agentic coding (SWE-Bench Pro 69.2%, SWE-bench Verified 88.6%) and computer use. Stable API; Fast mode is three times cheaper than on Opus 4.7.",
          "rationale": "A mature, predictable default for production agents and coding — proven on real workloads, benchmarks, and price."
        },
        {
          "name": "OpenAI GPT-5.5",
          "ring": "Adopt",
          "description": "OpenAI's current flagship (API release 2026-04-24) with Thinking/Pro/Instant variants; strong at code, research, data analysis, and document work. Instant is the default ChatGPT model.",
          "rationale": "A versatile, widely available workhorse with a rich tooling ecosystem."
        },
        {
          "name": "Gemini 3.5 Flash (Google DeepMind)",
          "ring": "Trial",
          "description": "Google's mid-tier model (released 2026-05-19 at I/O): Pro-level reasoning at Flash latency. Beats Gemini 3.1 Pro on Terminal-Bench 2.1 (76.2% vs 70.3%), MCP Atlas (83.6% vs 78.2%), and GDPval-AA. Gemini 3.5 Pro is still internal/Vertex-preview only.",
          "rationale": "A fresh release with strong agentic benchmarks and low latency — worth trialing in pilots before making it a default."
        },
        {
          "name": "DeepSeek V4 (open-weight)",
          "ring": "Trial",
          "description": "An open MoE model under MIT (released 2026-04-24): V4-Pro 1.6T/49B active, V4-Flash 284B/13B, 1M context, DSA sparse attention. V4-Pro-Max scores 80.6% on SWE-bench Verified — top among open weights, on par with Gemini 3.1 Pro.",
          "rationale": "Frontier-level with open weights, self-hosting, and low cost — justifies a pilot where privacy and cost control matter."
        },
        {
          "name": "Kimi K2.6 (Moonshot AI)",
          "ring": "Assess",
          "description": "An open 1T MoE model (32B active, 384 experts, 262K context) under a Modified MIT license; #4 on the Artificial Analysis Intelligence Index and the current open-weight leader. Matches GPT-5.5 on SWE-Bench Pro (58.6%), leads Humanity's Last Exam with tools; ~80% cheaper than the frontier.",
          "rationale": "Today's best open weight for agentic tool use, but still behind the top three proprietary leaders — worth assessing, not yet building production on."
        },
        {
          "name": "Llama 4 Behemoth (Meta)",
          "ring": "Hold",
          "description": "Meta's ~2T-parameter teacher model (288B active, 16 experts) that never shipped publicly, owing to MoE-routing and chunked-attention problems at 2T scale. As of mid-2026 the weights are unreleased with no formal cancellation — it exists only as an internal artifact for distilling Scout/Maverick.",
          "rationale": "A stalled, never-shipped release — don't plan around it until Meta produces working weights."
        }
      ]
    },
    {
      "quadrant": "Tools",
      "entries": [
        {
          "name": "Claude Agent SDK",
          "ring": "Adopt",
          "description": "Anthropic's library (Python and TypeScript) for building production agents on the same harness as Claude Code: an agentic tool-use loop, context management, subagents, persistent sessions, a native MCP client, built-in file/bash/web tools, and optional human-in-the-loop. Renamed from the Claude Code SDK in September 2025; it has become the default way to embed Claude in long-running tool-using processes.",
          "rationale": "A mature, vendor-maintained SDK with proven production use and fast-growing adoption — low risk to adopt."
        },
        {
          "name": "Model Context Protocol (MCP)",
          "ring": "Adopt",
          "description": "An open standard for connecting LLMs to external tools and data, contributed by Anthropic to the Agentic AI Foundation under the Linux Foundation (announced December 9, 2025; founding contributors Anthropic, Block, and OpenAI). By 2026 it is the de facto industry integration interface: 10,000+ servers and first-class support in Claude, ChatGPT, Cursor, Gemini, Microsoft Copilot, and VS Code.",
          "rationale": "Cross-vendor standardization and neutral governance have made MCP a de facto mandatory foundation for agentic integrations."
        },
        {
          "name": "Vercel AI SDK 6",
          "ring": "Adopt",
          "description": "The leading TypeScript toolkit for AI apps and agents, with a provider-neutral API, a first-class Agent abstraction (interface plus a ToolLoopAgent implementation), full MCP support, tool-execution approval, and DevTools. Over 20M downloads a month; integrates with React/Next.js/Vue/Svelte/Node.",
          "rationale": "A mature, widely adopted, provider-neutral library with a settled API — the standard choice for agents and LLM features in a TypeScript stack."
        },
        {
          "name": "LangGraph",
          "ring": "Trial",
          "description": "A low-level orchestrator and runtime for long-running stateful agents: durable execution with crash recovery, checkpointing, time-travel debugging, human-in-the-loop, and short- and long-term memory. Reached v1.0 GA in October 2025; in production at Klarna, LinkedIn, Uber, Replit, Elastic, and others.",
          "rationale": "Powerful and production-proven, but low-level and demanding real investment — justified in pilots of complex agent systems rather than as a default."
        },
        {
          "name": "OpenAI Agents SDK",
          "ring": "Trial",
          "description": "OpenAI's first-party SDK for agent systems: a model-native harness for files and tools plus native sandbox execution (E2B, Modal, Daytona, Cloudflare, Vercel, Blaxel, Runloop). The major harness-and-sandboxes update shipped April 15, 2026, initially Python-first (TypeScript later).",
          "rationale": "Actively evolving and a good production fit, but the key capabilities are new and Python-first — worth trialing on bounded tasks."
        },
        {
          "name": "Microsoft Agent Framework",
          "ring": "Assess",
          "description": "A unified SDK for .NET and Python that merges AutoGen's lightweight agent abstractions with Semantic Kernel's enterprise features: middleware, telemetry, memory, graph-based workflows, orchestration patterns, and native MCP + A2A compatibility. GA 1.0 shipped April 3, 2026.",
          "rationale": "A strategically important Microsoft consolidation, but a very fresh release — assess its maturity and migration paths before trusting it in production."
        },
        {
          "name": "AutoGen",
          "ring": "Hold",
          "description": "An early Microsoft multi-agent framework with GroupChat and agent-conversation patterns. Now in maintenance mode (critical bug and security fixes only, no new features); the investment and explicit successor for new projects is the Microsoft Agent Framework.",
          "rationale": "Maintenance mode and a clear successor make starting new projects on AutoGen pointless — maintain existing ones only."
        }
      ]
    },
    {
      "quadrant": "Techniques",
      "entries": [
        {
          "name": "Adaptive RAG (routing queries by complexity)",
          "ring": "Adopt",
          "description": "A RAG pipeline where a classifier scores query complexity and routes it to the right branch: a cheap direct answer (or no retrieval) for simple questions and full multi-step/agentic search for hard ones. The canonical approach is Adaptive-RAG (a T5 classifier over three complexity classes); by 2026 it is baseline production-RAG practice, delivering the expensive branch's quality at noticeably lower cost.",
          "rationale": "A mature, proven pattern with clear cost/quality control — a safe default for production RAG."
        },
        {
          "name": "LLM-as-a-Judge (model-graded outputs)",
          "ring": "Adopt",
          "description": "Automatically grading one model's outputs with another against criteria (relevance, faithfulness, correctness). The canonical MT-Bench result: a strong judge agrees with experts ~85% of the time, higher than two humans agree with each other (~81%). The default for large-scale evals; requires controlling known biases (position, verbosity, self-preference) via randomization and calibration.",
          "rationale": "A cheap, scalable way to run evals that has become the industry default for evaluating LLM applications."
        },
        {
          "name": "Structured Output via constrained decoding (Strict Mode)",
          "ring": "Adopt",
          "description": "Guaranteeing a response conforms to a JSON Schema by masking invalid tokens during decoding (the schema compiles to a grammar/FSM). By 2026 it is natively supported by every major provider: OpenAI (Strict Mode, 2024), Google Gemini, Anthropic (structured-outputs beta since November 2025), Cohere, and xAI.",
          "rationale": "Removes a whole class of parsing errors and ships out of the box with every major provider — a mandatory baseline for integrations."
        },
        {
          "name": "Declarative prompt optimization: DSPy + GEPA",
          "ring": "Trial",
          "description": "DSPy specifies an LLM pipeline declaratively, and the GEPA optimizer uses natural-language reflection (analyzing traces and errors) to automatically evolve instructions while keeping a Pareto front of candidates. GEPA is an Oral at ICLR 2026: it beats the RL method GRPO by 6 points on average (up to 19) with ~35x fewer rollouts, and MIPROv2 by 10+ points.",
          "rationale": "Past the just-watching stage: integrated into DSPy with early production adoption — worth trying on real pipelines with good evals."
        },
        {
          "name": "Programmable guardrails (NeMo Guardrails / Llama Guard / LLM Guard)",
          "ring": "Trial",
          "description": "An input/output protection layer: content moderation, jailbreak and injection detection, topic and dialog-flow control. In practice teams combine 2-3 tools (defense-in-depth), since single detectors are bypassable (evasion attacks have high ASR), and NVIDIA still marks NeMo Guardrails itself as beta, not recommended in production as-is.",
          "rationale": "The tools are useful and available, but require assembling several components and domain tuning, and maturity isn't yet at default level — adopt under supervision."
        },
        {
          "name": "Multi-agent orchestration over MCP + A2A",
          "ring": "Assess",
          "description": "A two-layer agent-coordination stack: MCP standardizes access to tools and data, while A2A (Google then Linux Foundation, 150+ organizations) covers delegation between autonomous agents. Enterprise gateways with SSO and audit are emerging on top of MCP-for-tools + A2A-for-agents, but multi-agent coordination patterns are only just forming.",
          "rationale": "The ecosystem and standards are maturing fast, but governance, security, and coordination patterns are still forming — explore it in bounded scenarios."
        },
        {
          "name": "Manual prompt engineering as a standalone discipline",
          "ring": "Hold",
          "description": "Relying on isolated wording tweaks in a prompt, divorced from context management, retrieval, and evals. As context windows and reasoning grow, the payoff from wording tricks falls; the industry has shifted to context engineering (in 2026 surveys, most leaders consider bare prompting insufficient at scale).",
          "rationale": "Giving way to context engineering and automated prompt optimization — not worth choosing as a standalone practice for new systems."
        }
      ]
    },
    {
      "quadrant": "Infrastructure",
      "entries": [
        {
          "name": "vLLM",
          "ring": "Adopt",
          "description": "A high-performance LLM inference engine with PagedAttention, continuous batching, and an OpenAI-compatible API; the de facto standard backend for most cloud and self-hosted deployments. Supports 200+ architectures and every kind of parallelism (tensor/pipeline/data/expert).",
          "rationale": "The widest hardware and model coverage, the largest community, and proven scale at trillions of tokens a day make it a safe default."
        },
        {
          "name": "pgvector",
          "ring": "Adopt",
          "description": "A PostgreSQL extension for vector and hybrid search with HNSW and IVFFlat indexes; halfvec (16-bit) and sparsevec (sparse) types save memory. Current stable release 0.8.2. Comfortably handles tens of millions of vectors without a separate database.",
          "rationale": "For most RAG scenarios it's the best choice: it reuses your existing Postgres and removes the operational overhead of a separate vector database."
        },
        {
          "name": "SGLang",
          "ring": "Trial",
          "description": "An inference engine with RadixAttention (prefix KV caching in a radix tree) and fast constrained decoding; a marked gain on prefix-heavy workloads (RAG, multi-turn chat) and structured output. In production at xAI (Grok), NVIDIA, AMD, Azure, and Cursor; part of the PyTorch ecosystem.",
          "rationale": "Production-mature and faster than vLLM on some workloads, but narrower hardware and architecture coverage — adopt selectively for a specific traffic profile."
        },
        {
          "name": "Qdrant",
          "ring": "Trial",
          "description": "A purpose-built vector database in Rust with fast ANN search (HNSW) and rich metadata filtering inside the traversal graph itself; among the lowest-latency open-source options at large scale.",
          "rationale": "Justified when pgvector hits a ceiling on volume or you need complex filters, but it's another service to operate — trial it under load rather than everywhere."
        },
        {
          "name": "OpenTelemetry GenAI Semantic Conventions",
          "ring": "Assess",
          "description": "OTel's semantic-convention standard for tracing LLMs/agents/MCP: consistent span names, tool-call attributes, token metrics. Client-call spans stabilized in early 2026, but events, metrics, and agent/MCP spans are still in Development status.",
          "rationale": "A promising industry standard that has partly stabilized, but key attributes are still changing — study it and adopt behind opt-in for now (OTEL_SEMCONV_STABILITY)."
        },
        {
          "name": "Hugging Face TGI (as a serving engine)",
          "ring": "Hold",
          "description": "Text Generation Inference — an early serving engine from Hugging Face. In maintenance since late 2025, the repository archived in March 2026; HF itself recommends vLLM or SGLang for endpoints.",
          "rationale": "The project is effectively wound down and superseded by vLLM/SGLang, so don't deploy it new — migrate existing ones."
        }
      ]
    }
  ]
}
