From cc7116d787736f1f3d3337c7c2e44baebd95476d Mon Sep 17 00:00:00 2001 From: JingWen Fan <106414602+study8677@users.noreply.github.com> Date: Wed, 10 Jun 2026 19:34:19 +0800 Subject: [PATCH] =?UTF-8?q?feat(tutorial):=20=E8=BF=9B=E9=98=B6=E7=AF=87?= =?UTF-8?q?=E9=99=8D=E5=99=AA=E4=BC=98=E5=8C=96=E5=B9=B6=E4=B8=8A=E7=BA=BF?= =?UTF-8?q?=E6=8A=80=E6=9C=AF=E6=A0=88=E9=80=89=E5=9E=8B=E7=AF=87(27-34)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 针对 Issue #15 降低进阶篇阅读门槛:为 10-17 章补充承上钩子、深水区标记和大白话类比;同时新增技术栈选型篇并更新导航与首页章节计数。 Co-authored-by: Cursor --- .vitepress/config.mts | 31 ++- README.md | 13 ++ README_en.md | 69 +++--- en/index.md | 10 +- ...66\346\236\204\345\210\244\346\226\255.md" | 4 +- ...\344\271\210\345\206\231\347\273\231AI.md" | 2 +- ...31\350\277\233\346\236\266\346\236\204.md" | 2 +- ...vibe\344\275\225\346\227\266spec-first.md" | 20 +- ...06\346\236\266\351\200\211\345\236\213.md" | 164 +++++++++++++ ...30\345\202\250\351\200\211\345\236\213.md" | 178 ++++++++++++++ ...73\347\273\237\351\200\211\345\236\213.md" | 194 ++++++++++++++++ ...32\344\277\241\351\200\211\345\236\213.md" | 173 ++++++++++++++ ...63\345\217\260\351\200\211\345\236\213.md" | 187 +++++++++++++++ ...57\346\240\210\351\200\211\345\236\213.md" | 169 ++++++++++++++ ...57\346\240\210\351\200\211\345\236\213.md" | 178 ++++++++++++++ ...13\345\206\263\347\255\226\346\240\221.md" | 194 ++++++++++++++++ en/tutorial/README.md | 20 +- index.md | 8 +- ...04\347\241\254\351\201\223\347\220\206.md" | 25 +- ...64\346\200\247\345\267\245\347\250\213.md" | 13 +- ...45\350\200\214\350\256\276\350\256\241.md" | 17 +- ...26\347\232\204\345\212\233\345\255\246.md" | 12 +- ...47\345\236\213\347\263\273\347\273\237.md" | 2 + ...07\345\215\263\346\236\266\346\236\204.md" | 9 +- ...37\346\210\267\346\236\266\346\236\204.md" | 4 +- ...66\346\236\204\345\210\244\346\226\255.md" | 8 +- ...\344\271\210\345\206\231\347\273\231AI.md" | 2 +- 
...31\350\277\233\346\236\266\346\236\204.md" | 2 +- ...vibe\344\275\225\346\227\266spec-first.md" | 20 +- ...06\346\236\266\351\200\211\345\236\213.md" | 188 +++++++++++++++ ...30\345\202\250\351\200\211\345\236\213.md" | 202 ++++++++++++++++ ...73\347\273\237\351\200\211\345\236\213.md" | 219 ++++++++++++++++++ ...32\344\277\241\351\200\211\345\236\213.md" | 187 +++++++++++++++ ...63\345\217\260\351\200\211\345\236\213.md" | 191 +++++++++++++++ ...57\346\240\210\351\200\211\345\236\213.md" | 171 ++++++++++++++ ...57\346\240\210\351\200\211\345\236\213.md" | 182 +++++++++++++++ ...13\345\206\263\347\255\226\346\240\221.md" | 196 ++++++++++++++++ tutorial/README.md | 20 +- 38 files changed, 3199 insertions(+), 87 deletions(-) create mode 100644 "en/tutorial/27-\347\274\226\347\250\213\350\257\255\350\250\200\344\270\216\345\220\216\347\253\257\346\241\206\346\236\266\351\200\211\345\236\213.md" create mode 100644 "en/tutorial/28-\346\225\260\346\215\256\345\272\223\344\270\216\345\255\230\345\202\250\351\200\211\345\236\213.md" create mode 100644 "en/tutorial/29-\347\274\223\345\255\230\346\266\210\346\201\257\351\230\237\345\210\227\344\270\216\344\272\213\344\273\266\347\263\273\347\273\237\351\200\211\345\236\213.md" create mode 100644 "en/tutorial/30-API\344\270\216\346\234\215\345\212\241\351\200\232\344\277\241\351\200\211\345\236\213.md" create mode 100644 "en/tutorial/31-\344\272\221\345\216\237\347\224\237\344\270\216\351\203\250\347\275\262\345\271\263\345\217\260\351\200\211\345\236\213.md" create mode 100644 "en/tutorial/32-\345\217\257\350\247\202\346\265\213\346\200\247\344\270\216\345\217\257\351\235\240\346\200\247\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" create mode 100644 "en/tutorial/33-AI\345\237\272\347\241\200\350\256\276\346\226\275\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" create mode 100644 "en/tutorial/34-\346\212\200\346\234\257\351\200\211\345\236\213\345\206\263\347\255\226\346\240\221.md" 
create mode 100644 "tutorial/27-\347\274\226\347\250\213\350\257\255\350\250\200\344\270\216\345\220\216\347\253\257\346\241\206\346\236\266\351\200\211\345\236\213.md" create mode 100644 "tutorial/28-\346\225\260\346\215\256\345\272\223\344\270\216\345\255\230\345\202\250\351\200\211\345\236\213.md" create mode 100644 "tutorial/29-\347\274\223\345\255\230\346\266\210\346\201\257\351\230\237\345\210\227\344\270\216\344\272\213\344\273\266\347\263\273\347\273\237\351\200\211\345\236\213.md" create mode 100644 "tutorial/30-API\344\270\216\346\234\215\345\212\241\351\200\232\344\277\241\351\200\211\345\236\213.md" create mode 100644 "tutorial/31-\344\272\221\345\216\237\347\224\237\344\270\216\351\203\250\347\275\262\345\271\263\345\217\260\351\200\211\345\236\213.md" create mode 100644 "tutorial/32-\345\217\257\350\247\202\346\265\213\346\200\247\344\270\216\345\217\257\351\235\240\346\200\247\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" create mode 100644 "tutorial/33-AI\345\237\272\347\241\200\350\256\276\346\226\275\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" create mode 100644 "tutorial/34-\346\212\200\346\234\257\351\200\211\345\236\213\345\206\263\347\255\226\346\240\221.md" diff --git a/.vitepress/config.mts b/.vitepress/config.mts index 7683540..a69b488 100644 --- a/.vitepress/config.mts +++ b/.vitepress/config.mts @@ -42,6 +42,17 @@ const zhTutorialCollab = [ { text: '26 · 协作决策树:何时 vibe、何时 spec-first', link: '/tutorial/26-协作决策树何时vibe何时spec-first' }, ] +const zhTutorialStack = [ + { text: '27 · 编程语言与后端框架选型', link: '/tutorial/27-编程语言与后端框架选型' }, + { text: '28 · 数据库与存储选型', link: '/tutorial/28-数据库与存储选型' }, + { text: '29 · 缓存、消息队列与事件系统选型', link: '/tutorial/29-缓存消息队列与事件系统选型' }, + { text: '30 · API 与服务通信选型', link: '/tutorial/30-API与服务通信选型' }, + { text: '31 · 云原生与部署平台选型', link: '/tutorial/31-云原生与部署平台选型' }, + { text: '32 · 可观测性与可靠性技术栈选型', link: '/tutorial/32-可观测性与可靠性技术栈选型' }, + { text: '33 · AI 基础设施技术栈选型', link: 
'/tutorial/33-AI基础设施技术栈选型' }, + { text: '34 · 技术选型决策树', link: '/tutorial/34-技术选型决策树' }, +] + const zhCases = [ { text: '案例总览', link: '/cases/README' }, { text: '01 · StarArena:演唱会抢票系统', link: '/cases/stararena-ticketing/README' }, @@ -98,7 +109,7 @@ const zhAgent = [ export default defineConfig({ title: 'Awesome Architecture', - description: '专注「架构思维」的中英双语知识库:26 章教程 + 25 张真实系统架构地图 + 6 个端到端案例。', + description: '专注「架构思维」的中英双语知识库:34 章教程 + 25 张真实系统架构地图 + 6 个端到端案例。', lang: 'zh-Hans', base: process.env.GITHUB_PAGES ? '/awesome-architecture/' : '/', cleanUrls: true, @@ -108,7 +119,7 @@ export default defineConfig({ head: [ ['meta', { name: 'theme-color', content: '#3c8772' }], ['meta', { property: 'og:title', content: 'Awesome Architecture · 架构图谱' }], - ['meta', { property: 'og:description', content: '像架构师一样思考:26 章教程 + 25 张真实系统架构地图 + 6 个端到端案例。' }], + ['meta', { property: 'og:description', content: '像架构师一样思考:34 章教程 + 25 张真实系统架构地图 + 6 个端到端案例。' }], // 注:Hypothesis 划词标注不再全站默认加载;改由评论区顶部的 toggle 按需注入。 // 见 .vitepress/theme/components/Comments.vue 的 loadHypothesis()。 ], @@ -137,6 +148,7 @@ export default defineConfig({ { text: '🚀 进阶篇', items: zhTutorialAdvanced }, { text: '🎯 实战篇', items: zhTutorialPractice }, { text: '🤝 AI 协同篇', items: zhTutorialCollab }, + { text: '🧰 技术栈选型篇', items: zhTutorialStack }, ], '/templates/': [ { text: '🗺️ 经典 / 通用系统', items: zhCommon }, @@ -151,6 +163,7 @@ export default defineConfig({ { text: '🚀 教程 · 进阶篇', collapsed: false, items: zhTutorialAdvanced }, { text: '🎯 教程 · 实战篇', collapsed: false, items: zhTutorialPractice }, { text: '🤝 教程 · AI 协同篇', collapsed: false, items: zhTutorialCollab }, + { text: '🧰 教程 · 技术栈选型篇', collapsed: false, items: zhTutorialStack }, { text: '🧪 案例篇', collapsed: false, items: zhCases }, { text: '🗺️ 经典 / 通用系统', collapsed: true, items: zhCommon }, { text: '🤖 AI 原生系统', collapsed: true, items: zhAI }, @@ -235,6 +248,20 @@ export default defineConfig({ { text: '26 · Collaboration decision tree: when to vibe, when to spec-first', 
link: '/en/tutorial/26-协作决策树何时vibe何时spec-first' }, ], }, + { + text: '🧰 Tutorial · Technology Stack Selection', + collapsed: true, + items: [ + { text: '27 · Languages & backend frameworks', link: '/en/tutorial/27-编程语言与后端框架选型' }, + { text: '28 · Databases & storage', link: '/en/tutorial/28-数据库与存储选型' }, + { text: '29 · Cache, queues & events', link: '/en/tutorial/29-缓存消息队列与事件系统选型' }, + { text: '30 · APIs & service communication', link: '/en/tutorial/30-API与服务通信选型' }, + { text: '31 · Cloud native & deployment', link: '/en/tutorial/31-云原生与部署平台选型' }, + { text: '32 · Observability & reliability', link: '/en/tutorial/32-可观测性与可靠性技术栈选型' }, + { text: '33 · AI infrastructure', link: '/en/tutorial/33-AI基础设施技术栈选型' }, + { text: '34 · Technology selection decision tree', link: '/en/tutorial/34-技术选型决策树' }, + ], + }, { text: '🧪 Cases', collapsed: false, diff --git a/README.md b/README.md index 61b9290..20d094b 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,19 @@ awesome-architecture/ | [25](tutorial/25-评测驱动把够好写进架构.md) | 评测驱动:把「够好」写进架构 | eval 当 CI 门禁,承接非确定性 | | [26](tutorial/26-协作决策树何时vibe何时spec-first.md) | 协作决策树:何时 vibe、何时 spec-first | 原型 vs 生产的 workflow 总收束 | +**🧰 技术栈选型篇(27–34)—— 把「用什么技术」变成架构判断:** + +| 章节 | 主题 | 你将判断 | +|---|---|---| +| [27](tutorial/27-编程语言与后端框架选型.md) | 编程语言与后端框架选型 | 语言/框架如何影响运行时、生态、团队和维护成本 | +| [28](tutorial/28-数据库与存储选型.md) | 数据库与存储选型 | 事实源、读模型、搜索、对象存储、向量库怎么分工 | +| [29](tutorial/29-缓存消息队列与事件系统选型.md) | 缓存、消息队列与事件系统选型 | 读热点、削峰、事件广播分别该用什么 | +| [30](tutorial/30-API与服务通信选型.md) | API 与服务通信选型 | REST / gRPC / GraphQL / Webhook 的边界 | +| [31](tutorial/31-云原生与部署平台选型.md) | 云原生与部署平台选型 | PaaS、容器、Serverless、K8s 何时值得上 | +| [32](tutorial/32-可观测性与可靠性技术栈选型.md) | 可观测性与可靠性技术栈选型 | 从 SLO 倒推指标、日志、链路、告警和事故流程 | +| [33](tutorial/33-AI基础设施技术栈选型.md) | AI 基础设施技术栈选型 | 模型 API、RAG、向量库、推理服务、eval 何时升级 | +| [34](tutorial/34-技术选型决策树.md) | 技术选型决策树 | 把阶段、失败模式、团队能力、退出方案串成 ADR | + > 👉 **新手从 [tutorial/README.md](tutorial/README.md) 开始**,那里有完整的学习路径。 ### 🧪 cases/ —— 把架构从答案写成推理过程 diff --git 
a/README_en.md b/README_en.md index 51c6437..a06fd01 100644 --- a/README_en.md +++ b/README_en.md @@ -54,49 +54,62 @@ Not "how to use framework X", but a transferable way of thinking: how to turn a | Ch. | Topic | What you'll learn | |---|---|---| -| [01](tutorial/01-为什么先有架构思维.md) | Why architecture-first thinking | Why "architecture first" is *the* core skill of this era | -| [02](tutorial/02-架构师的思考框架.md) | The architect's thinking framework | The universal flow: requirements → constraints → quality attributes → trade-offs | -| [03](tutorial/03-读懂与画好架构图.md) | Reading & drawing architecture diagrams | Use the C4 model to get the system out of your head and onto paper | -| [04](tutorial/04-十大核心架构模式.md) | 10 core architecture patterns | Layered, microservices, event-driven, CQRS… what each one actually solves | -| [05](tutorial/05-数据与状态.md) | Data & state | Why *data* is the real hard part of any system | -| [06](tutorial/06-质量属性与取舍.md) | Quality attributes & trade-offs | Performance / availability / consistency / cost — how to weigh them | -| [07](tutorial/07-从0到1设计一个系统.md) | Designing a system from 0 to 1 | A step-by-step methodology you can actually follow | -| [08](tutorial/08-架构决策记录与演进.md) | ADRs & evolution | Record decisions with ADRs and let architecture grow with the business | -| [09](tutorial/09-架构品味.md) | Architectural taste | What separates good architects beyond the framework; grow judgment via real cases (microservices→monolith, big-company tastes) | +| [01](en/tutorial/01-为什么先有架构思维.md) | Why architecture-first thinking | Why "architecture first" is *the* core skill of this era | +| [02](en/tutorial/02-架构师的思考框架.md) | The architect's thinking framework | The universal flow: requirements → constraints → quality attributes → trade-offs | +| [03](en/tutorial/03-读懂与画好架构图.md) | Reading & drawing architecture diagrams | Use the C4 model to get the system out of your head and onto paper | +| [04](en/tutorial/04-十大核心架构模式.md) | 10 core architecture patterns | 
Layered, microservices, event-driven, CQRS… what each one actually solves | +| [05](en/tutorial/05-数据与状态.md) | Data & state | Why *data* is the real hard part of any system | +| [06](en/tutorial/06-质量属性与取舍.md) | Quality attributes & trade-offs | Performance / availability / consistency / cost — how to weigh them | +| [07](en/tutorial/07-从0到1设计一个系统.md) | Designing a system from 0 to 1 | A step-by-step methodology you can actually follow | +| [08](en/tutorial/08-架构决策记录与演进.md) | ADRs & evolution | Record decisions with ADRs and let architecture grow with the business | +| [09](en/tutorial/09-架构品味.md) | Architectural taste | What separates good architects beyond the framework; grow judgment via real cases (microservices→monolith, big-company tastes) | **🚀 Advanced track (10–17, new) — taming the hard rock that bites once a system gets big or critical:** | Ch. | Topic | What you'll tame | |---|---|---| -| [10](tutorial/10-分布式系统的硬道理.md) | Distributed systems: the hard truths | Partial failure, no global clock, the cost of consensus, the exactly-once illusion | -| [11](tutorial/11-数据一致性工程.md) | The engineering of data consistency | Saga, Outbox, idempotency, event sourcing, CQRS | -| [12](tutorial/12-为失败而设计.md) | Designing for failure: resilience | Cascading failure, circuit breakers, bulkheads, load shedding, SLOs, chaos | -| [13](tutorial/13-规模化的力学.md) | The mechanics of scale | Consistent hashing, hot keys, multi-region, tail latency & fan-out | -| [14](tutorial/14-演进与拆分大型系统.md) | Evolving & splitting large systems | Strangler fig, parallel run, zero-downtime migration, splitting the monolith | -| [15](tutorial/15-组织即架构.md) | Organization as architecture | Conway / inverse Conway, Team Topologies, platform engineering | -| [16](tutorial/16-安全与多租户架构.md) | Security & multi-tenancy | Threat modeling, zero trust, blast radius, tenant isolation | -| [17](tutorial/17-大模型时代的架构判断.md) | Architecting in the age of LLMs | Vibe coding, nondeterminism, context engineering, agentic 
hard parts | +| [10](en/tutorial/10-分布式系统的硬道理.md) | Distributed systems: the hard truths | Partial failure, no global clock, the cost of consensus, the exactly-once illusion | +| [11](en/tutorial/11-数据一致性工程.md) | The engineering of data consistency | Saga, Outbox, idempotency, event sourcing, CQRS | +| [12](en/tutorial/12-为失败而设计.md) | Designing for failure: resilience | Cascading failure, circuit breakers, bulkheads, load shedding, SLOs, chaos | +| [13](en/tutorial/13-规模化的力学.md) | The mechanics of scale | Consistent hashing, hot keys, multi-region, tail latency & fan-out | +| [14](en/tutorial/14-演进与拆分大型系统.md) | Evolving & splitting large systems | Strangler fig, parallel run, zero-downtime migration, splitting the monolith | +| [15](en/tutorial/15-组织即架构.md) | Organization as architecture | Conway / inverse Conway, Team Topologies, platform engineering | +| [16](en/tutorial/16-安全与多租户架构.md) | Security & multi-tenancy | Threat modeling, zero trust, blast radius, tenant isolation | +| [17](en/tutorial/17-大模型时代的架构判断.md) | Architecting in the age of LLMs | Vibe coding, nondeterminism, context engineering, agentic hard parts | **🎯 Practice track (18–22) — put the method on real cases; bridge tutorial → templates:** | Ch. 
| Topic | What you'll practice | |---|---|---| -| [18](tutorial/18-读地图用框架拆解陌生系统.md) | Reading the map: deconstruct unfamiliar systems | Reverse-engineer *why* a `templates/` map is designed that way; practice on RAG / AI chat product | -| [19](tutorial/19-完整设计演练中等复杂度系统.md) | Full design walkthrough: medium complexity | Run the ch.07 eight-step flow — design an AI customer-service assistant that looks up orders and issues refunds (with token-cost math) | -| [20](tutorial/20-演进剧本MVP到规模化.md) | Evolution playbook: MVP → scale | Ch.08 + [evolution trigger signals](tutorial/演进触发信号.md) — three life stages of one AI assistant | -| [21](tutorial/21-拆分与迁移实战.md) | Splitting & migration in practice | Ch.14 as a case: strangler, branch by abstraction, shadow traffic, zero-downtime vector-store swap | -| [22](tutorial/22-AI原生系统设计.md) | AI-native system design | Upgrade the assistant into an autonomous Agent; land ch.17's three new constraints; bridge to the AI-collab track | +| [18](en/tutorial/18-读地图用框架拆解陌生系统.md) | Reading the map: deconstruct unfamiliar systems | Reverse-engineer *why* a `templates/` map is designed that way; practice on RAG / AI chat product | +| [19](en/tutorial/19-完整设计演练中等复杂度系统.md) | Full design walkthrough: medium complexity | Run the ch.07 eight-step flow — design an AI customer-service assistant that looks up orders and issues refunds (with token-cost math) | +| [20](en/tutorial/20-演进剧本MVP到规模化.md) | Evolution playbook: MVP → scale | Ch.08 + [evolution trigger signals](tutorial/演进触发信号.md) — three life stages of one AI assistant | +| [21](en/tutorial/21-拆分与迁移实战.md) | Splitting & migration in practice | Ch.14 as a case: strangler, branch by abstraction, shadow traffic, zero-downtime vector-store swap | +| [22](en/tutorial/22-AI原生系统设计.md) | AI-native system design | Upgrade the assistant into an autonomous Agent; land ch.17's three new constraints; bridge to the AI-collab track | **🤝 AI-collaborative design track (23–26) — collaborate with AI without losing 
judgment:** | Ch. | Topic | What you'll master | |---|---|---| -| [23](tutorial/23-规格即架构约束怎么写给AI.md) | Spec as architecture: constraints for AI | ADR / `AGENTS.md` → enforceable guardrails; pairs with [architecture-copilot](https://github.com/study8677/architecture-copilot) | -| [24](tutorial/24-审查清单AI产出默认缺什么.md) | Review checklist: what AI output omits by default | Production-grade review checklist from ch.11/12/16 | -| [25](tutorial/25-评测驱动把够好写进架构.md) | Eval-driven: bake "good enough" into architecture | Eval as a CI gate for nondeterministic systems | -| [26](tutorial/26-协作决策树何时vibe何时spec-first.md) | Collaboration decision tree: when to vibe, when to spec-first | Prototype vs production workflow — the capstone | +| [23](en/tutorial/23-规格即架构约束怎么写给AI.md) | Spec as architecture: constraints for AI | ADR / `AGENTS.md` → enforceable guardrails; pairs with [architecture-copilot](https://github.com/study8677/architecture-copilot) | +| [24](en/tutorial/24-审查清单AI产出默认缺什么.md) | Review checklist: what AI output omits by default | Production-grade review checklist from ch.11/12/16 | +| [25](en/tutorial/25-评测驱动把够好写进架构.md) | Eval-driven: bake "good enough" into architecture | Eval as a CI gate for nondeterministic systems | +| [26](en/tutorial/26-协作决策树何时vibe何时spec-first.md) | Collaboration decision tree: when to vibe, when to spec-first | Prototype vs production workflow — the capstone | -> 👉 **New here? Start with [tutorial/README.md](tutorial/README.md)** for the full learning path. +**🧰 Technology stack selection track (27–34) — turn "what tech to use" into architecture judgment:** + +| Ch. 
| Topic | What you'll judge | +|---|---|---| +| [27](en/tutorial/27-编程语言与后端框架选型.md) | Languages & backend frameworks | How language/framework affects runtime, ecosystem, team, and maintenance cost | +| [28](en/tutorial/28-数据库与存储选型.md) | Databases & storage | How source of truth, read models, search, object storage, and vector stores divide work | +| [29](en/tutorial/29-缓存消息队列与事件系统选型.md) | Cache, queues & events | What fits read hotspots, spike smoothing, and event broadcast | +| [30](en/tutorial/30-API与服务通信选型.md) | APIs & service communication | REST / gRPC / GraphQL / Webhook boundaries | +| [31](en/tutorial/31-云原生与部署平台选型.md) | Cloud native & deployment | When PaaS, containers, Serverless, or K8s are worth it | +| [32](en/tutorial/32-可观测性与可靠性技术栈选型.md) | Observability & reliability | Work backward from SLOs to metrics, logs, traces, alerts, incidents | +| [33](en/tutorial/33-AI基础设施技术栈选型.md) | AI infrastructure | When to upgrade the model API, RAG, vector DB, inference serving, and evals | +| [34](en/tutorial/34-技术选型决策树.md) | Technology selection decision tree | Turn stage, failure mode, team capability, and exit plan into an ADR | + +> 👉 **New here? Start with [en/tutorial/README.md](en/tutorial/README.md)** for the full learning path. ### 🧪 cases/ — Turn architecture from answer into reasoning @@ -161,7 +174,7 @@ Each template is an "architecture map". We **deliberately avoid discussing langu > 👉 **Want to add your own template?** Follow the unified format in [templates/_TEMPLATE.md](templates/_TEMPLATE.md). -> 📝 **Now fully bilingual** — all **26 tutorial chapters, 25 templates, and the first 6 cases** are available in English. Use the language switch (top-right on the site), or browse `en/` in the repo. +> 📝 **Now fully bilingual** — all **34 tutorial chapters, 25 templates, and the first 6 cases** are available in English. Use the language switch (top-right on the site), or browse `en/` in the repo.
--- diff --git a/en/index.md b/en/index.md index 4878f76..c668bcf 100644 --- a/en/index.md +++ b/en/index.md @@ -4,7 +4,7 @@ layout: home hero: name: "Awesome Architecture" text: "Think like an architect" - tagline: "Writing code is disappearing; judgment is what's getting valuable. 26 architecture-thinking chapters + 25 real-system architecture maps + 6 end-to-end cases — architecture only, no syntax." + tagline: "Writing code is disappearing; judgment is what's getting valuable. 34 architecture-thinking chapters + 25 real-system architecture maps + 6 end-to-end cases — architecture only, no syntax." actions: - theme: brand text: Start the tutorial → @@ -22,7 +22,7 @@ hero: features: - icon: 🧠 title: Judgment, not syntax - details: No language or framework. A transferable, durable way of thinking — requirements → constraints → quality attributes → trade-offs. + details: No language or framework lock-in. A transferable way of thinking — requirements → constraints → quality attributes → trade-offs — now applied to technology stack selection. - icon: 🗺️ title: 25 architecture maps details: From e-commerce, social feeds, chat and video to AI gateways, RAG and inference serving — each explains why it's built that way and where it breaks. @@ -37,7 +37,7 @@ features: details: Each template links to real open-source projects and engineering papers (vLLM, LiteLLM, TigerBeetle, Uber H3, Figma…). --- -> ✅ **Fully bilingual.** All 26 tutorial chapters, 25 templates, and the first 6 cases are available in English — use the language switch (top-right) or browse `en/` in the repo. [Contributions welcome](https://github.com/study8677/awesome-architecture). +> ✅ **Fully bilingual.** All 34 tutorial chapters, 25 templates, and the first 6 cases are available in English — use the language switch (top-right) or browse `en/` in the repo. [Contributions welcome](https://github.com/study8677/awesome-architecture). 
## 🗺️ Browse all 25 architecture maps @@ -49,6 +49,10 @@ Click a category to filter, then click any card to jump to that template's archi The case track is not more templates. It is a full product walkthrough: starting architecture, quantified trigger signals, ADRs, data flow, failure fallbacks, and quick checks. 👉 [Enter the case track](/en/cases/README) +## 🧰 Learn technology stack selection + +The new technology stack selection track (27–34) is not a framework tutorial. It teaches the architectural judgment behind choosing languages, databases, caches, APIs, deployment platforms, observability tooling, and AI infrastructure. 👉 [Start from Chapter 27](/en/tutorial/27-编程语言与后端框架选型) + ## ⚖️ Architecture is a series of forks in the road The most valuable section of every template is "Key Decisions & Trade-offs." Try this interactive starter: diff --git "a/en/tutorial/17-\345\244\247\346\250\241\345\236\213\346\227\266\344\273\243\347\232\204\346\236\266\346\236\204\345\210\244\346\226\255.md" "b/en/tutorial/17-\345\244\247\346\250\241\345\236\213\346\227\266\344\273\243\347\232\204\346\236\266\346\236\204\345\210\244\346\226\255.md" index 8fa44d4..1d52e3b 100644 --- "a/en/tutorial/17-\345\244\247\346\250\241\345\236\213\346\227\266\344\273\243\347\232\204\346\236\266\346\236\204\345\210\244\346\226\255.md" +++ "b/en/tutorial/17-\345\244\247\346\250\241\345\236\213\346\227\266\344\273\243\347\232\204\346\236\266\346\236\204\345\210\244\346\226\255.md" @@ -1,10 +1,10 @@ # 17 · Architectural Judgment in the Age of LLMs: What Makes You Irreplaceable in the Vibe-Coding Era -> The thesis in one line: **once "writing code" collapses into a cheap act — a few seconds, a few sentences of natural language — the only thing left that is scarce and valuable is this: judging, before you start, what the system should look like, where it will die, and what you're trading for what.** This is the finale of the advanced track, and of the whole tutorial: projecting every hard truth before it onto the AI moment we're
living in. +> The thesis in one line: **once "writing code" collapses into a cheap act — a few seconds, a few sentences of natural language — the only thing left that is scarce and valuable is this: judging, before you start, what the system should look like, where it will die, and what you're trading for what.** This is the finale of the advanced track: projecting every hard truth before it onto the AI moment we're living in. --- -> **🏁 The whole tutorial ends here.** The foundations track (01–09) taught you to **read a system and design a small-to-medium one from scratch**; the advanced track (10–16) taught you to **tame the hard rock that only bites once a system gets big or critical**: distribution, failure, scale, evolution, organization, security. This last chapter introduces no new "hard rock." It does something more important — **it turns that judgment toward the present: an era where AI writes your code (vibe coding) while simultaneously spawning a whole new class of systems (LLM / Agent).** +> **🏁 The advanced track closes here.** The foundations track (01–09) taught you to **read a system and design a small-to-medium one from scratch**; the advanced track (10–16) taught you to **tame the hard rock that only bites once a system gets big or critical**: distribution, failure, scale, evolution, organization, security. This chapter introduces no new "hard rock." 
It does something more important — **it turns that judgment toward the present: an era where AI writes your code (vibe coding) while simultaneously spawning a whole new class of systems (LLM / Agent).** --- diff --git "a/en/tutorial/23-\350\247\204\346\240\274\345\215\263\346\236\266\346\236\204\347\272\246\346\235\237\346\200\216\344\271\210\345\206\231\347\273\231AI.md" "b/en/tutorial/23-\350\247\204\346\240\274\345\215\263\346\236\266\346\236\204\347\272\246\346\235\237\346\200\216\344\271\210\345\206\231\347\273\231AI.md" index 077701f..f000639 100644 --- "a/en/tutorial/23-\350\247\204\346\240\274\345\215\263\346\236\266\346\236\204\347\272\246\346\235\237\346\200\216\344\271\210\345\206\231\347\273\231AI.md" +++ "b/en/tutorial/23-\350\247\204\346\240\274\345\215\263\346\236\266\346\236\204\347\272\246\346\235\237\346\200\216\344\271\210\345\206\231\347\273\231AI.md" @@ -6,7 +6,7 @@ > **🤝 AI-collaborative design track, ch. 1 · What this track is about** > -> **Prerequisites: the practitioner track (18–22).** You already know how to **design, evolve, and migrate** AI systems. This final track shifts the angle: **not "how you build" but "how you build with AI without losing control."** It skips vibe-coding tool tips and focuses on two things — **how to write constraints for AI (this chapter + 25) and how to review AI output (24)** — closing with (26) a decision tree. Same product line as the [architecture-copilot](https://github.com/study8677/architecture-copilot) skill. +> **Prerequisites: the practitioner track (18–22).** You already know how to **design, evolve, and migrate** AI systems. The AI-collaborative track shifts the angle: **not "how you build" but "how you build with AI without losing control."** It skips vibe-coding tool tips and focuses on two things — **how to write constraints for AI (this chapter + 25) and how to review AI output (24)** — closing with (26) a decision tree. 
Same product line as the [architecture-copilot](https://github.com/study8677/architecture-copilot) skill. --- diff --git "a/en/tutorial/25-\350\257\204\346\265\213\351\251\261\345\212\250\346\212\212\345\244\237\345\245\275\345\206\231\350\277\233\346\236\266\346\236\204.md" "b/en/tutorial/25-\350\257\204\346\265\213\351\251\261\345\212\250\346\212\212\345\244\237\345\245\275\345\206\231\350\277\233\346\236\266\346\236\204.md" index 553a92f..4052b15 100644 --- "a/en/tutorial/25-\350\257\204\346\265\213\351\251\261\345\212\250\346\212\212\345\244\237\345\245\275\345\206\231\350\277\233\346\236\266\346\236\204.md" +++ "b/en/tutorial/25-\350\257\204\346\265\213\351\251\261\345\212\250\346\212\212\345\244\237\345\245\275\345\206\231\350\277\233\346\236\266\346\236\204.md" @@ -138,7 +138,7 @@ Treat eval as an architecture component, which means looking at its costs the wa - **Eval does not replace traditional testing**: deterministic logic still uses unit tests; eval only governs the quality distribution of nondeterministic output. - **Eval is not free**: it burns money, the judge makes mistakes, it overfits and goes stale — balance coverage against cost, calibrate and maintain continuously. It is the "quality fitness function" of an AI system ([Chapter 14](https://github.com/study8677/awesome-architecture/blob/main/tutorial/14-演进与拆分大型系统.md)). -> **Bridging forward**: at this point, the three weapons of AI collaboration are complete — **spec (23) supplies the constraints, checklist (24) reviews the output, eval (25) guards quality**. But when do you reach for which weapon? When is it fine to vibe, and when must you go spec-first? The final chapter, [26 · Collaboration decision tree: when to vibe, when to spec-first](https://github.com/study8677/awesome-architecture/blob/main/tutorial/26-协作决策树何时vibe何时spec-first.md), wraps all three weapons into **a workflow you can actually follow**, and closes out the full 26-chapter tutorial. 
+> **Bridging forward**: at this point, the three weapons of AI collaboration are complete — **spec (23) supplies the constraints, checklist (24) reviews the output, eval (25) guards quality**. But when do you reach for which weapon? When is it fine to vibe, and when must you go spec-first? The final chapter of the AI-collaborative track, [26 · Collaboration decision tree: when to vibe, when to spec-first](26-协作决策树何时vibe何时spec-first.md), wraps all three weapons into **a workflow you can actually follow**. --- diff --git "a/en/tutorial/26-\345\215\217\344\275\234\345\206\263\347\255\226\346\240\221\344\275\225\346\227\266vibe\344\275\225\346\227\266spec-first.md" "b/en/tutorial/26-\345\215\217\344\275\234\345\206\263\347\255\226\346\240\221\344\275\225\346\227\266vibe\344\275\225\346\227\266spec-first.md" index 0e0f72a..f942a84 100644 --- "a/en/tutorial/26-\345\215\217\344\275\234\345\206\263\347\255\226\346\240\221\344\275\225\346\227\266vibe\344\275\225\346\227\266spec-first.md" +++ "b/en/tutorial/26-\345\215\217\344\275\234\345\206\263\347\255\226\346\240\221\344\275\225\346\227\266vibe\344\275\225\346\227\266spec-first.md" @@ -1,10 +1,10 @@ # 26 · Collaboration Decision Tree: When to Vibe, When to Spec-First -> The thesis in one line: **the question is not "should we use AI to write code" — it is "for this particular piece, should we let it vibe freely, or lock down the spec first and then let it write." Prototype all you want by vibing; close out production work with judgment. This chapter distills the previous three chapters (spec / review / eval) into a decision tree you can actually follow, and draws the curtain on the full 26-chapter tutorial.** +> The thesis in one line: **the question is not "should we use AI to write code" — it is "for this particular piece, should we let it vibe freely, or lock down the spec first and then let it write." Prototype all you want by vibing; close out production work with judgment. 
This chapter distills the previous three chapters (spec / review / eval) into a decision tree you can actually follow, and closes the AI-collaborative design track.** --- -> **🏁 The whole tutorial ends here.** You have walked through four tracks: **reading systems and designing from scratch ([01–09](https://github.com/study8677/awesome-architecture/blob/main/tutorial/01-为什么先有架构思维.md)) → taming the hard rock of distributed, critical systems ([10–17](https://github.com/study8677/awesome-architecture/blob/main/tutorial/10-分布式系统的硬道理.md)) → applying the methods to real AI systems ([18–22](https://github.com/study8677/awesome-architecture/blob/main/tutorial/18-读地图用框架拆解陌生系统.md)) → learning to write constraints for AI, review it, and guard its quality ([23–25](https://github.com/study8677/awesome-architecture/blob/main/tutorial/23-规格即架构约束怎么写给AI.md))**. This chapter collects the final three weapons into one workflow — and then, it is your turn. +> **🤝 Final chapter of the AI-collaborative design track.** You have walked through four capabilities: **reading systems and designing from scratch ([01–09](01-为什么先有架构思维.md)) → taming the hard rock of distributed, critical systems ([10–17](10-分布式系统的硬道理.md)) → applying the methods to real AI systems ([18–22](18-读地图用框架拆解陌生系统.md)) → learning to write constraints for AI, review it, and guard its quality ([23–25](23-规格即架构约束怎么写给AI.md))**. This chapter collects the final three weapons into one workflow. --- @@ -111,14 +111,14 @@ Whether you are "letting AI write code" or "letting AI act autonomously at runti ## 5. 
Back to the beginning: what you have actually been training -After 26 chapters, we return to [Chapter 01](https://github.com/study8677/awesome-architecture/blob/main/tutorial/01-为什么先有架构思维.md) and the reason this repo exists: +At the end of the AI-collaborative design track, we return to [Chapter 01](01-为什么先有架构思维.md) and the reason this repo exists: > **Writing code is becoming cheap; architectural judgment is becoming unprecedentedly scarce and valuable.** From start to finish, this tutorial has never taught a framework or a syntax — AI can produce those in seconds. It taught something **that does not depreciate**: ``` - Will depreciate (AI is making it cheap) Will not depreciate (what you trained across 26 chapters) + Will depreciate (AI is making it cheap) Will not depreciate (what these chapters train) ────────────────────────────────── ────────────────────────────────────────────────────────── • Memorizing an API / syntax • Take a vague requirement and ask the right questions (02/07) • Hand-writing boilerplate impl • Make well-reasoned decisions amid trade-offs (06/08) @@ -128,7 +128,7 @@ From start to finish, this tutorial has never taught a framework or a syntax — → Hand off to AI → This is you, in the AI era ``` -And the final track (23–26) is really saying one thing: **architectural judgment is not obsolete in the AI era — it has a brand-new, extremely high-leverage arena: through the interface of "spec / review / eval / decision," you turn your judgment into the behavioral constraints of an AI army.** You are no longer just "designing a system" — you are "**designing the guardrails and process that continuously produce good systems through AI**." Judgment is amplified, not replaced. 
+And the AI-collaborative track (23–26) is really saying one thing: **architectural judgment is not obsolete in the AI era — it has a brand-new, extremely high-leverage arena: through the interface of "spec / review / eval / decision," you turn your judgment into the behavioral constraints of an AI army.** You are no longer just "designing a system" — you are "**designing the guardrails and process that continuously produce good systems through AI**." Judgment is amplified, not replaced. > **Architectural wisdom**: **vibe coding is not the end of judgment — it is its lever.** A person with judgment + AI = the output of ten people; a person without judgment + AI = ten times the speed of building systems you cannot read, cannot sustain, and cannot change. What decides the outcome has never been how powerful the AI is, but whether the person holding the wheel can read the map clearly. @@ -162,9 +162,9 @@ And the final track (23–26) is really saying one thing: **architectural judgme --- -## 🏁 Tutorial finale: your turn +## 🤝 AI-collab track close: judgment keeps going -26 chapters, four tracks, one through-line: +Up to Chapter 26, the through-line has been: ``` 01–09 Read systems, design a small-to-medium system from scratch —— build judgment @@ -180,11 +180,9 @@ And the final track (23–26) is really saying one thing: **architectural judgme What this tutorial gave you was never conclusions — it was **the ability to ask questions**. When you can naturally ask, of every technical choice and every piece of AI output, "**why this one? what is the cost? where will it die?**" — you are already thinking like an architect. 
-> **Now, close the tutorial.** Pick a system you are working on, or any map in [`templates/`](https://github.com/study8677/awesome-architecture/blob/main/templates/README.md), and walk through it: **read it ([18](https://github.com/study8677/awesome-architecture/blob/main/tutorial/18-读地图用框架拆解陌生系统.md)) → design your own version ([19](https://github.com/study8677/awesome-architecture/blob/main/tutorial/19-完整设计演练中等复杂度系统.md)) → think through how it evolves ([20](https://github.com/study8677/awesome-architecture/blob/main/tutorial/20-演进剧本MVP到规模化.md)) → write the constraints for AI ([23](https://github.com/study8677/awesome-architecture/blob/main/tutorial/23-规格即架构约束怎么写给AI.md)) → let AI build, you close it out ([24](https://github.com/study8677/awesome-architecture/blob/main/tutorial/24-审查清单AI产出默认缺什么.md) / [25](https://github.com/study8677/awesome-architecture/blob/main/tutorial/25-评测驱动把够好写进架构.md)).** +> **Keep going.** Once you can design systems and collaborate with AI, real projects keep asking "what technology should we use?" The next track starts with [27 · Programming languages and backend frameworks](27-编程语言与后端框架选型.md), applying the same architectural judgment to languages, databases, cache, APIs, deployment, observability, and AI infrastructure. > -> If you want a coach to walk with you, use the companion [architecture-copilot](https://github.com/study8677/architecture-copilot) skill — it turns these 26 chapters into an interactive partner inside Claude Code / Cursor / Codex that **guides you through architectural judgment step by step**. 
-> -> In an era where AI writes code for everyone, may you become the one who — **reads the map clearly before deciding whether to take the road.** +> If you want a coach to walk with you, use the companion [architecture-copilot](https://github.com/study8677/architecture-copilot) skill — it turns this tutorial into an interactive partner inside Claude Code / Cursor / Codex that **guides you through architectural judgment step by step**. --- diff --git "a/en/tutorial/27-\347\274\226\347\250\213\350\257\255\350\250\200\344\270\216\345\220\216\347\253\257\346\241\206\346\236\266\351\200\211\345\236\213.md" "b/en/tutorial/27-\347\274\226\347\250\213\350\257\255\350\250\200\344\270\216\345\220\216\347\253\257\346\241\206\346\236\266\351\200\211\345\236\213.md" new file mode 100644 index 0000000..3d8431b --- /dev/null +++ "b/en/tutorial/27-\347\274\226\347\250\213\350\257\255\350\250\200\344\270\216\345\220\216\347\253\257\346\241\206\346\236\266\351\200\211\345\236\213.md" @@ -0,0 +1,164 @@ +# 27 · Programming Languages and Backend Framework Selection + +> The thesis in one line: **language and framework choice is not a belief system. It is a constraint problem. An architect does not ask "which one is hottest"; an architect asks "will this choice change team speed, runtime cost, performance ceiling, ecosystem availability, and future migration cost?" If yes, it is an architecture decision. If it only changes syntax, it is implementation detail.** + +--- + +> **🧰 Technology Stack Selection Track, Chapter 1 · One thing to practice** +> +> The first 26 chapters kept saying "architecture is not framework choice." That remains true. But real systems still need Java, Go, Python, TypeScript, Rust, and backend frameworks. This track does not teach syntax. It brings "what tech should we use" back into the framework from [Chapter 02](02-架构师的思考框架.md): requirements, constraints, quality attributes, trade-offs. 
+ +--- + +## Opening: selection is not a popularity vote + +Teams often choose languages like this: + +``` + I like Go -> use Go + Java hiring is easy -> use Java + AI ecosystem is Python -> use Python + Frontend is TS -> use TypeScript +``` + +These are signals, not decisions. Keep asking: + +- What is the bottleneck of this business path? +- Is the team short on delivery speed, runtime efficiency, stability, or hiring supply? +- Does the ecosystem have mature libraries, test tools, monitoring, and deployment patterns? +- Can someone new maintain this code three years from now? + +> **Rule of thumb:** if a language/framework choice materially affects quality attributes ([Chapter 06](06-质量属性与取舍.md)), it is architecture. If it is only code style, do not turn it into a technology war. + +--- + +## 1. Separate language, runtime, and framework + +Beginners often mix three layers: + +| Layer | What it decides | Examples | Architecture concern | +|---|---|---|---| +| **Language** | Expression style, type system, ecosystem entry | Java, Go, Python, TypeScript, Rust | Team familiarity, maintainability, early error detection | +| **Runtime** | Concurrency, memory, startup, deployment shape | JVM, Node.js, CPython, Go runtime | Latency, throughput, resource cost, cold start | +| **Framework** | Conventions and component assembly | Spring Boot, FastAPI, NestJS, Gin | Delivery speed, testability, plugin ecosystem, team consistency | + +So "Java or Go?" is not a complete question. A better question is: + +> Given this team, business, and quality target, what runtime and delivery model do we need? + +For an internal SaaS backend such as [PatchDesk](../cases/patchdesk-saas/README.md), the early challenge is not maximum QPS. It is permissions, tenant isolation, audit, reports, and long-term maintenance. In that context, mature framework conventions often matter more than raw performance. + +--- + +## 2. 
Five rulers before tool names + +| Ruler | Question | What it pushes toward | +|---|---|---| +| **Business complexity** | Many rules, states, permissions? | Strong typing, clear conventions, mature testing | +| **Performance and resources** | Is CPU, memory, or P99 tail latency central? | Lower runtime overhead, clearer concurrency model | +| **Ecosystem maturity** | Auth, ORM, queues, observability, SDKs ready? | Deep ecosystem, stable docs, active community | +| **Team capability** | What does the team know? Can it review and hire? | Main team language, or a new language with controlled learning cost | +| **Delivery and evolution** | Fast iteration or long-term reliability? | Clear framework conventions and migration paths | + +> **Architectural wisdom:** do not change stack because a language is "more modern." A new technology deserves consideration only when it clearly buys a quality attribute and you are willing to pay learning, ops, hiring, and migration costs. + +--- + +## 3. Common backend choices as trade-offs + +| Technology | Common strengths | Common costs | Good fit | +|---|---|---|---| +| **Java / Kotlin + JVM** | Mature ecosystem, stable performance, enterprise libraries | Can feel heavy; framework complexity | Medium/large business systems, finance, e-commerce, SaaS | +| **Go** | Simple deployment, direct concurrency, low resource use | Complex business modeling needs discipline | Gateways, infrastructure, microservices, realtime paths | +| **Python** | AI/data ecosystem, fast prototyping | Runtime performance and concurrency need care | AI services, data platforms, automation | +| **TypeScript / Node.js** | One language across frontend/backend, I/O friendly | CPU-heavy work is a poor fit; dependency quality varies | BFF, small/mid SaaS, lightweight realtime services | +| **Rust** | Performance and memory safety | Higher learning curve, slower delivery for many teams | Storage, proxies, engines, safety/performance-critical components | + +A system 
does not have to be written in a single language: + +``` + Core business service: Java / Go / TypeScript + AI and data processing: Python + High-performance proxy or engine: Rust / Go + Frontend and BFF: TypeScript +``` + +Polyglot stacks are not wrong, but they carry a cognitive tax: build, deploy, monitor, debug, hire, and review all get harder. A small team that uses five stacks because "each module gets the best language" is often spending organizational capacity it does not have ([Chapter 15](15-组织即架构.md)). + +--- + +## 4. Framework selection: mature by default + +Frameworks are team agreements in code: + +``` + A framework gives: + routing / dependency injection / config / data access / auth / testing / observability entry points + + A framework takes: + freedom / learning cost / upgrade cost / debugging transparency +``` + +For MVPs, reduce delivery risk. For long-lived team systems, reduce collaboration risk. Often, a mature framework that looks boring is safer than a cool lightweight stack that relies on everyone maintaining perfect discipline. + +| Question | If yes | Tendency | +|---|---|---| +| Many new teammates, long-term maintenance? | Yes | Strong conventions, good docs, mature ecosystem | +| Fast experimentation, unclear business? | Yes | Lightweight framework + clear module boundaries | +| Lots of enterprise integration, transactions, permissions? | Yes | Mature enterprise framework | +| High-concurrency gateway/proxy? | Yes | Low overhead runtime and mature network model | +| AI/data-heavy service? | Yes | Stay close to Python/data ecosystem, wrap it with stable APIs | + +--- + +## 5.
When to change language or framework + +Change should be driven by signals: + +| Signal | Meaning | Possible action | +|---|---|---| +| P99 keeps missing SLO due to runtime mismatch | Not just bad code | Move hot path to a better runtime | +| Delivery slows because framework conventions fight the business | Complexity outgrew the framework | Modularize first, then migrate locally | +| Ecosystem gaps force heavy self-building | Maintenance cost keeps rising | Move to a richer ecosystem | +| Hiring and review are hard | Team cannot support the choice | Converge stack or strengthen platform constraints | +| Security/compliance/performance bar rises | Old stack cannot cover new requirements | Upgrade the critical component | + +As in [Chapter 14](14-演进与拆分大型系统.md), changing stack should be evolutionary: carve a boundary, run in parallel, use shadow traffic where useful, and switch gradually. + +--- + +## 🎯 Quick check + + + + + +--- + +## Chapter summary + +- **Language/framework choice is a constraint problem**: business complexity, performance, ecosystem, team, and evolution come before tool names. +- **Separate language, runtime, and framework**: they affect different parts of the system. +- **Mature is the default** unless a new stack clearly buys a quality attribute. +- **Polyglot has cognitive tax**: build, deploy, monitor, debug, hire, and review get harder. +- **Stack changes should evolve**: carve boundaries and migrate gradually, as in Chapter 14. + +> **Next:** language and frameworks decide how services run and teams collaborate. But data is the part that stays. [Chapter 28 · Database and Storage Selection](28-数据库与存储选型.md) asks where data should live, how it should be queried, and how it should remain correct. 
+ +--- + +## Related links + +- Method core: [02 · Thinking framework](02-架构师的思考框架.md) · [06 · Quality attributes](06-质量属性与取舍.md) · [08 · ADRs](08-架构决策记录与演进.md) +- Evolution: [14 · Evolution and splitting](14-演进与拆分大型系统.md) · [15 · Organization as architecture](15-组织即架构.md) +- Case references: [PatchDesk](../cases/patchdesk-saas/README.md) · [CodePilot](../cases/codepilot-agent/README.md) diff --git "a/en/tutorial/28-\346\225\260\346\215\256\345\272\223\344\270\216\345\255\230\345\202\250\351\200\211\345\236\213.md" "b/en/tutorial/28-\346\225\260\346\215\256\345\272\223\344\270\216\345\255\230\345\202\250\351\200\211\345\236\213.md" new file mode 100644 index 0000000..02de4c5 --- /dev/null +++ "b/en/tutorial/28-\346\225\260\346\215\256\345\272\223\344\270\216\345\255\230\345\202\250\351\200\211\345\236\213.md" @@ -0,0 +1,178 @@ +# 28 · Database and Storage Selection + +> The thesis in one line: **database choice is not just "MySQL or PostgreSQL." It is "what are this data's access patterns, consistency needs, query shapes, growth rate, and failure cost?" Choose storage by drawing the data lifecycle first. Tool names come later.** + +--- + +> **🧰 Technology Stack Selection Track, Chapter 2 · One thing to practice** +> +> [Chapter 05](05-数据与状态.md) said the hard part of systems is data. Languages can be changed, services can be split, but data placed in the wrong model is expensive to move. This chapter puts databases, caches, search, object storage, and vector stores on one selection map. + +--- + +## Opening: do not make one database solve every problem + +Many systems start like this: + +``` + App -> one relational database + |-- transaction data + |-- reports + |-- search filters + |-- file attachments + |-- AI retrieval vectors +``` + +For an MVP, this can be fine. As the system grows, these access patterns diverge: + +- Orders need correctness and audit. +- Reports scan a lot of history and should not slow the primary database. 
+- Search needs relevance ranking, not only `LIKE`. +- Images and videos need cheap storage and CDN delivery. +- RAG needs vector retrieval, metadata, citations, and permission filtering. + +> **Architectural judgment:** do not ask "which database is best." Ask "which storage model should carry which kind of data." A system may need multiple stores, but every new store adds synchronization, consistency, ops, and debugging cost. + +--- + +## 1. Draw the data lifecycle + +Before choosing a database, trace a piece of data: + +``` + write -> validate -> commit -> query/search -> analyze/report -> archive/delete + | | | | | | + who? how? how consistent? query shape? how often? retention? +``` + +Ask five questions: + +| Question | Why it matters | +|---|---| +| **Read-heavy or write-heavy?** | Drives read replicas, cache, index model | +| **Where is the transaction boundary?** | Decides whether relational transactions can protect it | +| **What is the query shape?** | Key lookup, range query, full-text, vector similarity are different | +| **How fast does data grow?** | Drives partitioning, hot/cold tiering, archival | +| **What happens if it is wrong?** | Drives consistency, backup, audit, recovery | + +This is the same discipline as the back-of-the-envelope work in [Chapter 07](07-从0到1设计一个系统.md): estimate volume, read/write ratio, and retention before tool choice. + +--- + +## 2. 
Primary storage: relational is still the default start + +| Type | Good for | Poor fit | +|---|---|---| +| **Relational database** | Transactions, orders, permissions, tenants, ledgers | Huge analytics, full-text search, large binary files | +| **Document database** | Flexible structure, whole-document reads/writes | Strict transactions, complex joins, rigorous reporting | +| **Key-value store** | High-speed key access, simple shape, very large scale | Ad hoc joins and flexible querying | +| **OLAP / analytical store** | Reports, aggregation, logs, behavior analytics | High-frequency small transactions | + +> **Default:** keep core transaction data in a relational source of truth. Split reports, search, logs, and vectors only when they become real pressure. + +--- + +## 3. Read models: search, analytics, and vectors are not the primary truth + +| Need | Common engine | Trade-off | +|---|---|---| +| **Full-text search** | Elasticsearch, OpenSearch, Meilisearch | Strong relevance, but index sync and eventual consistency are required | +| **Analytics** | ClickHouse, BigQuery, Snowflake | Fast scans/aggregates, not a transaction source | +| **Object storage** | S3, OSS, GCS | Cheap durable files, not complex querying | +| **Vector database** | Milvus, Qdrant, pgvector | Similarity retrieval, but permissions and recall need design | +| **Time-series database** | Prometheus, InfluxDB | Metrics over time, not generic business objects | + +The common mistake is making a read model the source of truth: + +``` + Correct: + primary DB = source of truth + search / analytics / vector store = read model rebuilt from source + + Wrong: + user updates profile -> only search index changes -> primary DB never knows +``` + +Read models may lag, but you must state how much lag is acceptable, how to repair, and how to rebuild. That is [Chapter 11](11-数据一致性工程.md) in practice. + +--- + +## 4. 
RAG: the vector store is not the whole system + +An enterprise RAG system such as [DocuMind](../cases/documind-rag/README.md) is often oversimplified as: + +``` + chunk document -> vector DB -> topK -> LLM answer +``` + +A real system is closer to: + +``` + original object storage -> parsing/chunking -> metadata DB + | | + +--------------> keyword index <------+ + +--------------> vector index <------+ + +--------------> permission filter <--+ +``` + +Ask: + +- Is permission filtering before retrieval or after retrieval? +- Is retrieval vector-only, or hybrid search (keyword + vector)? +- Where are originals and citations stored? +- Can indexes be rebuilt from originals and metadata? +- Does cost scale with document and query volume? + +> A vector database solves similarity recall. It does not solve source of truth, permissions, citations, evaluation, or cost by itself. + +--- + +## 5. When to split storage + +| Signal | Meaning | Possible action | +|---|---|---| +| Reports slow the transaction DB | OLTP and OLAP interfere | Sync to an analytical store | +| Search relevance is poor or `LIKE` scans are slow | Query shape is full-text search | Build a search index | +| Attachments bloat the DB | Binary files do not belong in core tables | Move to object storage + CDN | +| Table growth slows indexes and backups | No lifecycle strategy | Partition, archive, tier hot/cold data | +| RAG recall quality is unstable | Single vector retrieval is not enough | Hybrid retrieval + rerank + eval | + +The opposite matters too: if data volume is small, team size is small, and failure cost is low, one relational database with good indexes and backups may be healthier than five stores. + +--- + +## 🎯 Quick check + + + + + +--- + +## Chapter summary + +- **Draw the data lifecycle before choosing storage**: write, validate, commit, query, analyze, archive. +- **Relational databases remain the default truth source** for many business systems. 
+- **Search, analytics, and vector stores are read models** for specific query shapes. +- **RAG is not a vector store**: it also needs originals, metadata, permissions, retrieval strategy, citations, and evals. +- **Split storage on signals**: reports hurting transactions, search getting slow, files bloating DB, recall failing. + +> **Next:** primary storage answers "where does truth live." Hotspots, traffic spikes, and cross-boundary collaboration bring caches, queues, and events. [Chapter 29](29-缓存消息队列与事件系统选型.md) covers those middle layers. + +--- + +## Related links + +- Method core: [05 · Data and state](05-数据与状态.md) · [06 · Quality attributes](06-质量属性与取舍.md) · [11 · Consistency engineering](11-数据一致性工程.md) +- Evolution: [13 · Mechanics of scale](13-规模化的力学.md) · [14 · Evolution and splitting](14-演进与拆分大型系统.md) +- Case references: [DocuMind](../cases/documind-rag/README.md) · [StarArena](../cases/stararena-ticketing/README.md) diff --git "a/en/tutorial/29-\347\274\223\345\255\230\346\266\210\346\201\257\351\230\237\345\210\227\344\270\216\344\272\213\344\273\266\347\263\273\347\273\237\351\200\211\345\236\213.md" "b/en/tutorial/29-\347\274\223\345\255\230\346\266\210\346\201\257\351\230\237\345\210\227\344\270\216\344\272\213\344\273\266\347\263\273\347\273\237\351\200\211\345\236\213.md" new file mode 100644 index 0000000..60f5cc3 --- /dev/null +++ "b/en/tutorial/29-\347\274\223\345\255\230\346\266\210\346\201\257\351\230\237\345\210\227\344\270\216\344\272\213\344\273\266\347\263\273\347\273\237\351\200\211\345\236\213.md" @@ -0,0 +1,194 @@ +# 29 · Cache, Message Queue, and Event System Selection + +> The thesis in one line: **cache is not a database, a queue is not magic, and an event system is not "just add Kafka." They solve three pressures: read hotspots, write spikes, and cross-boundary collaboration. 
Before choosing, ask whether you are reducing latency, smoothing peaks, or decoupling state progress.** + +--- + +> **🧰 Technology Stack Selection Track, Chapter 3 · One thing to practice** +> +> [Chapter 28](28-数据库与存储选型.md) covered truth sources and read models. This chapter covers the most common middle layers around them: cache, message queue, and event system. They can save a system, and they can also make it harder to reason about. + +--- + +## Opening: these three are often mixed up + +Many diagrams show: + +``` + App -> Redis -> MQ -> Kafka -> Worker +``` + +Then people say the architecture is advanced. Ask instead: + +- Is Redis storing disposable cache or business state that must not be lost? +- Is the queue smoothing a spike or coordinating services? +- Is Kafka carrying commands, or events that describe facts? +- What happens on duplicate, out-of-order, failed, or backlogged messages? + +> **Architectural judgment:** the value of a middle layer is not its name. It is the quality attribute it changes: latency, throughput, availability, coupling, consistency, recovery cost. + +--- + +## 1. 
Cache: accelerate, do not usurp truth + +Cache solves **read hotspots**: + +``` + request -> app -> cache hit -> return + \-> miss -> primary DB -> fill cache -> return +``` + +Three common mistakes: + +| Mistake | Result | Better approach | +|---|---|---| +| Treat cache as source of truth | Losing cache loses data | Primary store is truth; cache is rebuildable | +| No invalidation strategy | Users see stale or wrong values | TTL, active invalidation, versions | +| Hot key stampede on miss | Primary DB gets crushed | Null cache, request coalescing, rate limits, warmup | + +Types of cache: + +| Cache type | Good for | Watch out | +|---|---|---| +| **Local cache** | Config, dictionaries, rarely-changing data | Multiple instances can diverge | +| **Distributed cache** | Hot objects, sessions, counters, rate limits | Network cost, capacity, eviction | +| **CDN** | Images, videos, static resources, public pages | Invalidation delay and edge consistency | + +> If cache disappears, the system should get slower, not wrong. If it becomes wrong, you stored business truth in cache. + +--- + +## 2. Queue: turn "do now" into "do reliably later" + +Queues smooth peaks: + +``` + spike -> admission/rate limit -> queue -> workers consume at safe rate -> DB +``` + +Good fits: + +- Ticket notification after seat lock. +- Coupons, SMS, email after order success. +- Parsing, chunking, indexing after document upload. +- Video transcoding after upload. + +But queues turn a synchronous world into an asynchronous one: + +| New problem | Required answer | +|---|---| +| **Duplicate messages** | Is the consumer idempotent? | +| **Message loss** | How do producer, storage, and ack work? | +| **Ordering** | Do you need order per business key? | +| **Backlog** | What does the user see? How do you degrade? | +| **Dead letter** | Where do failed messages go? Who repairs them? | + +A queue does not simply "add reliability." It trades request latency for asynchronous consistency and recovery work.
+ +--- + +## 3. Events: say what happened, not what others must do + +Commands and events differ: + +| Type | Meaning | Example | Who owns the result | +|---|---|---|---| +| **Command** | Please do something | `CreateOrder`, `SendEmail` | Receiver must perform or fail | +| **Event** | Something already happened | `OrderPaid`, `TicketLocked` | Subscribers react as needed | + +Events propagate business facts: + +``` + Order service: OrderPaid + |-- Inventory: confirm deduction + |-- Notification: send message + |-- Data platform: update reports + |-- Risk: record behavior +``` + +This decouples the publisher from downstream consumers. Costs: + +- Event schemas become contracts. +- Downstream failure cannot simply roll back the fact. +- Too many small events flood the system; overly broad events hide meaning. +- Tracing is mandatory as the flow spreads. + +--- + +## 4. Understand products by semantics first + +| Type | Common examples | Feels like | Good for | +|---|---|---|---| +| **Task queue** | RabbitMQ, Celery, Sidekiq | Work assignment | Background jobs, email, image processing | +| **Log/event stream** | Kafka, Pulsar | Replayable fact log | Event bus, data sync, audit, stream processing | +| **Lightweight messaging/stream** | Redis Streams, NATS | Fast internal channel | Small/mid async workflows, low-latency messaging | +| **Managed cloud queue** | SQS, Pub/Sub | Reliable queue with less ops | Cloud-native teams avoiding self-ops | + +Ask: + +1. Do messages need replay? +2. Do you need complex routing and delivery guarantees? +3. Can the team operate the cluster? +4. Is this message part of audit truth? + +--- + +## 5. Outbox: do not split DB commit from event publish + +Classic failure: + +``` + 1. Order write succeeds + 2. 
OrderCreated event publish fails + result: DB has the order, downstream never hears about it +``` + +Outbox pattern: + +``` + one local transaction: + write business row + write outbox row + | + v + relay scans outbox -> publishes message -> marks delivered +``` + +It adds a table, relay, retries, and idempotency, but it keeps fact writing and event publishing controllable. This is core [Chapter 11](11-数据一致性工程.md) consistency engineering. + +--- + +## 🎯 Quick check + + + + + +--- + +## Chapter summary + +- **Cache solves read hotspots**: it should be rebuildable and not become source of truth. +- **Queues solve spikes and background work**: they introduce async consistency, backlog, and recovery. +- **Events propagate facts**: "what happened," not "please do this." +- **Choose semantics before products**: task queue, event stream, lightweight messaging, managed queue solve different problems. +- **Outbox is a consistency basic** for crossing service boundaries safely. + +> **Next:** caches, queues, and events handle pressure behind services. [Chapter 30](30-API与服务通信选型.md) looks at how services speak directly: REST, gRPC, GraphQL, Webhook, and event APIs. 
+ +--- + +## Related links + +- Method core: [11 · Consistency engineering](11-数据一致性工程.md) · [12 · Designing for failure](12-为失败而设计.md) · [13 · Mechanics of scale](13-规模化的力学.md) +- Templates: [Notification system](../templates/notification-system/README.md) · [Online ticketing](../templates/online-ticketing/README.md) · [RAG knowledge base](../templates/rag-knowledge-base/README.md) +- Cases: [StarArena](../cases/stararena-ticketing/README.md) · [DocuMind](../cases/documind-rag/README.md) · [FeedStream](../cases/feedstream-content/README.md) diff --git "a/en/tutorial/30-API\344\270\216\346\234\215\345\212\241\351\200\232\344\277\241\351\200\211\345\236\213.md" "b/en/tutorial/30-API\344\270\216\346\234\215\345\212\241\351\200\232\344\277\241\351\200\211\345\236\213.md" new file mode 100644 index 0000000..1932002 --- /dev/null +++ "b/en/tutorial/30-API\344\270\216\346\234\215\345\212\241\351\200\232\344\277\241\351\200\211\345\236\213.md" @@ -0,0 +1,173 @@ +# 30 · API and Service Communication Selection + +> The thesis in one line: **API selection is not a format choice between REST and gRPC. It is a boundary choice. Sync or async, internal or external, strict contract or flexible query, one response or continuous stream: these decide how services should communicate.** + +--- + +> **🧰 Technology Stack Selection Track, Chapter 4 · One thing to practice** +> +> [Chapter 04](04-十大核心架构模式.md) covered layered, microservice, and event-driven patterns. [Chapter 29](29-缓存消息队列与事件系统选型.md) covered async middle layers. Now we focus on service boundaries: once two systems talk, you must choose communication style, contract, versioning, failure handling, and permission boundary. + +--- + +## Opening: communication style determines coupling style + +The same "order tells inventory to deduct" can be done as: + +``` + A. Order calls inventory REST API synchronously + B. Order calls inventory gRPC synchronously + C. Order publishes OrderCreated; inventory consumes asynchronously + D. 
Inventory exposes GraphQL; order queries as needed + E. Inventory calls order via Webhook +``` + +All can work. They couple differently: + +- Synchronous calls give clear results, but the caller is slowed or broken by the callee. +- Async events decouple, but results are not immediate and consistency is harder. +- GraphQL gives clients flexibility, but server governance and performance get harder. +- Webhooks fit external notifications, but require retry, signature, and idempotency. + +> **Architectural judgment:** choose interaction semantics before protocol. Do not start with "we use gRPC." Start with "does this path need to know the result now?" + +--- + +## 1. First cut: sync or async + +| Style | Good for | Cost | +|---|---|---| +| **Synchronous request/response** | User waits for result, immediate validation, direct failure feedback | Longer chains, P99 tail latency adds up, failures propagate | +| **Async message/event** | Can finish later, smooth spikes, multiple downstream consumers | State progression, idempotency, compensation, backlog | +| **Streaming** | Continuous output, realtime state, long task progress | Connection management, backpressure, reconnect recovery | + +Rule: + +``` + User must know whether to continue now -> sync + User only needs "accepted" -> async + User needs continuous updates -> streaming +``` + +In [StarArena](../cases/stararena-ticketing/README.md), entry/admission needs sync; ticket notification can be async; queue position updates fit streaming or polling. + +--- + +## 2. 
REST, gRPC, and GraphQL do not replace each other + +| Style | Better fit | Poor fit | +|---|---|---| +| **REST** | External APIs, normal Web/SaaS, easy debugging, common ecosystem | Very high-frequency internal calls needing strict types | +| **gRPC** | Internal service calls, low latency, high throughput, strict IDL | Browser-first public APIs, low-friction partner APIs | +| **GraphQL** | Multi-client aggregation, fast-changing fields, frontend flexibility | Complex writes, weak cache/permission/limit governance | +| **Webhook** | Third-party event notification, payment callbacks, external integration | Core sync path that needs immediate result | +| **MCP** | Exposing tools/resources/context to AI Agents | Normal business service calls with no Agent semantics | + +Boundary matters: + +- External APIs should be understandable, stable, and versionable. +- Internal hot paths can prefer strong contracts and performance. +- Frontend aggregation can use GraphQL if governance exists. +- Webhooks need signature, idempotency, replay protection. +- Agent tool APIs need permissions and human approval in the boundary ([Chapter 23](23-规格即架构约束怎么写给AI.md)). + +--- + +## 3. Contract matters more than protocol + +API failures often come from unclear contracts: + +| Contract point | Must specify | +|---|---| +| **Input/output** | Field meaning, required/optional, units, enums | +| **Error semantics** | Retryable, user error, system error | +| **Idempotency** | What happens if the same request repeats? | +| **Versioning** | How fields are added, deprecated, removed | +| **Rate limits** | Who can call how much? What happens when exceeded? | +| **Security boundary** | Authn, authz, signatures, audit | + +Without contract governance, REST becomes random URLs, GraphQL becomes arbitrary database exposure, and gRPC becomes a strongly typed ball of mud. + +--- + +## 4. 
Internal calls: prevent unbounded call chains + +Microservice performance often fails through fan-out: + +``` + user request + \- A + |-- B + | |-- D + | \-- E + \-- C + |-- F + \-- G +``` + +Each hop adds: + +- Network latency. +- Timeout and retry storms. +- Dependency failure propagation. +- Trace/debugging cost. + +Internal APIs need: + +1. **Timeout budgets**: if the upstream budget is 500ms, downstream calls cannot each get 500ms as well; the budget must be split across hops. +2. **Retry discipline**: retry only idempotent operations, with backoff and jitter. +3. **Degradation**: when a non-critical dependency fails, return a partial or fallback result. + +This is the resilience engineering of [Chapter 12](12-为失败而设计.md). + +--- + +## 5. External APIs: stability beats elegance + +Once an external API is published, it is a promise: + +- **Backward compatibility**: adding fields is usually safe; removing or changing meaning is dangerous. +- **Stable errors**: customers write code against error semantics. +- **Docs and examples**: if external developers cannot understand it, elegance does not help. +- **Signatures and replay protection**: critical for payment, webhooks, Agent tools. +- **Audit and rate limits**: you need traceability and abuse control. + +For platform products, API is part of the product. Versioning and compatibility are architecture boundaries. + +--- + +## 🎯 Quick check + + + + + +--- + +## Chapter summary + +- **Communication style determines coupling**: sync, async, and streaming fit different semantics. +- **REST, gRPC, GraphQL, Webhook, and MCP have different boundaries**. +- **Contract matters more than protocol**: fields, errors, idempotency, versions, limits, security. +- **Internal calls need fan-out control**: timeout budget, retry discipline, degradation, trace. +- **External API is a product promise**: compatibility, docs, errors, signatures, audit. + +> **Next:** we have chosen service implementation, storage, middle layers, and communication. 
[Chapter 31](31-云原生与部署平台选型.md) asks where these things run, who scales them, who releases them, and who rescues them. + +--- + +## Related links + +- Method core: [04 · Core patterns](04-十大核心架构模式.md) · [12 · Designing for failure](12-为失败而设计.md) · [16 · Security and multi-tenancy](16-安全与多租户架构.md) +- AI collaboration: [23 · Spec as architecture](23-规格即架构约束怎么写给AI.md) · [26 · Collaboration decision tree](26-协作决策树何时vibe何时spec-first.md) +- Cases: [StarArena](../cases/stararena-ticketing/README.md) · [CodePilot](../cases/codepilot-agent/README.md) · [SyncRoom](../cases/syncroom-collaboration/README.md) diff --git "a/en/tutorial/31-\344\272\221\345\216\237\347\224\237\344\270\216\351\203\250\347\275\262\345\271\263\345\217\260\351\200\211\345\236\213.md" "b/en/tutorial/31-\344\272\221\345\216\237\347\224\237\344\270\216\351\203\250\347\275\262\345\271\263\345\217\260\351\200\211\345\236\213.md" new file mode 100644 index 0000000..09edd30 --- /dev/null +++ "b/en/tutorial/31-\344\272\221\345\216\237\347\224\237\344\270\216\351\203\250\347\275\262\345\271\263\345\217\260\351\200\211\345\236\213.md" @@ -0,0 +1,187 @@ +# 31 · Cloud Native and Deployment Platform Selection + +> The thesis in one line: **cloud native is not "use Kubernetes." It is the ability to make deployment, scaling, rollback, observability, and failure recovery repeatable. Deployment platform selection asks whether your team should pay that complexity cost right now.** + +--- + +> **🧰 Technology Stack Selection Track, Chapter 5 · One thing to practice** +> +> [Chapter 30](30-API与服务通信选型.md) covered how services communicate. This chapter goes one layer down: where services run, how they are released, scaled, rolled back, and operated. Small systems need less to worry about. Large systems need control and autonomy. The wrong platform taxes you every day. + +--- + +## Opening: you are choosing an operating model + +Beginners ask: + +``` + VM or container? + Serverless or Kubernetes? + Self-hosted or cloud? 
+``` + +Architects ask: + +- Who owns releases? +- Who owns scaling? +- Who owns alerts? +- Who can roll back a broken version? +- Who manages secrets, config, certificates, logs, metrics, permissions? +- Can the team understand the platform during an incident? + +A deployment platform is the whole path from code to online service: build, config, secrets, release, health check, traffic switch, autoscaling, logs, metrics, rollback. + +> **Rule of thumb:** the more convenient a platform is, the less control you usually have and the more vendor lock-in you may accept. The more control you want, the more operations capability you need. + +--- + +## 1. Four steps, not a maturity ranking + +``` + PaaS / managed app platform + -> easiest, good for MVP / small teams / standard web apps + + Managed containers + -> still simple, but with containers and service boundaries + + Serverless + -> good for event-driven work, bursty traffic, background tasks + + Kubernetes / K8s + -> strongest control plane, highest platform burden +``` + +Do not read this as "later is better." A three-person team shipping steadily on PaaS is healthier than a team constantly repairing a cluster it did not need. But if you have dozens of services, multiple teams, and complex traffic management, a very simple platform can become the bottleneck. + +> **Architectural wisdom:** choosing a platform is like choosing transport. A bicycle is best for a nearby errand; a truck is best for moving house. The question is not whether a truck is more advanced. The question is whether you are moving house. + +--- + +## 2. When Kubernetes is worth it + +Kubernetes does not merely "run containers." It manages many changing containers: + +- Scheduling. +- Autoscaling. +- Service discovery. +- Rolling releases. +- Self-healing. +- Resource isolation. +- Declarative configuration. 
+ +Good signals: + +| Signal | Meaning | +|---|---| +| Many services | Need unified scheduling, release, resource governance | +| Multiple teams deploy independently | Teams cannot queue behind each other | +| Complex traffic control | Canary, blue-green, region routing are normal | +| Hybrid/private deployment | Need a portable deployment model | +| Platform team exists | Someone turns K8s into an internal developer platform | + +If you have one standard web app, one database, and one queue, K8s is often burden, not benefit. You buy certificates, ingress, network policy, image registry, cluster upgrades, access control, node resources, and observability all at once. + +--- + +## 3. Serverless changes the constraints + +Serverless is valuable because: + +- It scales on demand. +- It fits event triggers: file upload, scheduled tasks, webhook handling. +- The team manages less infrastructure. + +Costs: + +- Cold start. +- Runtime, memory, timeout, package-size limits. +- Harder local debugging and observability. +- Stronger vendor binding. + +Serverless fits short, scattered, event-driven work. It does not fit everything. Splitting a complex long workflow into dozens of functions without workflow orchestration, tracing, and retry discipline creates another kind of unmaintainability. + +--- + +## 4. Deployment strategy is architecture + +Do not only ask "can it run?" Ask what happens when a bad version ships: + +| Capability | Question | +|---|---| +| **Health check** | How does the platform know an instance can receive traffic? | +| **Rollback** | Can you quickly return to the previous version? | +| **Progressive delivery** | Canary, blue-green, gradual traffic shift? | +| **Config and secrets** | Are config, passwords, and certs separated from code and auditable? | +| **Infrastructure as Code** | Are resources versioned, reviewed, and rebuildable? 
| + +GitOps matters because it turns "what production should look like" from manual console clicking into versioned, reviewable, revertible declarations. For a small team this might be one config file. For a larger org, it becomes the platform engineering golden path. + +--- + +## 5. A steady evolution path + +``` + MVP / monolith: + managed app platform + managed database + simple CI/CD + + multiple services: + containers + managed container platform + standard logs/metrics/secrets + + multiple teams: + managed K8s + platform team + GitOps + service catalog + permission governance + + regulated / private / hybrid cloud: + K8s or private platform, with higher ops cost accepted +``` + +This is the same restraint as [Chapter 04](04-十大核心架构模式.md)'s "monolith first" and [Chapter 15](15-组织即架构.md)'s platform engineering: let the business run first, then turn repeated operational complexity into platform capability. + +--- + +## 6. Selection table + +| Signal | Better choice | Why | Watch out | +|---|---|---|---| +| 1-5 people, MVP, standard web app | PaaS / managed app platform | Low ops, short release path | Less control, migration cost | +| Few services, need environment consistency | Managed containers | Container consistency without full K8s burden | Still need images, config, logs, health checks | +| Many services, many teams | Managed Kubernetes | Unified scheduling, scaling, release, isolation | Without platform team, business teams absorb complexity | +| Event-driven or bursty background tasks | Serverless + queue | On-demand scaling, no long-running servers | Cold start, limits, observability, vendor lock-in | +| Regulated/private/hybrid cloud | Private cloud / self-managed K8s | Data boundary, compliance, portability | Highest ops cost | + +--- + +## 🎯 Quick check + + + + + +--- + +## Chapter summary + +- **Cloud native is not a tool list**: it is automation, elasticity, recovery, observability, repeatable release. 
+- **Deployment platform selects an operating model**: convenience vs control, simplicity vs governance. +- **Kubernetes is not the default**: it fits multi-service, multi-team, complex governance. +- **Deployment strategy is architecture**: health checks, rollback, canary, secrets, IaC/GitOps decide incident recovery. +- **Start simple, upgrade on signals**: managed first, then containers, then platform. + +> **Next:** deployment platforms make systems run and roll back. [Chapter 32](32-可观测性与可靠性技术栈选型.md) adds the other half: can you see, alert, and recover? + +--- + +## Related links + +- Method core: [04 · Core patterns](04-十大核心架构模式.md) · [06 · Quality attributes](06-质量属性与取舍.md) · [12 · Designing for failure](12-为失败而设计.md) +- Organization: [15 · Organization as architecture](15-组织即架构.md) · [24 · Review checklist](24-审查清单AI产出默认缺什么.md) +- Cases: [PatchDesk](../cases/patchdesk-saas/README.md) · [CodePilot](../cases/codepilot-agent/README.md) diff --git "a/en/tutorial/32-\345\217\257\350\247\202\346\265\213\346\200\247\344\270\216\345\217\257\351\235\240\346\200\247\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" "b/en/tutorial/32-\345\217\257\350\247\202\346\265\213\346\200\247\344\270\216\345\217\257\351\235\240\346\200\247\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" new file mode 100644 index 0000000..5e428b5 --- /dev/null +++ "b/en/tutorial/32-\345\217\257\350\247\202\346\265\213\346\200\247\344\270\216\345\217\257\351\235\240\346\200\247\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" @@ -0,0 +1,169 @@ +# 32 · Observability and Reliability Stack Selection + +> The thesis in one line: **observability is not a wall of dashboards. It is whether, during a failure, you can answer with evidence: who is affected, where is it slow, why is it slow, and should someone be woken up? 
Reliability stack selection gives the online system a nervous system and an immune system.** + +--- + +> **🧰 Technology Stack Selection Track, Chapter 6 · One thing to practice** +> +> [Chapter 31](31-云原生与部署平台选型.md) covered deployment. This chapter covers how you know whether production is healthy, and how you recover when it is not. Do not start from Prometheus, Grafana, ELK, or OpenTelemetry. Start from SLOs. + +--- + +## Opening: monitoring and observability differ + +Monitoring asks: + +> I know what to watch. Has it crossed a threshold? + +Examples: CPU above 90%, error rate above 5%, queue backlog above 10,000. + +Observability asks: + +> I do not know where the problem will appear. Did the system leave enough evidence to investigate? + +Example: one user's checkout is slow. The request crosses gateway, order, inventory, payment, and a third-party API. Where did it slow down? A tenant? Version? Availability zone? Database index? + +> **Rule:** small systems can survive with monitoring. Distributed systems become blind with monitoring only. The more services, calls, and releases you have, the more you need observability, not just more dashboards. + +--- + +## 1. Work backward from SLOs + +Define three things: + +``` + SLI (Service Level Indicator) + -> what you measure: success rate, P99 latency, error rate, availability + + SLO (Service Level Objective) + -> your internal target: 99.9% requests under 300ms + + Error budget + -> allowed failure budget: if budget remains, ship; if burned, fix reliability +``` + +This turns "is the system stable" from feeling into numbers. Alerts should also work backward from SLO: + +> Wake people when users are being hurt. + +CPU, memory, and thread counts are candidate causes. If they do not affect the user journey, they should not directly become midnight pages. + +--- + +## 2. 
Three evidence types: metrics, logs, traces + +| Signal | Good for | Cost | +|---|---|---| +| **Metrics** | Trends: QPS, error rate, P95/P99 latency, queue depth | Cheap, alert-friendly, low detail | +| **Logs** | Event detail: why an order failed, why auth rejected | Detailed, but costly and noisy | +| **Traces** | Full path of one request across services | Great for distributed debugging, needs sampling and context propagation | + +OpenTelemetry / OTel is valuable because it decouples instrumentation from storage/query backends. You can generate telemetry in a standard way, then swap backend vendors or open-source systems with less migration cost. + +> Tools can change. Instrumentation habits are hard to change. Trace IDs, structured logs, and key business metrics matter more than dashboard colors. + +--- + +## 3. Reliability is not just seeing; it is recovery + +Many teams buy observability tools but reliability does not improve because: + +> Seeing a problem is not the same as handling it. + +Reliability also needs response: + +``` + Alerting -> wake only people who can act + On-call -> clear response owner + Runbook -> first steps for an alert + Incident -> severity, communication, escalation, review + Release gov -> canary, rollback, feature flags, circuit breakers +``` + +So stack selection includes incident process. A serious system needs actionable alerts, service owners, runbooks for key alerts, post-incident reviews, and review actions that feed back into alerts, code, or platform. + +--- + +## 4. Alerts: fewer, sharper, user-centered + +Low-quality alerts: + +- CPU is high. +- Memory is high. +- Disk is at 80%. +- Thread count increased. + +These are clues, not necessarily incidents. 
Better alerts focus on user symptoms: + +| User symptom | Actionable alert | +|---|---| +| Login fails | Login success rate below SLO | +| Checkout is slow | Checkout P99 above target for 10 minutes | +| Messages delayed | Queue backlog causes notification delay beyond promise | +| Search unavailable | Search error rate and empty-result rate abnormal | + +> **Architectural wisdom:** do not alert "machines feel bad." Alert "users are being hurt." Noise trains people to ignore real incidents. + +--- + +## 5. Choose by maturity + +| Stage | Stack tendency | Goal | +|---|---|---| +| **MVP / small team** | Managed logs + error tracking + uptime checks + a few core metrics | Someone knows when it breaks and can find rough cause | +| **Standard online system** | Metrics + structured logs + key traces + SLO alerts + runbooks | Locate, respond, roll back when users are affected | +| **Many services / teams** | OTel + correlated metrics/logs/traces + service catalog + owners | Debug across teams without shouting | +| **Critical high-reliability path** | SLO platform + canary analysis + synthetic monitoring + incident drills | Catch regressions early, limit blast radius, reduce MTTR | + +Watch cost: full log retention, full trace sampling, and high-cardinality labels become expensive fast. Observability is not "collect everything." It is "leave enough evidence for important questions." + +--- + +## 6. 
Selection table + +| Signal | Better stack | Why | Watch out | +|---|---|---|---| +| MVP / internal tool | Managed logs + error tracking + uptime | Fast feedback loop | Do not self-build the full platform | +| Standard web app | Metrics + structured logs + SLO alerts | See errors, latency, and core business impact | Alerts must be few and precise | +| Microservices / many teams | OTel + unified metrics/logs/traces | Debug across services, reduce owner-hunting | Requires instrumentation standards and owner governance | +| Payment / transaction / core path | SLO + error budget + canary + runbook + incident process | Reliability becomes measurable and actionable | Higher cost and on-call discipline required | +| Huge telemetry volume / cost sensitive | Tiered storage + sampling + aggregated metrics + cardinality limits | Keep debugging value while controlling bill | Overly aggressive sampling can drop key evidence | + +--- + +## 🎯 Quick check + + + + + +--- + +## Chapter summary + +- **Observability is not dashboards**: it is enough evidence to investigate unknown problems. +- **Work backward from SLOs**: define what "good" means for users, then derive metrics/logs/traces/alerts. +- **Signals differ**: metrics for trends and alerts, logs for details, traces for cross-service paths. +- **Reliability needs response**: alerting, on-call, runbooks, incidents, reviews, rollback. +- **Invest by maturity**: small teams need a feedback loop; large systems need unified standards and platform support. + +> **Next:** general systems need to be visible and recoverable. AI systems add model behavior, context, retrieval quality, cost, and evals. [Chapter 33](33-AI基础设施技术栈选型.md) moves stack selection into the LLM era. 
+ +--- + +## Related links + +- Method core: [06 · Quality attributes](06-质量属性与取舍.md) · [12 · Designing for failure](12-为失败而设计.md) · [13 · Mechanics of scale](13-规模化的力学.md) +- AI collaboration: [24 · Review checklist](24-审查清单AI产出默认缺什么.md) · [25 · Eval-driven](25-评测驱动把够好写进架构.md) +- Cases: [StarArena](../cases/stararena-ticketing/README.md) · [SyncRoom](../cases/syncroom-collaboration/README.md) · [CodePilot](../cases/codepilot-agent/README.md) diff --git "a/en/tutorial/33-AI\345\237\272\347\241\200\350\256\276\346\226\275\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" "b/en/tutorial/33-AI\345\237\272\347\241\200\350\256\276\346\226\275\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" new file mode 100644 index 0000000..fa8692e --- /dev/null +++ "b/en/tutorial/33-AI\345\237\272\347\241\200\350\256\276\346\226\275\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" @@ -0,0 +1,178 @@ +# 33 · AI Infrastructure Technology Stack Selection + +> The thesis in one line: **AI infrastructure selection is not wiring every hot tool together. First identify your scarce resource: model capability, GPU, context, retrieval quality, cost, or controllability. The goal is not tool names; it is using the fewest components to contain AI system risk.** + +--- + +> **🧰 Technology Stack Selection Track, Chapter 7 · One thing to practice** +> +> [Chapter 17](17-大模型时代的架构判断.md) introduced nondeterminism, context, and agentic hard parts. [Chapter 22](22-AI原生系统设计.md) designed AI-native systems. This chapter asks how to choose the AI stack underneath: when a hosted API is enough, and when model gateways, self-hosted inference, GPUs, or eval platforms become justified. + +--- + +## Opening: what are you actually self-building? 
+ +When people hear AI infrastructure, they often list: + +- GPU (compute commonly used for model training/inference) +- Vector database +- Agent framework +- Model gateway +- Inference engine +- Eval platform + +But the first question is: + +> Are you building infrastructure, or just building an AI application? + +For an early product, the sane MVP default is usually hosted model API + minimal logs and cost tracking. Only when clear trigger signals appear, such as runaway cost, data that cannot leave your boundary, latency that misses its target, provider outages you cannot tolerate, or a real need for model customization, should you sink lower into gateways, self-hosted inference, and GPU pools. + +> **Architectural wisdom:** lower-level AI infrastructure is not automatically more advanced. It gives control, but also hands you cost, capacity planning, failure recovery, security isolation, and operations. + +--- + +## 1. Four layers of an AI stack + +``` + Entry governance: + AI Gateway / auth / rate limit / cost logging / model routing + + Context: + RAG / vector store / document permissions / rerank / citations + + Inference: + inference serving / GPU / KV cache / batching + + Guardrail: + observability / eval / trace / human approval +``` + +You do not need all four from day one. Add the layer that addresses the risk you can see: + +- Cost invisible? Add gateway or usage logs. +- Retrieval quality limits answers? Add RAG evals. +- Many teams call models? Add a model gateway. +- GPU cost beats API cost? Consider self-hosted inference. +- Agent can take actions? Add permissions, human approval, and audit. + +--- + +## 2. 
API or self-hosted inference is a cost/control trade + +| Choice | Strengths | Costs | +|---|---|---| +| **Hosted model API** | Fast, stable, low ops, updated models | Vendor lock-in, data path limits, unit cost at scale | +| **Self-hosted inference** | Control model, data, cost structure, deployment | GPU, memory, batching, scaling, failures, capacity | +| **Hybrid routing** | Cheap model for simple tasks, stronger model for hard tasks | Routing, evals, fallback, cost accounting | + +Do not ask which is more advanced. Ask: + +1. Must data stay inside your boundary? +2. Does the hosted API miss your latency target? +3. Is volume high enough that self-hosting is cheaper? +4. Can the team operate GPU serving? + +One yes is usually not enough. Two or three yes answers start to justify going lower. + +--- + +## 3. RAG, long context, and fine-tuning solve different problems + +| Route | Solves | Fits | +|---|---|---| +| **RAG** | Retrieve external knowledge at answer time | Many documents, frequent updates, citations, permission filtering | +| **Long context** | Put lots of material into one request | Small one-off material, fits in context | +| **Fine-tuning** | Change stable behavior, style, or format | Stable output format, strong samples, domain style | + +Common mistake: poor retrieval gets blamed on the model, or a knowledge-update problem gets sent to fine-tuning. + +> **Rule:** use retrieval for knowledge; use fine-tuning for behavior. If you need citations, updates, and permissions, make RAG work first. + +--- + +## 4. Agent framework: workflow first, autonomy later + +Use the [Chapter 22](22-AI原生系统设计.md) fork: + +``` + Can a deterministic workflow solve it? + |-- yes -> workflow first + \-- no -> Agent, with permissions, budget, human approval, trace, eval +``` + +If the flow is fixed, such as "look up order -> check refund rules -> call refund API -> send notification," do not rush to an autonomous Agent. Workflow is more predictable, testable, and auditable. 
Agent becomes useful when steps are open, tools are many, and the task needs dynamic planning. + +Agent selection should focus on: + +- Can tool permissions be tiered? +- Is human approval supported? +- Are all steps traceable? +- Can budget and max steps be limited? +- Are context compaction and task recovery supported? + +--- + +## 5. Guardrail layer is production baseline + +AI systems differ because output is unstable and quality drifts. After launch, API success rate is not enough. You need to see: + +- Prompt and context. +- Retrieved chunks. +- Model cost. +- Tool authorization. +- Whether the final answer has citations or hallucinations. +- Whether model/prompt/retrieval changes regress quality. + +That is [Chapter 25](25-评测驱动把够好写进架构.md)'s eval discipline. If the system touches money, user data, or automatic actions, eval is not a future nice-to-have. Without eval, every model or prompt change is a blind release. + +--- + +## 6. AI stack selection table + +| Question | Starting choice | Upgrade trigger | Cost reminder | +|---|---|---|---| +| Just validating product? | Hosted model API + basic logs | Many apps share usage, cost invisible, provider outage hurts | Do not self-host GPU first | +| Many models/teams? | AI Gateway | Need auth, rate limit, billing, failover | Gateway sits on critical path | +| Need private knowledge? | RAG + simple vector retrieval | Retrieval quality unstable, permissions complex, corpus grows | RAG ceiling is retrieval quality | +| Small vector scale? | pgvector / single-node vector search | Millions of vectors, complex filters, tight latency | Dedicated vector DB is new ops surface | +| Model cost high? | Model routing + cache + quotas | API cost exceeds self-host total cost, data cannot leave | Self-hosting means GPU and memory ops | +| Need automatic action? | Deterministic workflow | Open-ended steps, dynamic planning required | Agent needs permissions, budget, human approval | +| Need stable iteration? 
| Trace + small eval set | Production, money/data, frequent model changes | No eval, no reliable upgrade | + +--- + +## 🎯 Quick check + + + + + +--- + +## Chapter summary + +- **AI infrastructure starts with the scarce resource**: model, GPU, context, retrieval quality, cost, controllability. +- **Hosted API is the default start** unless data, cost, latency, customization, or availability force an upgrade. +- **Think in four layers**: entry governance, context, inference, guardrails. Add what risk requires. +- **Use RAG for knowledge and fine-tuning for behavior**. +- **Production AI needs guardrails**: traces show the path, evals guard quality. + +> **Next:** language, data, middle layers, API, deployment, observability, and AI infrastructure are now on the table. [Chapter 34](34-技术选型决策树.md) turns them into one decision tree. + +--- + +## Related links + +- AI method: [17 · Architecting in the age of LLMs](17-大模型时代的架构判断.md) · [22 · AI-native design](22-AI原生系统设计.md) · [25 · Eval-driven](25-评测驱动把够好写进架构.md) +- Templates: [AI Gateway](../templates/ai-gateway/README.md) · [Inference Serving](../templates/inference-serving/README.md) · [RAG Knowledge Base](../templates/rag-knowledge-base/README.md) · [Vector Database](../templates/vector-database/README.md) +- Cases: [DocuMind](../cases/documind-rag/README.md) · [CodePilot](../cases/codepilot-agent/README.md) diff --git "a/en/tutorial/34-\346\212\200\346\234\257\351\200\211\345\236\213\345\206\263\347\255\226\346\240\221.md" "b/en/tutorial/34-\346\212\200\346\234\257\351\200\211\345\236\213\345\206\263\347\255\226\346\240\221.md" new file mode 100644 index 0000000..49f9449 --- /dev/null +++ "b/en/tutorial/34-\346\212\200\346\234\257\351\200\211\345\236\213\345\206\263\347\255\226\346\240\221.md" @@ -0,0 +1,194 @@ +# 34 · Technology Selection Decision Tree + +> The thesis in one line: **technology selection is not choosing the strongest tool from a pile. 
It is pruning along requirements, constraints, stage, team capability, and exit cost. A mature selection does not prove a technology is good; it proves its benefit is worth the cost under current constraints.** + +--- + +> **🧰 Technology Stack Selection Track, Chapter 8 · Track wrap-up** +> +> The previous seven chapters covered language, database, cache/queue/events, API, deployment, observability, and AI infrastructure. This chapter adds no new tools. It gives one decision tree you can use whenever someone asks "should we use X?" + +--- + +## Opening: the root is not A versus B + +The first question is not: + +``` + PostgreSQL or MongoDB? + REST or gRPC? + PaaS or K8s? + API or self-hosted inference? +``` + +It is: + +> Do we really need a new technology? + +If the current stack can meet target performance, cost, reliability, and delivery time, default to keeping it. Every new technology brings learning, integration, operations, and migration cost. + +This is the same restraint as earlier chapters: monolith before microservices, workflow before Agent, hosted API before self-hosted GPU. Architects treat selection as paying a clear cost for a clear problem. + +--- + +## 1. First cut: what stage are you in? + +| Stage | Scarce resource | Selection tendency | +|---|---|---| +| **MVP** | Validation speed | Few components, mainstream stack, managed first, low migration cost | +| **Growth** | Controlled scaling | Observability, gradual release, clear boundaries, local scalability | +| **Scale** | Efficiency and cost | Deep optimization, platformization, unit cost, automation | +| **Critical** | Stability and compliance | Audit, isolation, DR, SLO, incident process | + +The right mature-stage technology can be over-engineering in MVP. While validating, choose fewer components. While growing, choose control. Only at scale should you pay for unit-cost and throughput optimization. + +--- + +## 2. Second cut: where will the system die first? 
+ +If the current stack is insufficient, locate the failure mode before comparing tools: + +| Failure mode | Look first at | +|---|---| +| Data wrong or state mismatch | Data model, transaction boundary, idempotency, Outbox, reconciliation | +| Read hotspot crushes DB | Cache, read model, CDN, rate limit | +| Write spike crushes backend | Queue, backpressure, smoothing, async state | +| P99 amplified by fan-out | API boundary, timeout budget, degradation, trace | +| Releases cause incidents | Deployment platform, canary, rollback, config governance | +| Incidents are hard to locate | Metrics, logs, traces, SLO alerts | +| AI quality drifts | Eval, trace, RAG evaluation, model routing | +| Team collaboration blocks | Module boundaries, platform engineering, service ownership | + +> **Rule:** tools are the outer shell of the answer. Failure mode is the question. + +--- + +## 3. Third cut: can the team operate it? + +Benchmarks can look great while your team cannot run the system. Operating means: + +- Can you deploy it? +- Can you debug it? +- Is it monitored? +- Who fixes it when production breaks? +- Will upgrades break you? +- Are enough people able to understand it? + +A system with lower performance but a team that can operate it often beats a faster system nobody can repair. Technology selection is not a lab contest. It is a long-term operating contract. + +--- + +## 4. Fourth cut: can you exit? + +Mature selections have exit paths: + +| Technology | Exit question | +|---|---| +| New database | How do you migrate data? How do you verify dual writes? Where do you roll back? | +| Model provider | Can the API be adapted? Can prompts and evals be reused? | +| Framework | Is business logic swallowed by the framework? Can it be layered away? | +| Message system | How do you migrate topics, schemas, and offsets? | +| Cloud platform | Can images, config, secrets, storage, network be moved? | + +Having no exit path means locking in your future. 
Before important technology enters production, you need a spike, rollout plan, rollback plan, and ADR. + +--- + +## 5. The unified decision tree + +``` +Need a new technology? + | + |-- Existing stack meets target? -- yes --> keep it + local optimization + | + \-- no + | + |-- MVP? -- yes --> fewest components, fastest validation, low migration cost + | + \-- no + | + |-- What is the failure mode? + | |-- data/consistency -> storage and transaction boundaries + | |-- latency/throughput -> cache, batching, scaling + | |-- availability/failure -> redundancy, degradation, isolation + | |-- AI quality -> eval, RAG, model routing + | \-- team collaboration -> module boundaries, platform capability + | + \-- Can the team operate it and exit? + |-- no -> choose a lighter option + \-- yes -> spike -> ADR -> gradual rollout +``` + +--- + +## 6. Technology selection ADR template + +```md +### ADR-034: Introduce OpenTelemetry for distributed tracing + +- Background: order requests cross 7 services. P99 sometimes exceeds 2s. Each service has local logs only, and one investigation takes about 3 hours. +- Goal: connect request path and per-hop latency, reducing MTTR to under 30 minutes. +- Candidates: + - Add more logs: cheap, but cannot reliably reconstruct paths. + - Build private tracing: flexible, but migration risk is high. + - Use OpenTelemetry: standardized instrumentation, replaceable backend. +- Decision: use OpenTelemetry traces, starting with order, inventory, and payment paths. +- Trade-off: short-term instrumentation and sampling governance cost. +- Benefit: slow requests become traceable across services, backend remains replaceable. +- Review trigger: telemetry cost exceeds budget, or critical path coverage stays below 90%. +- Exit plan: keep standard trace context; observability backend can change; business code does not bind to one vendor SDK. +``` + +The format matters less than making the reason and exit explicit. + +--- + +## 7. 
One table for the whole track + +| Chapter | Do not ask first | Ask first | +|---|---|---| +| [27](27-编程语言与后端框架选型.md) Language/framework | Which language is more advanced | Do team, ecosystem, runtime, and business complexity fit? | +| [28](28-数据库与存储选型.md) Database/storage | Which database is strongest | Who is source of truth, what is the query shape? | +| [29](29-缓存消息队列与事件系统选型.md) Cache/queue/events | Should we use Kafka | Is this read hotspot, time mismatch, or fact broadcast? | +| [30](30-API与服务通信选型.md) API/communication | REST or gRPC | Sync/async, internal/external, contract strength? | +| [31](31-云原生与部署平台选型.md) Deployment platform | Should we use K8s | Does the team need and support platform capability? | +| [32](32-可观测性与可靠性技术栈选型.md) Observability/reliability | Which monitoring tool | What is user SLO and how does incident response work? | +| [33](33-AI基础设施技术栈选型.md) AI infrastructure | Should we self-host GPU | Is the scarce resource model, context, cost, quality, or control? | + +--- + +## 🎯 Quick check + + + + + +--- + +## Chapter summary + +- **The root is "do we need new technology?"** If the current stack meets targets, keep it. +- **Stage changes the answer**: MVP buys speed, growth buys control, scale buys efficiency, critical systems buy stability and compliance. +- **Locate failure mode before comparing tools**: data, latency, cost, quality, collaboration are different problems. +- **The team must be able to operate it**: operability beats paper performance in production. +- **Good selection can exit**: spike, ADR, gradual rollout, migration path. + +> **Technology stack track wrap-up:** these 8 chapters are not about memorizing more tool names. They train one sentence: **read constraints first, select technology second; acknowledge the cost before enjoying the benefit.** When you read `templates/` and `cases/`, ask the reverse question: why this stack, and would the answer change if constraints changed? 
+ +--- + +## Related links + +- Method core: [02 · Thinking framework](02-架构师的思考框架.md) · [06 · Quality attributes](06-质量属性与取舍.md) · [08 · ADRs](08-架构决策记录与演进.md) · [09 · Taste](09-架构品味.md) +- Practice entry: [templates overview](/templates/README) · [cases overview](/en/cases/README) +- Track review: [27](27-编程语言与后端框架选型.md) · [28](28-数据库与存储选型.md) · [29](29-缓存消息队列与事件系统选型.md) · [30](30-API与服务通信选型.md) · [31](31-云原生与部署平台选型.md) · [32](32-可观测性与可靠性技术栈选型.md) · [33](33-AI基础设施技术栈选型.md) diff --git a/en/tutorial/README.md b/en/tutorial/README.md index ec068da..38af797 100644 --- a/en/tutorial/README.md +++ b/en/tutorial/README.md @@ -18,7 +18,7 @@ This tutorial is the missing lesson on judgment. ## Learning path -The tutorial has six parts. **Reading in order is recommended**, but each chapter also stands on its own. +The tutorial is organized as a sequence of tracks. **Reading in order is recommended**, but each chapter also stands on its own. ### Part 1 · Build the mindset (01–03) — a new way of seeing systems @@ -95,6 +95,23 @@ Foundations + advanced teach *how to judge*; **the practice track teaches how to --- +## 🧰 Technology stack selection track (27–34) — turn "what tech to use" into architecture judgment + +**Prerequisite: foundations + advanced track.** This is not a framework tutorial or a tool ranking. It applies [Chapter 02](./02-架构师的思考框架)'s "requirements → constraints → quality attributes → trade-offs" to languages, databases, cache, APIs, deployment, observability, and AI infrastructure. + +| Ch. | In one line | +|---|---| +| [27 · Programming languages and backend frameworks](./27-编程语言与后端框架选型) | Language and framework choice combines runtime, ecosystem, team capability, and maintenance cost. | +| [28 · Databases and storage](./28-数据库与存储选型) | Draw the data lifecycle before choosing truth sources, read models, search, object storage, and vector stores. 
| +| [29 · Cache, message queues, and events](./29-缓存消息队列与事件系统选型) | Separate read hotspots, time mismatch, and fact broadcast before naming Redis or Kafka. | +| [30 · APIs and service communication](./30-API与服务通信选型) | Decide sync/async, internal/external, and contract strength before REST, gRPC, or GraphQL. | +| [31 · Cloud native and deployment platforms](./31-云原生与部署平台选型) | Cloud native is not "use K8s"; it is choosing a deployment and rollback model the team can operate. | +| [32 · Observability and reliability stack](./32-可观测性与可靠性技术栈选型) | Work backward from user SLOs to metrics, logs, traces, alerts, on-call, and incident flow. | +| [33 · AI infrastructure stack](./33-AI基础设施技术栈选型) | First identify the scarce resource: model, GPU, context, retrieval quality, cost, or control. | +| [34 · Technology selection decision tree](./34-技术选型决策树) | Connect new-tech necessity, stage, failure mode, team capability, and exit plan into one tree. | + +--- + ## After finishing, you should be able to - [ ] Take a vague requirement and ask the right questions to break it into clear constraints and quality goals. @@ -104,6 +121,7 @@ Foundations + advanced teach *how to judge*; **the practice track teaches how to - [ ] Make architecture decisions based on constraints and trade-offs — and write down the reasons. - [ ] *(Practice track)* Walk a full path on an unfamiliar system or template: read → design → evolve → migrate. - [ ] *(AI-collab track)* Write architecture constraints for AI, review its output, and pick the right collaboration mode for prototype vs production. +- [ ] *(Technology stack track)* Turn "what tech should we use" into a reviewable ADR instead of a popularity or preference call. 
--- diff --git a/index.md b/index.md index f9af3ef..b1cc11b 100644 --- a/index.md +++ b/index.md @@ -4,7 +4,7 @@ layout: home hero: name: "Awesome Architecture" text: "像架构师一样思考" - tagline: 写代码这件事正在消失,而判断力越来越值钱。26 章架构思维教程 + 25 张真实系统架构地图 + 6 个端到端案例推演——只讲架构,不讲语法。 + tagline: 写代码这件事正在消失,而判断力越来越值钱。34 章架构思维教程 + 25 张真实系统架构地图 + 6 个端到端案例推演——只讲架构,不讲语法。 actions: - theme: brand text: 开始学习 → @@ -22,7 +22,7 @@ hero: features: - icon: 🧠 title: 教判断,不教语法 - details: 不绑定任何语言或框架,讲一套可迁移、不会过时的思考方法:需求 → 约束 → 质量属性 → 取舍。 + details: 不绑定任何语言或框架,讲一套可迁移、不会过时的思考方法:需求 → 约束 → 质量属性 → 取舍,并把它落到技术栈选型。 - icon: 🗺️ title: 25 张真实系统架构地图 details: 从电商、社交信息流、IM、视频流,到 AI 网关、RAG、模型推理服务,每张都讲清「为什么这么设计、会死在哪」。 @@ -47,6 +47,10 @@ features: 案例篇不是更多模板,而是完整项目推演:从起始架构、量化信号、触发升级,一路讲到 ADR、数据流、故障兜底和随堂检验。👉 [进入案例篇](/cases/README) +## 🧰 学会做技术栈选型 + +新增技术栈选型篇(27–34)不讲框架教程,而是讲语言、数据库、缓存、API、部署、观测、AI 基础设施这些选择背后的架构判断。👉 [从第 27 章开始](/tutorial/27-编程语言与后端框架选型) + ## ⚖️ 架构,就是在岔路口做选择 每个模板里最值钱的一节,是「关键决策与权衡」。先尝尝这张可交互的决策卡: diff --git "a/tutorial/10-\345\210\206\345\270\203\345\274\217\347\263\273\347\273\237\347\232\204\347\241\254\351\201\223\347\220\206.md" "b/tutorial/10-\345\210\206\345\270\203\345\274\217\347\263\273\347\273\237\347\232\204\347\241\254\351\201\223\347\220\206.md" index 6ac0d91..b4eb7fa 100644 --- "a/tutorial/10-\345\210\206\345\270\203\345\274\217\347\263\273\347\273\237\347\232\204\347\241\254\351\201\223\347\220\206.md" +++ "b/tutorial/10-\345\210\206\345\270\203\345\274\217\347\263\273\347\273\237\347\232\204\347\241\254\351\201\223\347\220\206.md" @@ -6,6 +6,8 @@ > **🧭 进阶篇从这里开始。** 入门篇(01–09)教你**看懂一个系统、并能从 0 设计一个中小系统**;从本章起的进阶篇,处理的是另一类问题——**系统一旦做大、做关键,才会露出獠牙的那些「硬骨头」**:分布式、失败、规模、演进、组织。 > +> **先别紧张,这一章是入门篇的「老朋友变深」,不是另起炉灶。** 还记得 [05 · 数据与状态](05-数据与状态.md) 里那个「点赞数可以最终一致、ATM 余额必须强一致」的例子吗?还记得那句「大白话版 CAP:网络一断,一致性和可用性只能二选一」吗?这一章干的事,就是**把那几句你已经会背的结论,翻过来看它们背后到底踩了什么坑**——为什么会「最终」一致、为什么强一致那么贵、网络到底是怎么「断」的。**你已经站在门口了,我们只是往里再走一层。** +> > 这恰恰是 AI 时代最值钱的能力。AI 几秒就能吐出一份能跑的 Raft 
实现,但「这里到底**要不要**共识、能**容忍多大**不一致、网络裂开那一刻你要**正确还是要在线**」——这些**判断题**的代价由你的业务承担,AI 给不了你标准答案,因为答案取决于你愿意拿什么换什么。**实现越来越廉价,判断越来越值钱。** 这条主线会贯穿整个进阶篇。 --- @@ -71,15 +73,21 @@ - **分区时(PAC)**:网裂成两半,你要么拒绝服务保正确(选 C),要么继续服务但可能给旧数据(选 A)。这是 CAP 讲的部分。 - **没分区时(ELC)**:就算网络一切正常,**强一致依然要钱**——为了让所有副本同意一个值,得等节点间往返协调,这就是**延迟(L)**。你想更快(低 L),就得放松一致性(C)。 +**一句话翻译这两行公式**:CAP 让你以为「一致性只在网络出故障(分区)那种极端时刻才需要权衡」;PACELC 拍醒你——**网络好端端的时候(占 99% 的日子),你每写一次数据、每决定一次「要不要等所有副本都确认」,就已经在偷偷拿延迟换一致性了。** 这不是出事才付的「意外险」,是你天天在付的「水电费」。 + > **判断要点**:CAP 容易让人以为「一致性只在网络故障时才需要权衡」。**错。PACELC 提醒你:即使岁月静好,你每一次"要不要等所有副本确认"的选择,都是在拿延迟换一致性。** 这才是你天天在付的账。 --- ## 四、没有「现在」:逻辑时钟买到的是「因果正确」 -既然每台机器的物理时钟都有偏差、网络延迟又不确定,那「事件 A 比 B 先发生」这句话在分布式里**怎么才算数**?靠看墙上的表是不行的——两台机器的表对不齐,差几十毫秒,而很多事就发生在这几十毫秒里。 +> 💧 **深水区(初读可跳过,主线不依赖它;想搞懂的往下看)**。这一节是分布式里最「学术」的一块。**你只需带走一句话:在分布式里没有统一的「现在」,所以要判断两件事谁先谁后,靠的不是看表,而是看「谁导致了谁」(因果)。** 至于下面那三种时钟的名字,绝大多数业务一辈子用不上——读不下去就跳到第五节,毫无损失。下面是给好奇的人讲透的版本。 + +既然每台机器的物理时钟都有偏差、网络延迟又不确定,那「事件 A 比 B 先发生」这句话在分布式里**怎么才算数**? -办法是**放弃物理时间,改用因果关系**。这就是**逻辑时钟**: +先体会一下这个坑有多真实:你和朋友各看各的手机抢同一张票,两部手机的表可能差了几十毫秒;而「谁先点的」这件事,偏偏就发生在这几十毫秒里。**靠对比两块表的读数来判断谁先谁后,是不可靠的**——表本身就对不齐。 + +办法是**放弃物理时间,改用因果关系**:不问「几点几分」,只问「这件事是不是因为那件事才发生的」。这就是**逻辑时钟**: ``` 节点甲: 事件a1 ──发消息m──▶ @@ -91,9 +99,14 @@ 至于 a1 和 b1 谁先?它们没有因果关系 —— 算"并发",不强行排序。 ``` -- **Lamport 时钟**:给所有事件定一个**全序**(谁都能排出先后),但分不清「真有因果」还是「恰好并发」。 -- **向量时钟(Vector Clock)**:更进一步,能**识别出哪些事件是并发的**(从而发现「写冲突」)。 -- **混合逻辑时钟(HLC)**:把物理时间和逻辑计数缝在一起,既接近真实时间、又保证因果——现代分布式数据库做一致性快照常用它。 +关键招数其实很朴素:**让每条消息都「带上」发送方此刻的计数,收到的人把自己的计数往后调到比它更大。** 这样「发」一定排在「收」前面,因果顺序就被钉死了。在这个朴素思想上,有三种做法,越往后越精细: + +- **Lamport 时钟**:给所有事件排出一条**总队**(谁都能说出先后)。 + - 打个比方:像微信群里按消息到达服务器的先后强行编号——**所有消息都有了唯一的序号**。缺点是:它分不清「B 是真的在回复 A」还是「A、B 只是恰好同时各发各的」。它只保证「不会先看到回复、后看到原帖」,但会把「其实互不相干的两件事」也硬排出个先后。 +- **向量时钟(Vector Clock)**:更进一步,能**看出哪些事件其实是「同时发生、互不相干」的**——也就能发现「写冲突」。 + - 打个比方:两个人同时编辑同一份文档的同一行,向量时钟能识别出「这俩改动谁也不知道对方」,于是系统知道**这里撞车了,得让人来决定留谁**,而不是稀里糊涂用「表上谁的时间晚」覆盖掉另一个。 +- **混合逻辑时钟(HLC)**:把「物理表的时间」和「逻辑计数」缝在一起,既贴近真实时间、又保证因果顺序。现代分布式数据库做「某一刻的一致性快照」常用它。 + - 
一句话:前两种纯逻辑、读数和真实时间脱节(不好排查问题);HLC 让序号「看起来还像个时间」,兼顾两边。 > **判断要点**:大多数业务系统**用不上**逻辑时钟——别为了显得高级而引入。但一旦你的系统要保证「**因果正确**」,它就绕不开:[实时协同文档](../templates/collaborative-doc/README.md) 里多人编辑的合并顺序、[实时通讯](../templates/realtime-chat/README.md) 里消息的时序、分布式数据库的一致性快照……背后都是「在没有全局钟的世界里,如何确定先后」这同一个问题。 @@ -105,6 +118,8 @@ 共识买到的东西很硬核:**即使部分节点宕机或失联,存活的多数派仍能对「一个值 / 一条日志的顺序」达成唯一、一致的决定。** 它是分布式世界里「单一权威真相」的来源。 +打个生活里的比方:**共识就像一群人开会表决,任何决议都得「过半数举手」才算通过。** 这么干的好处是稳——哪怕几个人临时离场(节点宕机),只要还凑得齐过半数,会议照样能拍板,而且不会出现「两拨人各自通过了相反决议」的分裂。坏处也一样直白:**每个决定都得等过半数的人来回确认一遍,人越多、坐得越远(跨机房),这一圈下来就越慢。** 所以共识开的是「重要的董事会」,不是「随便拉个群聊」。 + 但它**非常贵**: ``` diff --git "a/tutorial/11-\346\225\260\346\215\256\344\270\200\350\207\264\346\200\247\345\267\245\347\250\213.md" "b/tutorial/11-\346\225\260\346\215\256\344\270\200\350\207\264\346\200\247\345\267\245\347\250\213.md" index dacfaae..1dd9eef 100644 --- "a/tutorial/11-\346\225\260\346\215\256\344\270\200\350\207\264\346\200\247\345\267\245\347\250\213.md" +++ "b/tutorial/11-\346\225\260\346\215\256\344\270\200\350\207\264\346\200\247\345\267\245\347\250\213.md" @@ -6,6 +6,8 @@ > **🧭 这是进阶篇第 2 章。** [10 · 分布式系统的硬道理](10-分布式系统的硬道理.md) 摆出了「病理」——部分失败、没有全局时钟、共识很贵、exactly-once 是幻觉。本章是「临床」:**知道了会乱,那到底怎么治?** 上一章结尾埋的伏笔——「at-least-once 投递 + 消费端幂等 = 效果上的恰好一次」——正是本章所有手法的地基。 > +> **用一个你天天遇到的场景把本章串起来:网购下单。** 它背后是「扣库存、建订单、发优惠券」三件事。单机时代一个数据库事务就能保证它们「要么全成、要么全败」;可一旦这三件事分到了三个服务、三个库,那条让你睡得着觉的事务边界就**被网络剪断了**——扣了库存、建了订单,优惠券却没发出去,数据就「半拉子」了。本章这一整套手艺(Saga、Outbox、幂等……),解决的都是同一个问题:**没有了那条万能的事务边界,怎么把数据重新弄对。** +> > 这也是 AI 时代最考验人的一层。AI 几秒就能给你写出「乐观路径」的下单代码:扣库存、建订单、发消息,一路 `await` 到底。但它几乎从不主动给你补上「**第三步失败了,前两步怎么办**」。而那,恰恰是这一章的全部主题。 --- @@ -111,7 +113,8 @@ Saga 怎么把这一串步骤串起来,有两种风格,这是本节最重要的 | **适合** | 步骤多、补偿复杂、要强监控的**关键长流程**(订单、支付、入职) | 步骤少、参与方少、追求松耦合的轻量流程 | | **风险** | 编排器膨胀成「上帝服务」 | 步骤一多就变成「事件意大利面」,没人讲得清全流程 | -> **判断要点**:**步骤少、链路短,用编舞;步骤多、补偿复杂、要能一眼看清「现在走到哪、为什么卡住」,用编排。** 一个朴素的经验法则:当你发现「要靠翻好几个服务的日志才能拼出一笔订单到底发生了什么」时,就该上编排器了。这也是为什么 Uber、DoorDash 这类公司会专门做**持久化工作流引擎**(下面案例会讲)——本质就是把 Saga 
编排器做成了平台级基础设施。这里的编排与我们在agent系统里的router agent相似,编舞与handoff模式相似。 +> **判断要点**:**步骤少、链路短,用编舞;步骤多、补偿复杂、要能一眼看清「现在走到哪、为什么卡住」,用编排。** 一个朴素的经验法则:当你发现「要靠翻好几个服务的日志才能拼出一笔订单到底发生了什么」时,就该上编排器了。这也是为什么 Uber、DoorDash 这类公司会专门做**持久化工作流引擎**(下面案例会讲)——本质就是把 Saga 编排器做成了平台级基础设施。 + --- ## 三、双写难题:既改了数据库,又要发条消息,怎么不丢不重? @@ -198,6 +201,8 @@ Saga 也好、事件驱动也罢,背后都压着一个极其普遍、又极其 ## 五、事件溯源:存「发生过什么」,而不是「现在是什么」 +> 💧 **深水区(本节 + 下一节 CQRS 都可初读跳过,主线不依赖它们)**。事件溯源和 CQRS 是两件「重武器」——**绝大多数日常的增删改查系统都用不上,硬上反而是给自己添堵**。所以这两节你读完只要记住一句:**「它们是什么、以及什么时候才轮到我用」**,会判断就够了,不必现在就掌握细节。下面给好奇的人讲透。 + 到这里,我们一直在「**存当前状态**」的世界里打补丁。现在来一次世界观的翻转——**事件溯源(Event Sourcing)**:不存「账户现在余额是 100」,而是存下**导致这个余额的一连串事件**:「开户(+0)→ 存入 80 → 存入 50 → 取出 30」。当前状态(余额 100)不再是被直接保存的东西,而是**把所有事件依次「重放」算出来的结果**。 ``` @@ -228,7 +233,7 @@ Saga 也好、事件驱动也罢,背后都压着一个极其普遍、又极其 - **查询变难**:库里是一堆事件,你想查「余额大于 1000 的账户」?没法直接 `WHERE`——得先把事件重放成状态。这正是下一节 **CQRS** 要解决的问题(两者常常结对出现)。 - **事件 schema 演进是地狱**:事件一旦写下就**永不删改**(那是真相)。可三年后你的事件结构变了,**老事件还得能被新代码正确重放**——这种「跨越数年的向后兼容」是事件溯源最难的工程挑战(第七节专门讲演进)。 -- **重放成本**:事件攒到几百万条,每次从头重放算当前状态会很慢——于是要定期存**快照(snapshot)**,从最近的快照往后放,而不是从盘古开天辟地放起。 +- **重放成本**:事件攒到几百万条,每次从头重放算当前状态会很慢——于是要定期存**快照(snapshot)**,从最近的快照往后放,而不是从盘古开天辟地放起。(就像打游戏的存档点:不必每次都从第一关重打,从最近的存档接着玩就行。) > **判断要点**:**事件溯源是把双刃剑,绝不是「更先进所以更好」。** 它在「**审计/可追溯压倒一切、且业务天然就是一串事件**」的领域(账务、交易、订单状态机、[协同文档](../templates/collaborative-doc/README.md) 的编辑历史)闪闪发光;但若硬塞进一个普通的增删改查后台,你买到的全是「查询难、演进难、心智负担重」的代价,却用不上它的好处。**先问:我真的需要「过去每一刻的完整历史」吗?** 不需要,就老老实实存状态。 @@ -240,6 +245,8 @@ Saga 也好、事件驱动也罢,背后都压着一个极其普遍、又极其 核心思想一句话:**写用一套模型,读用另一套模型,中间靠「事件 / 同步」把读模型喂新。** +打个比方:像餐厅把**后厨**和**菜单展示**分开。后厨(写侧)只管「把菜做对、库存记准」,按这个目标来组织;而摆在客人面前的菜单、点评墙、热销榜(读侧)只管「让人一眼看明白、点得快」,各做各的、各自优化。两边靠服务员(事件同步)来回传话保持大致同步——代价是菜单上的「售罄」标签可能比后厨慢半拍更新(最终一致)。 + ``` 传统:读和写共用同一个模型、同一张表 —— 既要好写(规范化、强一致),又要好读(各种查询) 结果常常是「两头都将就」,一个复杂查询能拖垮整个写库。 @@ -284,7 +291,7 @@ CQRS 的甜头和苦头: - **后向兼容(backward)**:新代码能读懂老数据/老消息。 - **前向兼容(forward)**:老代码能读懂(至少能不崩地忽略)新数据/新消息。 -而把数据库从旧结构安全迁到新结构,业界久经考验的套路叫 **expand-contract(先扩展,后收缩)**,又叫「平行变更」: +而把数据库从旧结构安全迁到新结构,业界久经考验的套路叫 
**expand-contract(先扩展,后收缩)**,又叫「平行变更」。**它的直觉就像在旧桥旁边先架一座新桥:先让两座桥并行通车(新旧字段共存),等车流都平稳走上新桥了,再拆掉旧桥——全程没有一刻是「断路」的。** ``` ❌ 危险做法:直接 RENAME / DROP 列 —— 部署的一瞬间,还没升级的旧实例集体崩溃 diff --git "a/tutorial/12-\344\270\272\345\244\261\350\264\245\350\200\214\350\256\276\350\256\241.md" "b/tutorial/12-\344\270\272\345\244\261\350\264\245\350\200\214\350\256\276\350\256\241.md" index baacc6b..661271b 100644 --- "a/tutorial/12-\344\270\272\345\244\261\350\264\245\350\200\214\350\256\276\350\256\241.md" +++ "b/tutorial/12-\344\270\272\345\244\261\350\264\245\350\200\214\350\256\276\350\256\241.md" @@ -4,7 +4,9 @@ --- -> **🧭 进阶篇第 3 章。** [06 · 质量属性与取舍](06-质量属性与取舍.md) 给了你「可用性 = 几个 9 = 每年允许停机多久」的标尺;[10 · 分布式系统的硬道理](10-分布式系统的硬道理.md) 摆出了病理——部分失败、灰色失败、自动化在分区时「正确地」闯祸。这一章把它们拧成一根绳:**既然失败必然发生、又分不清死与慢,那一个系统该长成什么样,才能在零件不断坏掉的同时整体不倒?** 这就是「为失败而设计(Design for Failure)」。 +> **🧭 进阶篇第 3 章。** [06 · 质量属性与取舍](06-质量属性与取舍.md) 给了你「可用性 = 几个 9 = 每年允许停机多久」的标尺;[10 · 分布式系统的硬道理](10-分布式系统的硬道理.md) 摆出了病理——部分失败、灰色失败(「分不清它是死了还是只是慢」)、自动化在分区时「正确地」闯祸。这一章把它们拧成一根绳:**既然失败必然发生、又分不清死与慢,那一个系统该长成什么样,才能在零件不断坏掉的同时整体不倒?** 这就是「为失败而设计(Design for Failure)」。 +> +> **这一章的好消息是:它几乎不靠新名词,全靠生活常识。** 保险丝、船舱、留余量、丢车保帅——你早就懂这些道理,本章只是把它们安到系统上。读起来会比上一章轻松很多。 > > 还是那条主线:AI 几秒就能给你写出一个能跑的 happy path。但「这个依赖挂了要不要降级、重试几次才不算雪上加霜、丢哪部分流量保哪部分」——**这些判断的代价由你的线上事故承担,AI 给不了。** @@ -92,18 +94,25 @@ AWS 的 CTO Werner Vogels 把这句话钉进了一代工程师的脑子里: **Cell-based 架构:把整个系统复制成多个独立「细胞」。** 更高一层的隔离——不是隔离一个池,而是把整套服务栈(网关、服务、数据)复制成多个互相隔离的 **cell**,每个 cell 服务一部分用户。一个 cell 整个烧了,只影响落在它里面的那批用户,其余 cell 毫无感知。AWS 大量内部服务用这种「cell-based」架构来**给爆炸半径设上限**。 -**Shuffle sharding:用随机组合,让「连坐」概率趋近于零。** 这是 AWS 的一个精妙发明。假设你有 8 个后端节点、要服务很多客户: +**Shuffle sharding:用随机组合,让「连坐」概率趋近于零。** 这是 AWS 的一个精妙发明。 + +> 💧 **深水区(初读可跳过,记住下面这个比方就够了)**:**它就像发扑克牌。** 给每个客户随机发「一手牌」(几个节点的组合),而不是让一整桌人共用同一手牌。这样当某个「毒客户」把自己那几张牌(节点)玩坏时,**几乎不会有别人和他拿到的是完全相同的一手牌**——别人顶多和他重了一两张,手里还有别的牌能正常用。于是「一个坏客户拖垮一整组人」的连坐,被摊薄到几乎不可能发生。下面是把这个比方算成数字的版本。 + +假设你有 8 个后端节点、要服务很多客户: ``` 普通分片:把客户切成 4 组,每组固定 2 个节点 → 某 2 个节点被一个「毒客户」打挂,固定绑这 2 个节点的那一整组客户全遭殃 - Shuffle 
sharding:给每个客户随机分配 2 个节点的组合(C(8,2)=28 种组合) - → 两个客户「恰好分到完全相同的两个节点」的概率极低 + Shuffle sharding:给每个客户随机分配 2 个节点的组合 + → 从 8 个节点里随机挑 2 个,一共有 28 种不同的组合(这就是 C(8,2)=28) + → 两个客户「恰好抽到完全相同的那 2 个节点」的概率,只有 1/28,很低 → 一个毒客户打挂它那 2 个节点,几乎不会和别人「完全重叠」 别人哪怕共享了其中 1 个节点,还有另 1 个能用 → 受影响面被摊薄到接近 0 ``` +(节点越多、每人分到的越多,组合数会爆炸式增长——上千节点时组合数是天文数字,两人「完全撞车」的概率低到可以忽略。这就是 shuffle sharding 威力的来源。) + > **架构智慧**:隔离的核心判断,是先想清楚 **「故障域(failure domain)」的边界画在哪**——哪些东西必须一起死、哪些绝不能互相拖累。**最该被舱壁隔开的,永远是「核心」和「非核心」**:别让「猜你喜欢」挂掉时,把「下单支付」一起带走。隔离不是免费的(更多池 = 更多闲置资源、更复杂的容量规划),所以**它本身也是一道取舍**——把隔离的颗粒度,花在「绝不能被拖垮」的关键路径上。 --- diff --git "a/tutorial/13-\350\247\204\346\250\241\345\214\226\347\232\204\345\212\233\345\255\246.md" "b/tutorial/13-\350\247\204\346\250\241\345\214\226\347\232\204\345\212\233\345\255\246.md" index edc4add..24acac7 100644 --- "a/tutorial/13-\350\247\204\346\250\241\345\214\226\347\232\204\345\212\233\345\255\246.md" +++ "b/tutorial/13-\350\247\204\346\250\241\345\214\226\347\232\204\345\212\233\345\255\246.md" @@ -63,6 +63,8 @@ **一致性哈希(Consistent Hashing)** 就是来解这个的。它的核心思想美得像魔术:**把哈希空间想象成一个环(0 到 2³²-1 首尾相接),节点和数据都哈希到环上;一个 key 顺时针找到的第一个节点,就归它。** +用大白话理解这个「环」:**想象一个圆形钟面。** 几台机器先各自占住钟面上的几个钟点位置;每个数据也落在某个钟点上,然后**顺时针走,撞到的第一台机器就负责它**。妙处在于:这时你新加一台机器,等于往钟面上插一个新钟点——**只有「新钟点」到「它前一个钟点」之间那一小段的数据需要换东家,钟面上其它位置的数据全都纹丝不动。** 这就是为什么加一台机器不再「全集群地震」。 + ``` 0/2³² │ @@ -198,7 +200,9 @@ ── 平均数把"少数极慢"稀释没了,可那 1% 恰恰是你最该在意的体验。 ``` -**真正的杀手是扇出放大(fan-out amplification)。** 现代系统里,一个用户请求往往要扇出成几十上百个内部子调用(查 100 个分片、调 100 个微服务),**只有等最慢的那个子调用回来,整个请求才算完成**: +**真正的杀手是扇出放大(fan-out amplification)。** 现代系统里,一个用户请求往往要扇出成几十上百个内部子调用(查 100 个分片、调 100 个微服务),**只有等最慢的那个子调用回来,整个请求才算完成**。 + +打个比方:**这就像一桌 100 个人点的菜,要等最后一道上齐了才能开饭。** 单看每道菜,只有 1% 的概率会做得慢;可一桌 100 道菜里,「至少有一道慢」几乎是必然的——于是整桌人(整个请求)几乎每次都得干等。**人越多(扇出越大),开饭越难不迟到。** ``` 一个请求扇出到 100 个子调用,每个子调用 p99 = 1%(即 1% 概率慢): @@ -224,9 +228,11 @@ ## 七、排队的直觉:为什么逼近 100% 利用率时,系统会突然爆炸 +> 💧 **深水区(公式可全部跳过,只记一个生活直觉)**:这一节有几个数学名字(Little 定律、USL、Amdahl),但**它们想说的就一件你天天体验的事——高速公路。** 
路上车不多时,你想多快开多快;可一旦车流逼近「路面塞满」的程度,只要再多几辆车,就会从「顺畅」瞬间变成「堵死」,而且越堵越死。**服务器和高速公路一模一样:利用率(路有多满)一旦逼近 100%,延迟(你被堵多久)就会爆炸式飙升。** 这就是为什么「看起来闲着 30% 没跑满」的服务器不是浪费——那 30% 是给突发车流留的缓冲带。下面把这个直觉讲细。 + 最后一个、也是最深的力学——**排队论**。它解释了一个让无数人栽跟头的现象:系统在 70% 利用率时一切正常,加一点流量到 95%,延迟突然飙升十倍。 -先记住 **Little 定律**(简洁到不像真的,却永远成立): +先记住 **Little 定律**(简洁到不像真的,却永远成立)——别被公式吓到,它无非是说「**排队的人数 = 来人的速度 × 每人平均待多久**」,你在奶茶店排队时早就懂这个道理了: ``` L = λ × W @@ -258,7 +264,7 @@ > **架构智慧**:**「留余量」是设计,不是浪费。** 那个看起来「闲着 30% 没跑满」的服务器,买的是**应对突发的能力**和**可控的尾延迟**——把利用率压到 100% 省下来的机器钱,会在第一波流量尖峰里以「系统雪崩」的形式十倍奉还。这也是为什么 [在线票务](../templates/online-ticketing/README.md) 这种「开售即洪峰」的系统,必须按峰值而非均值预留容量。 -而当你想靠加机器来提升容量时,还有最后一个冷水——**通用可扩展性定律(USL)与 Amdahl 定律**: +而当你想靠加机器来提升容量时,还有最后一个冷水——**通用可扩展性定律(USL)与 Amdahl 定律**。还是用大白话:**就像往一个厨房里塞厨师。** 一开始多一个厨师多一份产出;但厨师太多,大家开始抢灶台、互相打招呼、等对方让路——加到某个点,再多塞人反而更慢。机器之间也一样,越多越要「互相对齐、互相协调」,这份开销会吃掉、甚至吃穿你加机器的收益: ``` 理想线性: 10 台 = 10 倍 实际: diff --git "a/tutorial/14-\346\274\224\350\277\233\344\270\216\346\213\206\345\210\206\345\244\247\345\236\213\347\263\273\347\273\237.md" "b/tutorial/14-\346\274\224\350\277\233\344\270\216\346\213\206\345\210\206\345\244\247\345\236\213\347\263\273\347\273\237.md" index 49fc4bf..d598ffe 100644 --- "a/tutorial/14-\346\274\224\350\277\233\344\270\216\346\213\206\345\210\206\345\244\247\345\236\213\347\263\273\347\273\237.md" +++ "b/tutorial/14-\346\274\224\350\277\233\344\270\216\346\213\206\345\210\206\345\244\247\345\236\213\347\263\273\347\273\237.md" @@ -218,6 +218,8 @@ Martin Fowler 2001 年在澳洲雨林里见到一种植物:**绞杀榕(strangler **演进式架构**(Evolutionary Architecture,Neal Ford / Rebecca Parsons / Patrick Kua)给的解药是**适应度函数**:**把你在乎的架构约束,写成一段能自动运行、会失败、能卡住 CI 的测试。** 架构规则一旦能被机器持续验证,它就从"墙上贴的、靠自觉的规范",变成了"违反就红、合不进去的硬约束"。 +> 💧 **深水区(名字唬人,意思很简单)**:「适应度函数」这个学名听着玄,**其实就是「给架构装一套自动体检 + 门禁」**。你平时写单元测试是检查「功能对不对」;适应度函数只是把检查对象换成「架构规矩有没有被破坏」——比如「订单模块是不是偷偷 import 了计费模块的内部代码」。一旦有人违反,CI 直接亮红灯、合不进去。**它是架构的免疫系统:不阻止系统长大,只阻止它在长大的过程中烂掉。** + ``` 适应度函数 = 给架构装上"持续体检",一违反就报警 diff --git 
"a/tutorial/15-\347\273\204\347\273\207\345\215\263\346\236\266\346\236\204.md" "b/tutorial/15-\347\273\204\347\273\207\345\215\263\346\236\266\346\236\204.md" index 1e0661f..e6a4786 100644 --- "a/tutorial/15-\347\273\204\347\273\207\345\215\263\346\236\266\346\236\204.md" +++ "b/tutorial/15-\347\273\204\347\273\207\345\215\263\346\236\266\346\236\204.md" @@ -109,7 +109,14 @@ Conway 自己举的例子刀刀见血:**「如果你派四个小组去写一个 | **赋能团队**(Enabling) | 临时性的能力缺口 | 像「巡回教练」,短期进驻帮流式团队补上某项技能,然后**撤走**(不常驻、不接管) | | **复杂子系统团队**(Complicated-subsystem) | 需要深度专精的硬骨头 | 接管那种「得是博士才搞得定」的部分(搜索排序、视频编码、风控模型),**替别人把这块认知负荷封装掉** | -光有四种团队还不够,Team Topologies 还规定了它们之间**只许有三种「互动模式」**——这等于直接给康威定律里的「沟通结构」立了规矩:**「协作(collaboration)」**(两队短期紧密共创,适合探索期,但要警惕变成永久耦合)、**「服务化(X-as-a-Service)」**(一队把能力做成自助服务给另一队用,沟通成本最低,是平台团队的常态)、**「赋能(facilitating)」**(一队短期帮另一队补能力)。把互动模式管起来,就是在**主动设计系统的接口结构**——你不希望两个服务永久紧耦合,就别让那两个团队永久处于「协作」模式。 +光有四种团队还不够,Team Topologies 还规定了它们之间**只许有三种「互动模式」**。 + +> 💧 **深水区(可跳过,记住「管团队怎么打交道 = 管系统接口长啥样」就够)**。这三种模式说白了就是「两个团队之间该怎么打交道」的三种姿势,而**团队怎么打交道,最后会原样长成系统里两个模块怎么打交道**(这就是康威定律)。三种姿势: +> - **协作(collaboration)**:两队凑一起、肩并肩一起干。**像两个人合写一份文档**——磨合快、适合探索期,但坏处是「你中有我」,容易粘住分不开,所以**只该短期用**。 +> - **服务化(X-as-a-Service)**:一队把能力做成「自助服务」给另一队用,对方照着说明书自取即可。**像你用云盘、点外卖**——不用和对方开会,沟通成本最低,这是平台团队的常态,也是最该追求的默认姿势。 +> - **赋能(facilitating)**:一队像「巡回教练」短期进驻,帮另一队补上某个不会的技能,补完就撤。 +> +> 关键判断:**你不希望两个服务将来永久紧耦合,就别让那两个团队长期处于「协作」模式**——让他们尽早转成「服务化」。管住团队的互动模式,就是在悄悄设计系统的接口结构。 > **判断要点**:这张表的灵魂是**「流式对齐团队是主角,其余三类都是为了给它减负」**。平台、赋能、复杂子系统团队存在的唯一理由,是**让那些直接对用户负责的团队,把宝贵的脑容量花在业务上,而不是花在偶然复杂度上**。**先问「哪个团队认知负荷要爆了」,再决定拆什么——而不是先看技术图谱拍脑袋拆。** 而团队之间默认应走「服务化」(沟通成本最低),只在探索期短暂用「协作」——这就是用组织手段,把系统推向「松耦合」。 > diff --git "a/tutorial/16-\345\256\211\345\205\250\344\270\216\345\244\232\347\247\237\346\210\267\346\236\266\346\236\204.md" "b/tutorial/16-\345\256\211\345\205\250\344\270\216\345\244\232\347\247\237\346\210\267\346\236\266\346\236\204.md" index 7071f17..c2e978e 100644 --- 
"a/tutorial/16-\345\256\211\345\205\250\344\270\216\345\244\232\347\247\237\346\210\267\346\236\266\346\236\204.md" +++ "b/tutorial/16-\345\256\211\345\205\250\344\270\216\345\244\232\347\247\237\346\210\267\346\236\266\346\236\204.md" @@ -40,7 +40,7 @@ I 错误信息泄露内部结构? R 改了订单事后赖账? I 备份/日志里有明文? ``` -**关键不在于背熟六个字母,而在于养成「对着每一道边界,逐条想坏事」的反射。** 你会发现:网关那道边界主要防假冒(S)和拒绝服务(D);服务那道边界主要防越权(E)和篡改(T);数据那道边界主要防拖库泄露(I)。**防御资源,就该按这张图精准投放,而不是均匀地撒。** +**关键不在于背熟六个字母,而在于养成「对着每一道边界,逐条想坏事」的反射。** 给个记住 STRIDE 的笨办法:它就是六个最常见的「坏人想干的事」——**假冒你(S)、改你数据(T)、干完赖账(R)、偷看不该看的(I)、把你打挂(D)、把自己变成管理员(E)**。你不用背字母,只要每经过一道信任边界,就把这六问挨个问一遍。你会发现:网关那道边界主要防假冒(S)和拒绝服务(D);服务那道边界主要防越权(E)和篡改(T);数据那道边界主要防拖库泄露(I)。**防御资源,就该按这张图精准投放,而不是均匀地撒。** > **架构智慧**:补丁思维问「我加了哪些安全措施?」,结构思维问「我的每一条数据流、每一道信任边界,都被谁、用 STRIDE 的哪一条攻击过了吗?」。**前者是清单,后者是地图。** 安全不是往系统上「贴」东西,而是在画架构时就把「攻击者会怎么想」织进结构里——这正是 [06 章](06-质量属性与取舍.md) 那句「不信任输入」的系统化版本。**而它和别的质量属性最大的不同是:性能、可用性可以「先上线、再优化」,安全的结构性漏洞却往往「上线即定型、事后补不上」。** @@ -113,7 +113,7 @@ ↑ 但隔离全靠"代码纪律" 按租户重要性分档 ↑ 但成本/运维随租户数线性涨 ``` -而在「数据怎么隔离」这个最关键的维度上,又是从软到硬的一道谱系: +**这两张谱系其实是同一件事的「拉近看」**:上面那张说的是整套系统(服务 + 库)共享到什么程度,下面这张把镜头推到最要命的「数据」上,看它具体隔到多硬。**一句话记法:从「大家挤一间合租房」到「每人一套独立公寓」,中间是连续的档位,越独立越安全也越贵。** 而在「数据怎么隔离」这个最关键的维度上,又是从软到硬的一道谱系: ``` 软(省钱、易串)◀──────────────────────────────▶ 硬(贵、难串) diff --git "a/tutorial/17-\345\244\247\346\250\241\345\236\213\346\227\266\344\273\243\347\232\204\346\236\266\346\236\204\345\210\244\346\226\255.md" "b/tutorial/17-\345\244\247\346\250\241\345\236\213\346\227\266\344\273\243\347\232\204\346\236\266\346\236\204\345\210\244\346\226\255.md" index 5a43fab..0f79b7e 100644 --- "a/tutorial/17-\345\244\247\346\250\241\345\236\213\346\227\266\344\273\243\347\232\204\346\236\266\346\236\204\345\210\244\346\226\255.md" +++ "b/tutorial/17-\345\244\247\346\250\241\345\236\213\346\227\266\344\273\243\347\232\204\346\236\266\346\236\204\345\210\244\346\226\255.md" @@ -1,10 +1,10 @@ # 17 · 大模型时代的架构判断:vibe coding 时代,你靠什么不可替代 -> 
一句话点题:**当「写代码」坍缩成几秒钟、几句自然语言就能完成的廉价动作,真正稀缺、真正值钱的,只剩下一件事——在动手之前判断「这个系统该长什么样、会死在哪、我在拿什么换什么」。** 这是进阶篇的收官,也是整套教程的收官:把前面所有的硬道理,投射到我们正身处的这个 AI 时刻。 +> 一句话点题:**当「写代码」坍缩成几秒钟、几句自然语言就能完成的廉价动作,真正稀缺、真正值钱的,只剩下一件事——在动手之前判断「这个系统该长什么样、会死在哪、我在拿什么换什么」。** 这是进阶篇的收官:把前面所有的硬道理,投射到我们正身处的这个 AI 时刻。 --- -> **🏁 全教程到这里收尾。** 入门篇(01–09)教你**看懂系统、并从 0 设计一个中小系统**;进阶篇(10–16)教你**驾驭做大做关键后才咬人的硬骨头**:分布式、失败、规模、演进、组织、安全。这最后一章不引入新的"硬骨头",而是做一件更重要的事——**把这套判断力,对准当下:一个 AI 替你写代码(vibe coding)、同时又催生出一批全新系统(LLM / Agent)的时代。** +> **🏁 进阶篇到这里收尾。** 入门篇(01–09)教你**看懂系统、并从 0 设计一个中小系统**;进阶篇(10–16)教你**驾驭做大做关键后才咬人的硬骨头**:分布式、失败、规模、演进、组织、安全。这一章不引入新的"硬骨头",而是做一件更重要的事——**把这套判断力,对准当下:一个 AI 替你写代码(vibe coding)、同时又催生出一批全新系统(LLM / Agent)的时代。** --- @@ -51,7 +51,7 @@ vibe coding 很爽,也很危险。Simon Willison 有一个清醒的区分:**不 ## 三、非确定性:LLM 系统的头号新约束 -传统系统的地基是**确定性**:同样的输入,永远得到同样的输出——所以你能写精确断言、能复现 bug、能回归测试。**LLM 把这块地基抽走了**:同样的输入,因为采样温度、模型版本更新、上下文变化,可能给出**不同的输出**。 +传统系统的地基是**确定性**(同样的输入永远得到同样的输出)。**LLM 把这块地基抽走了**——这就是**非确定性**:同样一句话问两遍,因为采样随机性、模型版本更新、上下文变化,它可能给你**两个不一样的答案**。过去你能写精确断言、能复现 bug、能回归测试,全靠「输入定、输出就定」;这块地基一抽走,整套测试和验收的打法都得换。 这是架构级的新约束,逼出一套新打法: @@ -68,7 +68,7 @@ vibe coding 很爽,也很危险。Simon Willison 有一个清醒的区分:**不 ## 四、上下文工程:新的「内存层级」 -LLM 的上下文窗口,就是它的**工作内存**——有限、且按 token 计费昂贵。如何"在有限且昂贵的窗口里,放进恰好够用的信息",成了 LLM 系统的核心功夫,这就是**上下文工程(context engineering)**。它本质上是 [05 · 数据与状态](05-数据与状态.md) 的"为访问形态选存储"和 [13 · 规模化的力学](13-规模化的力学.md) 的"内存层级"在 AI 时代的回响: +LLM 的上下文窗口,就是它的**工作内存**——有限、且按 token 计费昂贵。如何"在有限且昂贵的窗口里,放进恰好够用的信息",成了 LLM 系统的核心功夫,这就是**上下文工程(context engineering)**。说白了就是:**模型的「脑容量」只有这么大、还按量收费,你得精挑细选往里塞什么——塞少了它没依据瞎答,塞多了又贵又让它「抓不住重点」。** 它本质上是 [05 · 数据与状态](05-数据与状态.md) 的"为访问形态选存储"和 [13 · 规模化的力学](13-规模化的力学.md) 的"内存层级"在 AI 时代的回响: ``` 新的"内存层级"(越往下越便宜、越慢、越大): diff --git "a/tutorial/23-\350\247\204\346\240\274\345\215\263\346\236\266\346\236\204\347\272\246\346\235\237\346\200\216\344\271\210\345\206\231\347\273\231AI.md" 
"b/tutorial/23-\350\247\204\346\240\274\345\215\263\346\236\266\346\236\204\347\272\246\346\235\237\346\200\216\344\271\210\345\206\231\347\273\231AI.md" index bc06cd4..04df18d 100644 --- "a/tutorial/23-\350\247\204\346\240\274\345\215\263\346\236\266\346\236\204\347\272\246\346\235\237\346\200\216\344\271\210\345\206\231\347\273\231AI.md" +++ "b/tutorial/23-\350\247\204\346\240\274\345\215\263\346\236\266\346\236\204\347\272\246\346\235\237\346\200\216\344\271\210\345\206\231\347\273\231AI.md" @@ -6,7 +6,7 @@ > **🤝 AI 协同设计篇第 1 章 · 本篇讲什么** > -> **前置:实战篇(18–22)。** 你已经会**设计、演进、拆迁** AI 系统了。这最后一篇换个视角:**不是「你怎么造」,而是「你怎么和 AI 一起造,且不失控」。** 它不讲 vibe coding 的工具技巧,只讲两件事——**怎么把约束写给 AI(本章 + 25)、怎么审查 AI 的产出(24)**,最后(26)收成一套决策树。和 [architecture-copilot](https://github.com/study8677/architecture-copilot) skill 同一条产品线。 +> **前置:实战篇(18–22)。** 你已经会**设计、演进、拆迁** AI 系统了。AI 协同这一篇换个视角:**不是「你怎么造」,而是「你怎么和 AI 一起造,且不失控」。** 它不讲 vibe coding 的工具技巧,只讲两件事——**怎么把约束写给 AI(本章 + 25)、怎么审查 AI 的产出(24)**,最后(26)收成一套决策树。和 [architecture-copilot](https://github.com/study8677/architecture-copilot) skill 同一条产品线。 --- diff --git "a/tutorial/25-\350\257\204\346\265\213\351\251\261\345\212\250\346\212\212\345\244\237\345\245\275\345\206\231\350\277\233\346\236\266\346\236\204.md" "b/tutorial/25-\350\257\204\346\265\213\351\251\261\345\212\250\346\212\212\345\244\237\345\245\275\345\206\231\350\277\233\346\236\266\346\236\204.md" index d8d306c..5437d21 100644 --- "a/tutorial/25-\350\257\204\346\265\213\351\251\261\345\212\250\346\212\212\345\244\237\345\245\275\345\206\231\350\277\233\346\236\266\346\236\204.md" +++ "b/tutorial/25-\350\257\204\346\265\213\351\251\261\345\212\250\346\212\212\345\244\237\345\245\275\345\206\231\350\277\233\346\236\266\346\236\204.md" @@ -137,7 +137,7 @@ - **eval 不替代传统测试**:确定性逻辑照样单测,eval 只管非确定输出的质量分布。 - **eval 不免费**:烧钱、裁判会错、会过拟合老化——要权衡覆盖与成本、持续校准维护。它是 AI 系统的「质量适应度函数」([14](14-演进与拆分大型系统.md))。 -> **承上启下**:到这儿,AI 协同的三件武器齐了——**规格(23)给约束、清单(24)审产出、eval(25)守质量**。但什么时候用哪件、什么时候干脆放手 vibe、什么时候必须 
spec-first?最后一章 [26 · 协作决策树:何时 vibe、何时 spec-first](26-协作决策树何时vibe何时spec-first.md) 把这三件武器收成**一套可照着走的 workflow**,并为整套 26 章教程收尾。 +> **承上启下**:到这儿,AI 协同的三件武器齐了——**规格(23)给约束、清单(24)审产出、eval(25)守质量**。但什么时候用哪件、什么时候干脆放手 vibe、什么时候必须 spec-first?AI 协同设计篇最后一章 [26 · 协作决策树:何时 vibe、何时 spec-first](26-协作决策树何时vibe何时spec-first.md) 把这三件武器收成**一套可照着走的 workflow**。 --- diff --git "a/tutorial/26-\345\215\217\344\275\234\345\206\263\347\255\226\346\240\221\344\275\225\346\227\266vibe\344\275\225\346\227\266spec-first.md" "b/tutorial/26-\345\215\217\344\275\234\345\206\263\347\255\226\346\240\221\344\275\225\346\227\266vibe\344\275\225\346\227\266spec-first.md" index c06e8b9..576a66a 100644 --- "a/tutorial/26-\345\215\217\344\275\234\345\206\263\347\255\226\346\240\221\344\275\225\346\227\266vibe\344\275\225\346\227\266spec-first.md" +++ "b/tutorial/26-\345\215\217\344\275\234\345\206\263\347\255\226\346\240\221\344\275\225\346\227\266vibe\344\275\225\346\227\266spec-first.md" @@ -1,10 +1,10 @@ # 26 · 协作决策树:何时 vibe、何时 spec-first -> 一句话点题:**不是「该不该用 AI 写代码」,而是「这一段,该放手 vibe,还是先把规格立好再让它写」。原型尽情 vibe,生产用判断收口——这一章把前三章(规格 / 审查 / 评测)收成一棵能照着走的决策树,也为整套 26 章教程画上句号。** +> 一句话点题:**不是「该不该用 AI 写代码」,而是「这一段,该放手 vibe,还是先把规格立好再让它写」。原型尽情 vibe,生产用判断收口——这一章把前三章(规格 / 审查 / 评测)收成一棵能照着走的决策树,也为 AI 协同设计篇收口。** --- -> **🏁 全教程最后一章。** 你已经走过四篇:**看懂系统、从 0 设计([01–09](01-为什么先有架构思维.md))→ 驾驭做大做关键的硬骨头([10–17](10-分布式系统的硬道理.md))→ 把方法落到真实 AI 系统([18–22](18-读地图用框架拆解陌生系统.md))→ 学会把约束写给 AI、审查它、给它的质量守门([23–25](23-规格即架构约束怎么写给AI.md))**。这一章把最后三件武器收成一套 workflow,然后,就该轮到你了。 +> **🤝 AI 协同设计篇最后一章。** 你已经走过四段能力:**看懂系统、从 0 设计([01–09](01-为什么先有架构思维.md))→ 驾驭做大做关键的硬骨头([10–17](10-分布式系统的硬道理.md))→ 把方法落到真实 AI 系统([18–22](18-读地图用框架拆解陌生系统.md))→ 学会把约束写给 AI、审查它、给它的质量守门([23–25](23-规格即架构约束怎么写给AI.md))**。这一章把最后三件武器收成一套 workflow。 --- @@ -108,14 +108,14 @@ ## 五、回到原点:你练的到底是什么 -绕了 26 章,我们回到 [01 章](01-为什么先有架构思维.md) 和这个仓库存在的理由: +走到 AI 协同设计篇末尾,我们回到 [01 章](01-为什么先有架构思维.md) 和这个仓库存在的理由: > **写代码正在变廉价,而架构判断力,正变得前所未有地稀缺和值钱。** 这套教程从头到尾,教的从来不是某个框架、某种语法——那些 AI 几秒就能产出。教的是一种**不会贬值**的东西: 
``` - 会贬值的(AI 正在让它廉价) 不会贬值的(你这 26 章练的) + 会贬值的(AI 正在让它廉价) 不会贬值的(前面这些章节练的) ────────────────────── ────────────────────────────── • 记住某个 API / 语法 • 拿到模糊需求,问出对的问题(02/07) • 手写样板实现 • 在取舍中做有据的决策(06/08) @@ -125,7 +125,7 @@ → 交给 AI → 这,才是 AI 时代的你 ``` -而最后这一篇(23–26)其实在说一件事:**架构判断力,在 AI 时代不仅没过时,还多了一个全新的、极高杠杆的用武之地——通过「规格 / 审查 / 评测 / 决策」这个接口,把你的判断,变成一支 AI 大军的行为约束。** 你不再只是「设计一个系统」,而是「**设计一套让 AI 持续造出好系统的护栏与流程**」。这是判断力的放大,不是替代。 +而 AI 协同这一篇(23–26)其实在说一件事:**架构判断力,在 AI 时代不仅没过时,还多了一个全新的、极高杠杆的用武之地——通过「规格 / 审查 / 评测 / 决策」这个接口,把你的判断,变成一支 AI 大军的行为约束。** 你不再只是「设计一个系统」,而是「**设计一套让 AI 持续造出好系统的护栏与流程**」。这是判断力的放大,不是替代。 > **架构智慧**:**vibe coding 不是判断力的终结,是它的杠杆。** 一个有判断力的人 + AI = 十个人的产出;一个没判断力的人 + AI = 十倍速地制造你看不懂、扛不住、改不动的系统。决定结果的,从来不是 AI 多强,而是握着方向盘的人,看不看得清地图。 @@ -159,9 +159,9 @@ --- -## 🏁 全教程结语:轮到你了 +## 🤝 AI 协同篇结语:判断力继续往下走 -26 章,四篇,一条主线: +到 26 章为止,你已经走过一条主线: ``` 01–09 看懂系统、从 0 设计一个中小系统 —— 建立判断 @@ -176,11 +176,9 @@ 这套教程给你的从来不是结论,是**提问的能力**。当你对每一个技术选择、每一段 AI 产出,都能自然地问出「**为什么是它?代价是什么?会死在哪?**」——你就已经在用架构师的方式思考了。 -> **现在,合上教程。** 挑一个你正在做的系统,或 [`templates/`](../templates/README.md) 里任意一张地图,走一遍:**读懂它([18](18-读地图用框架拆解陌生系统.md))→ 自己设计一版([19](19-完整设计演练中等复杂度系统.md))→ 想清楚它怎么演进([20](20-演进剧本MVP到规模化.md))→ 把约束写给 AI([23](23-规格即架构约束怎么写给AI.md))→ 让 AI 造、你来收口([24](24-审查清单AI产出默认缺什么.md)/[25](25-评测驱动把够好写进架构.md))。** +> **继续往下走。** 会设计、会协作之后,你还会在真实项目里不断遇到「到底用什么技术」的问题。下一篇 [27 · 编程语言与后端框架选型](27-编程语言与后端框架选型.md) 开始,我们把同一套架构判断力落到技术栈选型上:语言、数据库、缓存、API、部署、观测、AI 基础设施,都不再凭热度拍板。 > -> 想要一个「带着走」的教练,就用配套的 [architecture-copilot](https://github.com/study8677/architecture-copilot) skill——它把这 26 章变成在 Claude Code / Cursor / Codex 里**一步步引导你做架构判断**的交互式伙伴。 -> -> 在一个 AI 替所有人写代码的时代,愿你成为那个——**先看清地图,再决定上不上路的人。** +> 想要一个「带着走」的教练,就用配套的 [architecture-copilot](https://github.com/study8677/architecture-copilot) skill——它把这套教程变成在 Claude Code / Cursor / Codex 里**一步步引导你做架构判断**的交互式伙伴。 --- diff --git "a/tutorial/27-\347\274\226\347\250\213\350\257\255\350\250\200\344\270\216\345\220\216\347\253\257\346\241\206\346\236\266\351\200\211\345\236\213.md" 
"b/tutorial/27-\347\274\226\347\250\213\350\257\255\350\250\200\344\270\216\345\220\216\347\253\257\346\241\206\346\236\266\351\200\211\345\236\213.md" new file mode 100644 index 0000000..14c8ac1 --- /dev/null +++ "b/tutorial/27-\347\274\226\347\250\213\350\257\255\350\250\200\344\270\216\345\220\216\347\253\257\346\241\206\346\236\266\351\200\211\345\236\213.md" @@ -0,0 +1,188 @@ +# 27 · 编程语言与后端框架选型 + +> 一句话点题:**语言和框架不是信仰题,而是约束题。架构师不问「哪个最流行」,而问「这个选择会改变团队速度、运行成本、性能上限、生态可得性和未来迁移成本吗」。如果答案是会,它就是架构决策;如果答案只是写法不同,它就是实现细节。** + +--- + +> **🧰 技术栈选型篇第 1 章 · 本章只练一件事** +> +> 前 26 章一直强调「架构不是框架」。但现实里,你还是要选 Java、Go、Python、TypeScript、Rust,还是某个 Web 框架。技术栈选型篇不是回来教语法,而是把「用什么技术」重新拉回 [02 章](02-架构师的思考框架.md) 的框架:需求、约束、质量属性、取舍。 + +--- + +## 开场:技术选型不是投票 + +很多团队选语言,像在做投票: + +``` + 我喜欢 Go → 用 Go + 招 Java 容易 → 用 Java + AI 生态都在 Python → 用 Python + 前后端统一 TS → 用 TypeScript +``` + +这些理由不是没用,但它们只是**线索**,不是决策。真正的架构判断要继续往下问: + +- 这条业务链路的性能瓶颈是什么? +- 团队最缺的是交付速度、运行效率、稳定性,还是招聘可得性? +- 这个生态里有没有成熟的库、框架、调试工具、监控方案? +- 三年后要换人维护,新人能不能读懂、改动、上线? + +> **判断标准:**如果一个语言/框架选择会显著影响质量属性([06 章](06-质量属性与取舍.md)),它就是架构决策;如果只是代码风格不同,就别把它上升到架构战争。 + +--- + +## 一、先分清:语言、运行时、框架分别影响什么 + +新手常把三件事混在一起: + +| 层次 | 它决定什么 | 例子 | 架构上真正要看 | +|---|---|---|---| +| **语言** | 表达方式、类型系统、生态入口 | Java、Go、Python、TypeScript、Rust | 团队熟悉度、长期维护、错误能否早暴露 | +| **运行时** | 并发模型、内存、启动速度、部署形态 | JVM、Node.js、CPython、Go runtime | 延迟、吞吐、资源成本、冷启动 | +| **框架** | 约定、组件组合、开发节奏 | Spring Boot、FastAPI、NestJS、Gin | 交付速度、可测试性、插件生态、团队一致性 | + +所以,「我们用 Java 还是 Go」不是一个完整问题。更完整的问题是: + +> 在当前团队、当前业务、当前质量目标下,我们需要一个什么样的**运行与交付模型**? + +例如内部 SaaS 后台([案例 02 PatchDesk](../cases/patchdesk-saas/README.md))最初的核心不是极限性能,而是权限、多租户、审计、报表这些业务复杂度。此时成熟框架和团队可维护性,通常比单机 QPS 更重要。 + +--- + +## 二、五把尺子:别从工具名开始 + +做语言/框架选型,先用五把尺子量: + +| 尺子 | 要问的问题 | 偏向的选择 | +|---|---|---| +| **业务复杂度** | 规则多、状态多、权限多吗? | 类型系统强、工程约定强、测试生态成熟 | +| **性能与资源** | CPU、内存、尾延迟(P99,最慢 1% 请求)是不是核心? | 运行时开销低、并发模型清晰 | +| **生态成熟度** | 支付、鉴权、ORM、消息、监控有没有现成方案? | 生态深、文档多、社区稳定 | +| **团队能力** | 团队会什么?招人容易吗?代码评审能不能守住质量? 
| 团队主语言,或学习成本可控的新语言 | +| **交付与演进** | 需要快速迭代,还是长期高可靠? | 框架约定清晰、迁移路径明确 | + +> **架构智慧:**不要为了「语言更先进」换栈。只有当新技术能明确换来某个质量属性,并且你愿意支付学习、运维、招聘、迁移成本时,它才值得进入候选。 + +--- + +## 三、常见后端语言的架构取舍 + +下面不是排名,而是帮助你形成判断: + +| 技术 | 常见优势 | 常见代价 | 适合什么场景 | +|---|---|---|---| +| **Java / Kotlin + JVM** | 生态成熟、企业库多、性能稳定、可维护性强 | 项目可能偏重,启动慢一些,框架复杂度高 | 中大型业务系统、金融、电商、SaaS 后台 | +| **Go** | 部署简单、并发模型直接、资源占用低 | 泛型/工程抽象历史包袱较多,复杂业务表达需要纪律 | 网关、基础设施、微服务、实时链路 | +| **Python** | AI/数据生态强、原型快、表达成本低 | 运行性能和并发模型要小心,大型工程需要强约束 | AI 服务、数据平台、自动化、低 QPS 后台 | +| **TypeScript / Node.js** | 前后端同语言、I/O 并发友好、生态大 | CPU 密集不合适,依赖生态质量参差 | BFF(Backend for Frontend,面向前端的后端)、中小 SaaS、实时轻服务 | +| **Rust** | 性能和内存安全强、适合底层系统 | 学习曲线高、交付速度可能慢 | 存储、代理、引擎、对性能/安全极敏感的组件 | + +注意这里的关键词是「组件」。一个系统不一定只能一种语言: + +``` + 业务主服务: Java / Go / TypeScript + AI 推理与数据处理: Python + 高性能代理或存储引擎: Rust / Go + 前端与 BFF: TypeScript +``` + +多语言不是罪,但要小心它带来的**认知税**:构建、部署、监控、调试、招聘、代码审查都会变多。小团队为了「每个模块都用最合适语言」而引入 5 种栈,通常是在提前透支组织能力([15 章](15-组织即架构.md))。 + +--- + +## 四、框架选型:默认选成熟,除非你有明确反证 + +框架不是越轻越好,也不是越全越好。它本质上是在帮团队做约定: + +``` + 框架给你: + 路由 / 依赖注入 / 配置 / 数据访问 / 鉴权 / 测试 / 可观测性入口 + + 框架也拿走: + 自由度 / 学习成本 / 升级成本 / 调试透明度 +``` + +如果系统处在 MVP 阶段,你要优先降低交付风险;如果系统会长期多人维护,你要优先降低协作风险。很多时候,**成熟而稍显笨重的框架**,比「很酷但全靠团队自律」的轻框架更稳。 + +可以用这张表做第一轮筛选: + +| 问题 | 如果答案是「是」 | 选型倾向 | +|---|---|---| +| 团队新人多、多人长期维护? | 是 | 约定强、文档好、生态成熟的框架 | +| 需要极快试错、业务还不确定? | 是 | 轻量框架 + 清晰模块边界 | +| 有大量企业集成、事务、权限? | 是 | 成熟企业框架 | +| 是高并发网关/代理? | 是 | 运行时开销低、网络模型成熟的框架 | +| 是 AI / 数据密集服务? 
| 是 | 优先靠近 Python / 数据生态,外层用稳定 API 包起来 | + +--- + +## 五、什么时候该换语言或换框架 + +不要因为「看起来旧」就换。换技术栈应该由**触发信号**推动: + +| 触发信号 | 说明 | 可能动作 | +|---|---|---| +| P99 长期超 SLO(服务等级目标),且瓶颈来自运行时 | 不是写法问题,而是模型不匹配 | 把热点链路拆成更合适的运行时 | +| 团队交付越来越慢,框架约定挡路 | 业务复杂度超过原框架承载 | 先模块化,再局部迁移 | +| 生态缺失导致大量自研 | 维护成本持续升高 | 换到生态更成熟的栈 | +| 招聘和代码审查困难 | 组织能力跟不上技术选择 | 收敛语言,或加强平台约束 | +| 安全/合规/性能要求变高 | 旧栈很难补齐能力 | 对关键组件单独升级 | + +> 对照 [14 章](14-演进与拆分大型系统.md):换栈也要像拆单体一样做。不要「大爆炸重写」,先抽边界、并行运行、影子流量、逐步切换。 + +--- + +## 六、一个可复制的选型结论模板 + +不要在 ADR 里写「我们选择 Go,因为 Go 很快」。写成这样: + +```md +### ADR-027:订单入口服务使用 Go + Gin + +- 背景:下单入口是 CPU 不重但连接数高的 I/O 链路,P99 目标 200ms,团队已有 Go 运维经验。 +- 选择:入口层使用 Go + Gin,业务状态机仍留在 Java 订单服务。 +- 放弃:放弃单语言仓库的简单性,增加一条构建链路。 +- 换来:入口层部署更轻、连接处理更直接,可以独立扩容和限流。 +- 复审条件:如果入口逻辑变成复杂业务编排,或团队 Go 维护能力不足,重新评估是否回收进主业务栈。 +``` + +这一段的重点不是 Go,而是**为什么只把入口层切出去**,以及你承认了代价。 + +--- + +## 🎯 随堂检验 + + + + + +--- + +## 本章小结 + +- **语言/框架不是信仰题,是约束题**:先问业务复杂度、性能、生态、团队、演进,再看工具名。 +- **分清语言、运行时、框架**:语言影响表达和生态,运行时影响资源与并发,框架影响团队约定和交付节奏。 +- **成熟默认优先**:除非你有明确质量属性收益,否则不要为了「先进」换栈。 +- **多语言要付认知税**:构建、部署、监控、调试、招聘、审查都会变复杂。 +- **换栈也要演进式**:像 [14 章](14-演进与拆分大型系统.md) 一样,抽边界、并行运行、逐步迁移,不要大爆炸重写。 + +> **承上启下**:语言和框架解决的是「代码如何运行与协作」。但系统真正长期难的是数据。下一章 [28 · 数据库与存储选型](28-数据库与存储选型.md),我们把问题从「服务怎么写」推进到「数据到底放在哪里、怎么查、怎么一致、怎么省钱」。 + +--- + +## 相关链接 + +- 方法论本体:[02 · 架构师的思考框架](02-架构师的思考框架.md) · [06 · 质量属性与取舍](06-质量属性与取舍.md) · [08 · ADR](08-架构决策记录与演进.md) +- 演进配套:[14 · 演进与拆分大型系统](14-演进与拆分大型系统.md) · [15 · 组织即架构](15-组织即架构.md) +- 案例对照:[PatchDesk:轻量工单 SaaS](../cases/patchdesk-saas/README.md) · [CodePilot:编码 Agent 平台](../cases/codepilot-agent/README.md) diff --git "a/tutorial/28-\346\225\260\346\215\256\345\272\223\344\270\216\345\255\230\345\202\250\351\200\211\345\236\213.md" "b/tutorial/28-\346\225\260\346\215\256\345\272\223\344\270\216\345\255\230\345\202\250\351\200\211\345\236\213.md" new file mode 100644 index 0000000..5affc0f --- /dev/null +++ "b/tutorial/28-\346\225\260\346\215\256\345\272\223\344\270\216\345\255\230\345\202\250\351\200\211\345\236\213.md" @@ 
-0,0 +1,202 @@ +# 28 · 数据库与存储选型 + +> 一句话点题:**数据库不是「MySQL 还是 PostgreSQL」这么小的问题,而是「这份数据的读写模式、一致性要求、查询形态、增长速度和失败代价是什么」。选存储,先画数据生命周期;工具名排在后面。** + +--- + +> **🧰 技术栈选型篇第 2 章 · 本章只练一件事** +> +> [05 章](05-数据与状态.md) 说过:系统真正难的不是逻辑,是数据。语言可以换,服务可以拆,但数据一旦放错地方,迁移成本会非常高。本章把数据库、缓存、搜索、对象存储、向量库放到同一张选型地图里看。 + +--- + +## 开场:别用一个数据库扛所有问题 + +很多系统一开始长这样: + +``` + App ──▶ 一个关系型数据库 + ├─ 交易数据 + ├─ 报表查询 + ├─ 搜索筛选 + ├─ 文件附件 + └─ AI 检索向量 +``` + +MVP 这样做没错。问题是,随着业务长大,这些数据的访问方式完全不同: + +- 订单要强一致,不能丢。 +- 报表要扫大量历史数据,不能拖垮主库。 +- 搜索要相关性排序,不是简单 `LIKE`。 +- 图片、视频、附件要便宜存、能走 CDN(内容分发网络)。 +- RAG(检索增强生成)要向量召回和权限过滤。 + +> **架构判断:**不是「哪个数据库最好」,而是「哪类数据,该用哪种存储模型承载」。一个系统常常需要多种存储,但每增加一种,都会增加一致性、同步、运维和排障成本。 + +--- + +## 一、先画数据生命周期 + +选数据库之前,先把一类数据从出生到归档画出来: + +``` + 写入 ──▶ 校验 ──▶ 事务提交 ──▶ 查询/检索 ──▶ 分析/报表 ──▶ 归档/删除 + │ │ │ │ │ │ + 谁写? 怎么验? 要多一致? 怎么查? 多久查一次? 保留多久? +``` + +然后回答五个问题: + +| 问题 | 为什么重要 | +|---|---| +| **写多还是读多?** | 决定是否需要读写分离、缓存、索引模型 | +| **事务边界在哪里?** | 决定能不能用关系型数据库的事务兜住 | +| **查询形态是什么?** | 主键查、范围查、全文搜索、向量相似度,完全不同 | +| **数据增长速度多快?** | 决定分区、冷热分层、归档策略 | +| **错了会怎样?** | 决定一致性、备份、审计、恢复等级 | + +这一步和 [07 章](07-从0到1设计一个系统.md) 的信封背面估算是一回事:先算数据量、读写比、保留周期,再谈工具。 + +--- + +## 二、主存储:关系型数据库仍然是默认起点 + +如果你不知道该选什么,大多数业务系统默认从关系型数据库开始: + +| 类型 | 适合 | 不适合 | +|---|---|---| +| **关系型数据库**(如 PostgreSQL、MySQL) | 交易、订单、权限、租户、账务、需要事务的数据 | 极大规模分析、全文搜索、海量非结构化文件 | +| **文档数据库**(Document DB,如 MongoDB) | 结构经常变化、文档整体读写、弱关系数据 | 强事务、多表复杂关联、严格报表 | +| **键值存储**(Key-Value,如 DynamoDB、Redis 持久化形态) | 按 key 高速读写、结构简单、超大规模 | 临时复杂查询、灵活关联 | +| **列式/分析数据库**(OLAP,在线分析处理) | 报表、聚合、日志分析、行为分析 | 高频小事务写入 | + +> **默认建议:**先用关系型数据库把核心交易数据放稳。等报表、搜索、日志、向量检索真的变成独立压力,再把对应读模型拆出去。不要一上来为了「现代」把每类数据都塞进不同数据库。 + +--- + +## 三、读模型:搜索、分析、向量不是主库的附属品 + +当查询形态开始偏离主库擅长的方向,就要考虑读模型: + +| 需求 | 常见存储/引擎 | 关键取舍 | +|---|---|---| +| **全文搜索** | Elasticsearch、OpenSearch、Meilisearch | 相关性强,但索引同步和最终一致要处理 | +| **报表分析** | ClickHouse、BigQuery、Snowflake | 扫描聚合快,但不是交易主库 | +| **对象存储** | S3、OSS、GCS | 存文件便宜可靠,但不能当数据库做复杂查询 | +| **向量数据库**(Vector DB) | Milvus、Qdrant、pgvector | 相似度检索强,但权限过滤、召回质量和成本要评估 | +| 
**时序数据库**(Time Series DB) | Prometheus、InfluxDB | 指标时间线查询强,但不适合普通业务对象 | + +这里最容易犯的错是把读模型当成事实源: + +``` + 正确: + 主库 = 事实源(Source of Truth) + 搜索/分析/向量库 = 从主库或对象存储同步出来的读模型 + + 错误: + 用户改了资料 → 只改搜索索引 → 主库不知道 +``` + +读模型可以落后,但要说清楚**能落后多久**、**怎么补偿**、**怎么重建**。这就是 [11 章](11-数据一致性工程.md) 的一致性工程。 + +--- + +## 四、RAG 场景:向量库不是唯一答案 + +RAG 企业知识库([案例 03 DocuMind](../cases/documind-rag/README.md))很容易被简化成: + +``` + 文档切块 ──▶ 向量库 ──▶ topK ──▶ LLM 回答 +``` + +真实系统更像: + +``` + 原文对象存储 ──▶ 解析/切块 ──▶ 元数据主库 + │ │ + ├────────▶ 关键词索引 ◀────┤ + ├────────▶ 向量索引 ◀────┤ + └────────▶ 权限/租户过滤 ◀──┘ +``` + +选型时要问: + +- 权限过滤是在检索前做,还是检索后过滤?后过滤可能召回不够。 +- 只做向量召回,还是混合检索(Hybrid Search,关键词 + 向量)? +- 原文和引用存在哪里?向量库不应该是唯一事实源。 +- 索引坏了能不能从原文和元数据重建? +- 成本能不能随文档量和查询量线性增长? + +> 向量库解决的是「相似度召回」,不是权限、事实源、引用、评测的全部问题。不要把一个 RAG 系统压扁成一个库。 + +--- + +## 五、什么时候该拆存储 + +拆存储的触发信号要具体: + +| 信号 | 说明 | 可能动作 | +|---|---|---| +| 报表查询拖慢交易库 | OLTP(在线事务处理)和 OLAP(在线分析处理)互相干扰 | 同步到分析库 | +| 搜索相关性差、LIKE 扫描慢 | 查询形态变成全文搜索 | 建搜索索引 | +| 附件/图片撑爆数据库 | 二进制文件不该塞主库 | 迁到对象存储 + CDN | +| 单表增长导致索引和备份变慢 | 数据生命周期没有分层 | 分区、归档、冷热分离 | +| RAG 召回质量不稳 | 单一向量召回不够 | 混合检索 + 重排 + eval | + +也要记住反面: + +> 如果当前数据量小、团队少、故障代价低,一套关系型数据库 + 合理索引 + 定期备份,往往比过早引入五种存储更健康。 + +--- + +## 六、写一条存储选型 ADR + +```md +### ADR-028:报表查询从交易库拆到 ClickHouse + +- 背景:工单列表和状态流转依赖主库,但月度报表开始扫描 2 亿行历史事件,导致交易库 P99 从 120ms 升到 900ms。 +- 选择:主库继续作为事实源;通过 Outbox 事件同步到 ClickHouse,报表只查分析库。 +- 放弃:放弃报表的强实时性,允许 1 分钟以内延迟。 +- 换来:交易链路与报表链路隔离,报表聚合性能提升,主库负载稳定。 +- 复审条件:如果报表必须秒级实时,或同步延迟超过业务可接受范围,重新评估流式同步和预聚合。 +``` + +这条 ADR 的关键是:它没有说「ClickHouse 很快」,而是说清楚了**哪个读模型拖垮了哪个事实源**。 + +--- + +## 🎯 随堂检验 + + + + + +--- + +## 本章小结 + +- **选存储先画数据生命周期**:写入、校验、事务、查询、分析、归档,每一步都有约束。 +- **关系型数据库仍是多数业务系统的默认事实源**:先稳住核心交易数据,再拆读模型。 +- **搜索、分析、向量库是不同查询形态的读模型**:它们强在特定查询,弱在事务和事实源。 +- **RAG 不是一个向量库**:还包括原文、元数据、权限、混合检索、重排、引用和 eval。 +- **拆存储要靠触发信号**:报表拖垮主库、搜索变慢、附件撑爆库、召回质量不稳,这些才是升级理由。 + +> **承上启下**:主存储解决「事实放在哪里」。但系统一旦有热点、洪峰、异步协作,就会遇到缓存、消息队列和事件系统。下一章 [29 · 缓存、消息队列与事件系统选型](29-缓存消息队列与事件系统选型.md),我们专门讲这些「很有用,也很容易被误用」的中间层。 + +--- + +## 相关链接 + +- 方法论本体:[05 · 
数据与状态](05-数据与状态.md) · [06 · 质量属性与取舍](06-质量属性与取舍.md) · [11 · 数据一致性工程](11-数据一致性工程.md) +- 演进配套:[13 · 规模化的力学](13-规模化的力学.md) · [14 · 演进与拆分大型系统](14-演进与拆分大型系统.md) +- 案例对照:[DocuMind:企业 RAG 知识库](../cases/documind-rag/README.md) · [StarArena:演唱会抢票系统](../cases/stararena-ticketing/README.md) diff --git "a/tutorial/29-\347\274\223\345\255\230\346\266\210\346\201\257\351\230\237\345\210\227\344\270\216\344\272\213\344\273\266\347\263\273\347\273\237\351\200\211\345\236\213.md" "b/tutorial/29-\347\274\223\345\255\230\346\266\210\346\201\257\351\230\237\345\210\227\344\270\216\344\272\213\344\273\266\347\263\273\347\273\237\351\200\211\345\236\213.md" new file mode 100644 index 0000000..b27cdab --- /dev/null +++ "b/tutorial/29-\347\274\223\345\255\230\346\266\210\346\201\257\351\230\237\345\210\227\344\270\216\344\272\213\344\273\266\347\263\273\347\273\237\351\200\211\345\236\213.md" @@ -0,0 +1,219 @@ +# 29 · 缓存、消息队列与事件系统选型 + +> 一句话点题:**缓存不是数据库,消息队列不是银弹,事件系统也不是「加个 Kafka」就完事。它们解决的是三类压力:读热点、写洪峰、跨边界协作。选之前先问:我是在降低延迟、削平峰值,还是解耦状态推进?** + +--- + +> **🧰 技术栈选型篇第 3 章 · 本章只练一件事** +> +> [28 章](28-数据库与存储选型.md) 讲事实源和读模型。本章讲事实源周围最常见的三类中间层:缓存(Cache,临时加速读)、消息队列(Message Queue,异步排队)、事件系统(Event System,用事件表达事实变化)。它们能救系统,也能把系统变得更难懂。 + +--- + +## 开场:这三个东西经常被混用 + +很多架构图里会出现: + +``` + App ──▶ Redis ──▶ MQ ──▶ Kafka ──▶ Worker +``` + +然后大家说:有缓存、有队列、有事件驱动,很高级。但真正要问的是: + +- Redis 里放的是可丢的缓存,还是不该丢的业务状态? +- MQ 是为了削峰,还是为了让两个服务异步协作? +- Kafka 里的消息是命令(Command,让别人做事),还是事件(Event,告诉别人发生了什么)? +- 消费失败、重复消费、乱序、积压怎么办? 
+ +> **架构判断:**中间层的价值不在名字,而在它改变了什么质量属性:延迟、吞吐、可用性、耦合度、一致性、恢复成本。 + +--- + +## 一、缓存:只加速,不篡位 + +缓存适合解决**读热点**: + +``` + 用户请求 ──▶ 应用 ──▶ 缓存命中 → 快速返回 + └─ 缓存未命中 → 查主库 → 写缓存 → 返回 +``` + +但缓存最容易犯三个错: + +| 错误 | 后果 | 正确姿势 | +|---|---|---| +| 把缓存当事实源 | 缓存丢了数据就丢 | 主库是事实源,缓存可重建 | +| 不设计失效策略 | 用户看到旧数据或脏数据 | TTL(过期时间)、主动失效、版本号 | +| 所有请求一起穿透 | 主库被打爆 | 空值缓存、请求合并、限流、预热 | + +缓存选型也要看数据形态: + +| 缓存类型 | 适合 | 注意 | +|---|---|---| +| **本地缓存** | 配置、字典、低频变化数据 | 多实例不一致,更新慢 | +| **分布式缓存**(如 Redis) | 热点对象、会话、计数、限流 | 网络开销、容量、淘汰策略 | +| **CDN**(内容分发网络) | 图片、视频、静态资源、公开页面 | 失效延迟、边缘缓存一致性 | + +> 判断句:如果缓存丢了,系统应该变慢,而不是变错。变错,说明你把业务状态偷偷放进缓存了。 + +--- + +## 二、消息队列:把「现在必须做」改成「可以排队做」 + +消息队列最常见的价值是**削峰填谷**: + +``` + 洪峰请求 ──▶ 入口限流 ──▶ 队列 ──▶ Worker 按能力消费 ──▶ 主库 +``` + +它把瞬时压力变成可控排队,常见于: + +- 抢票锁座后的出票通知。 +- 下单成功后的发券、短信、邮件。 +- 文档上传后的解析、切块、索引。 +- 视频上传后的转码。 + +但队列引入后,同步世界变成异步世界: + +| 新问题 | 你必须回答 | +|---|---| +| **重复消息** | 消费者是否幂等?同一条消息处理两次会怎样? | +| **消息丢失** | 生产、存储、消费确认链路怎么保证? | +| **消息乱序** | 是否需要同一业务键内有序? | +| **队列积压** | 用户看到什么?系统如何降级? | +| **死信**(Dead Letter,处理失败的消息) | 失败消息去哪里?谁来修? 
| + +所以,队列不是「加了就稳定」,而是把问题从**请求延迟**转成了**异步一致性与恢复**。 + +--- + +## 三、事件系统:记录发生了什么,而不是命令别人做什么 + +事件(Event)和命令(Command)很容易混: + +| 类型 | 含义 | 例子 | 谁负责结果 | +|---|---|---|---| +| **命令 Command** | 请你做某事 | `CreateOrder`、`SendEmail` | 接收方要执行成功或失败 | +| **事件 Event** | 某事已经发生 | `OrderPaid`、`TicketLocked` | 订阅方按需反应 | + +事件系统适合跨边界传播事实: + +``` + 订单服务:订单已支付(OrderPaid) + │ + ├─ 库存服务:确认扣减 + ├─ 通知服务:发短信 + ├─ 数据平台:更新报表 + └─ 风控服务:记录行为 +``` + +事件的好处是解耦:订单服务不需要知道所有下游。但代价是: + +- 事件 schema(结构定义)一旦发布,下游会依赖,升级要兼容。 +- 下游处理失败时,事实已经发生,不能简单回滚。 +- 事件太细会淹没系统,太粗又表达不清。 +- 事件链太长,排障会变难,必须有 trace(链路追踪)。 + +--- + +## 四、Kafka、RabbitMQ、Redis Streams、NATS 该怎么理解 + +别先背产品名,先看通信语义: + +| 类型 | 常见代表 | 更像什么 | 适合 | +|---|---|---|---| +| **任务队列** | RabbitMQ、Celery、Sidekiq | 派活给 worker | 后台任务、邮件、图片处理 | +| **日志型事件流** | Kafka、Pulsar | 可回放的事实日志 | 事件总线、数据同步、审计、流处理 | +| **轻量消息/流** | Redis Streams、NATS | 简单快速的异步通道 | 中小规模异步、低延迟内部消息 | +| **云托管队列** | SQS、Pub/Sub | 少运维的可靠队列 | 云上业务、团队不想自运维 | + +选型时问四件事: + +1. 需要消息**可回放**吗?需要就偏事件流。 +2. 需要复杂路由和投递确认吗?任务队列更合适。 +3. 团队能不能运维集群?不能就优先托管。 +4. 消息是不是核心审计事实?是的话,持久化、保留周期、schema 治理都要严肃对待。 + +--- + +## 五、Outbox:别让数据库事务和消息发送各干各的 + +最经典的坑: + +``` + 1. 写订单成功 + 2. 发送 OrderCreated 消息失败 + 结果:主库里有订单,下游永远不知道 +``` + +或反过来: + +``` + 1. 消息发出成功 + 2. 
写订单失败 + 结果:下游收到一个不存在的订单 +``` + +Outbox(发件箱模式)的做法是: + +``` + 同一个本地事务: + 写业务表 + 写 outbox 表 + │ + ▼ + 后台投递器扫描 outbox → 发消息 → 标记已投递 +``` + +它不让「写事实」和「发事件」分裂。代价是多了一张表、一个投递器、幂等和重试逻辑,但换来的是跨服务一致性可控。这正是 [11 章](11-数据一致性工程.md) 的核心套路。 + +--- + +## 六、一个选型模板 + +```md +### ADR-029:文档入库使用队列削峰,索引事件使用 Kafka + +- 背景:企业知识库上传高峰会同时触发解析、切块、向量化和索引,同步处理导致上传接口超时。 +- 选择:上传接口只保存原文和元数据,写入任务队列;解析完成后发布 DocumentIndexed 事件到 Kafka,供搜索、审计和报表订阅。 +- 放弃:用户不能立刻搜索到刚上传文档,允许 1-3 分钟索引延迟。 +- 换来:上传链路稳定,后台处理可限速、重试、扩容,下游通过事件解耦。 +- 风险:队列积压会影响可搜索时间;需要积压告警、死信处理和幂等消费者。 +``` + +--- + +## 🎯 随堂检验 + + + + + +--- + +## 本章小结 + +- **缓存解决读热点**:它应该可重建,不能偷偷变成事实源。 +- **消息队列解决洪峰和异步任务**:它把同步延迟问题换成异步一致性、积压和恢复问题。 +- **事件系统传播事实变化**:事件是「发生了什么」,不是「命令别人做什么」。 +- **选产品先选语义**:任务队列、事件流、轻量消息、云托管队列解决的问题不同。 +- **Outbox 是跨服务一致性的基本功**:写业务事实和写待发送事件要在同一个本地事务里完成。 + +> **承上启下**:缓存、队列、事件解决的是服务背后的压力与协作。下一章 [30 · API 与服务通信选型](30-API与服务通信选型.md),我们看服务之间正面怎么说话:REST、gRPC、GraphQL、Webhook、事件 API,到底各自适合什么边界。 + +--- + +## 相关链接 + +- 方法论本体:[11 · 数据一致性工程](11-数据一致性工程.md) · [12 · 为失败而设计](12-为失败而设计.md) · [13 · 规模化的力学](13-规模化的力学.md) +- 模板对照:[通知 / 推送系统](../templates/notification-system/README.md) · [在线票务 / 抢票](../templates/online-ticketing/README.md) · [RAG 知识库](../templates/rag-knowledge-base/README.md) +- 案例对照:[StarArena](../cases/stararena-ticketing/README.md) · [DocuMind](../cases/documind-rag/README.md) · [FeedStream](../cases/feedstream-content/README.md) diff --git "a/tutorial/30-API\344\270\216\346\234\215\345\212\241\351\200\232\344\277\241\351\200\211\345\236\213.md" "b/tutorial/30-API\344\270\216\346\234\215\345\212\241\351\200\232\344\277\241\351\200\211\345\236\213.md" new file mode 100644 index 0000000..d5c1aab --- /dev/null +++ "b/tutorial/30-API\344\270\216\346\234\215\345\212\241\351\200\232\344\277\241\351\200\211\345\236\213.md" @@ -0,0 +1,187 @@ +# 30 · API 与服务通信选型 + +> 一句话点题:**API 不是「REST 还是 gRPC」的格式选择,而是边界选择。同步还是异步、内部还是外部、强契约还是灵活查询、一次请求还是持续流,这些才决定你该怎么通信。** + +--- + +> **🧰 技术栈选型篇第 4 章 · 本章只练一件事** +> +> [04 章](04-十大核心架构模式.md) 讲过分层、微服务、事件驱动;[29 
章](29-缓存消息队列与事件系统选型.md) 讲过异步。现在我们把视角放到服务边界上:两个系统一旦要说话,你就必须选择通信方式、契约、版本、失败处理和权限边界。 + +--- + +## 开场:通信方式决定耦合方式 + +同样是「订单告诉库存扣减」,可以有很多做法: + +``` + A. 订单同步调用库存 REST API + B. 订单同步调用库存 gRPC + C. 订单发布 OrderCreated 事件,库存异步消费 + D. 库存暴露 GraphQL,订单按需查询 + E. 库存回调订单 Webhook +``` + +每种都能工作,但耦合方式完全不同: + +- 同步调用:结果清楚,但调用方会被被调方拖慢或拖死。 +- 异步事件:解耦,但结果不立即可知,一致性更复杂。 +- GraphQL:客户端灵活,但服务端治理和性能更难。 +- Webhook:适合外部通知,但重试、签名、幂等必须做。 + +> **架构判断:**先定交互语义,再定协议。不要先说「我们用 gRPC」,而要先说「这条链路是否必须同步知道结果」。 + +--- + +## 一、第一把刀:同步还是异步 + +| 通信方式 | 适合 | 代价 | +|---|---|---| +| **同步请求/响应** | 用户正在等结果、需要立即校验、失败要立刻反馈 | 调用链变长,尾延迟(P99)叠加,依赖故障会扩散 | +| **异步消息/事件** | 可稍后完成、要削峰、多个下游订阅 | 状态推进复杂,需要幂等、补偿、积压处理 | +| **流式通信**(Streaming) | 持续输出、实时状态、长任务进度 | 连接管理、背压(Backpressure,下游处理不过来时减速)、断线恢复 | + +经验规则: + +``` + 用户必须马上知道「能不能继续」 → 同步 + 用户只需要知道「已经受理」 → 异步 + 用户要持续看见变化 → 流式 +``` + +比如抢票系统([案例 01 StarArena](../cases/stararena-ticketing/README.md)):「能不能进入等候室」要同步;「出票通知」可以异步;「排队位置变化」适合流式或轮询。 + +--- + +## 二、REST、gRPC、GraphQL 不是谁替代谁 + +| 方式 | 更适合 | 不适合 | +|---|---|---| +| **REST**(基于资源的 HTTP API) | 对外开放 API、普通 Web / SaaS、易调试、生态通用 | 高频内部调用、强类型契约要求极高的场景 | +| **gRPC**(高性能远程过程调用) | 内部服务间调用、低延迟、高吞吐、强 IDL(接口定义语言) | 浏览器直连、公开 API、调试门槛低的需求 | +| **GraphQL**(客户端按需查询) | 多端聚合查询、字段变化频繁、前端需要灵活组合 | 写操作复杂、缓存/权限/限流治理弱的团队 | +| **Webhook**(反向回调) | 第三方事件通知、支付回调、外部集成 | 需要同步强结果的核心链路 | +| **MCP**(Model Context Protocol,模型上下文协议) | 给 AI Agent 暴露工具、资源和上下文 | 普通业务服务间通信,或没有 Agent 语义的场景 | + +重点不是「哪个先进」,而是边界: + +- 对外 API 要优先易理解、稳定、可版本化。 +- 内部高频调用可以优先强契约和性能。 +- 前端多端聚合可以考虑 GraphQL,但要有治理能力。 +- 第三方回调必须考虑签名、幂等、重放攻击。 +- Agent 工具接口要把权限和人审写进协议边界([23 章](23-规格即架构约束怎么写给AI.md))。 + +--- + +## 三、契约比协议更重要 + +API 最大的风险不是 HTTP 还是 protobuf,而是**契约不清楚**: + +| 契约点 | 要写清楚 | +|---|---| +| **输入输出** | 字段含义、必填/可选、单位、枚举 | +| **错误语义** | 哪些可重试?哪些是用户错误?哪些是系统错误? | +| **幂等性** | 同一个请求重放两次会怎样?幂等键在哪里? | +| **版本策略** | 字段怎么新增/废弃?旧客户端多久兼容? | +| **限流与配额** | 谁可以调多少?超过后返回什么? | +| **安全边界** | 鉴权、授权、签名、审计怎么做? 
| + +没有契约治理,REST 会变成乱七八糟的 URL,GraphQL 会变成随意暴露数据库,gRPC 会变成强类型的泥球。 + +--- + +## 四、内部通信:别让调用链无限变长 + +微服务系统最常见的性能问题不是单个服务慢,而是调用链扇出: + +``` + 用户请求 + └─ A + ├─ B + │ ├─ D + │ └─ E + └─ C + ├─ F + └─ G +``` + +每多一跳,都会增加: + +- 网络延迟。 +- 超时和重试风暴。 +- 依赖故障传播。 +- trace 排障成本。 + +所以内部 API 要配套三件事: + +1. **超时预算**:上游 500ms,不能给每个下游都 500ms。 +2. **重试纪律**:只重试幂等请求,加退避和抖动。 +3. **降级策略**:非核心依赖失败时,返回部分结果或兜底。 + +这和 [12 章](12-为失败而设计.md) 的韧性工程是同一个问题。 + +--- + +## 五、外部 API:稳定性比优雅更重要 + +对外 API 一旦发布,就是承诺。要特别关注: + +- **向后兼容**:新增字段通常安全,删除/改含义危险。 +- **错误码稳定**:客户会写逻辑依赖你的错误语义。 +- **文档与示例**:外部开发者看不懂,再优雅也没用。 +- **签名与防重放**:特别是支付、Webhook、Agent 工具调用。 +- **审计与速率限制**:出问题后能追踪,被滥用时能限制。 + +如果是平台型产品,API 不是实现细节,而是产品的一部分。它的版本和兼容策略就是架构边界。 + +--- + +## 六、一个选型 ADR + +```md +### ADR-030:内部订单与库存使用 gRPC,支付回调用 Webhook + 幂等 + +- 背景:订单与库存都在内部网络,调用频繁且需要强契约;支付来自第三方,只能由对方异步通知。 +- 选择:内部订单-库存使用 gRPC + protobuf 定义契约;外部支付结果通过 Webhook 接收,签名校验后按 payment_id 幂等推进状态机。 +- 放弃:内部接口不直接暴露给浏览器;支付结果不追求同步完成。 +- 换来:内部链路契约清晰、性能稳定;外部集成符合支付系统的异步现实。 +- 风险:Webhook 重放和乱序需要处理;内部调用链要设置超时预算和 trace。 +``` + +--- + +## 🎯 随堂检验 + + + + + +--- + +## 本章小结 + +- **通信方式决定耦合方式**:同步、异步、流式分别适合不同交互语义。 +- **REST、gRPC、GraphQL、Webhook、MCP 各有边界**:不是替代关系,而是面向不同场景。 +- **契约比协议更重要**:字段、错误、幂等、版本、限流、安全必须写清楚。 +- **内部调用要防扇出和级联失败**:超时预算、重试纪律、降级、trace 是基本功。 +- **外部 API 是产品承诺**:兼容、文档、错误语义、签名和审计都要严肃对待。 + +> **承上启下**:到这里,我们已经选了服务怎么写、数据怎么放、中间层怎么协作、服务怎么通信。下一章 [31 · 云原生与部署平台选型](31-云原生与部署平台选型.md),问题变成:这些东西到底部署在哪里,谁来扩容、发布、隔离和兜底? 
+ +--- + +## 相关链接 + +- 方法论本体:[04 · 十大核心架构模式](04-十大核心架构模式.md) · [12 · 为失败而设计](12-为失败而设计.md) · [16 · 安全与多租户架构](16-安全与多租户架构.md) +- AI 协同:[23 · 规格即架构](23-规格即架构约束怎么写给AI.md) · [26 · 协作决策树](26-协作决策树何时vibe何时spec-first.md) +- 案例对照:[StarArena](../cases/stararena-ticketing/README.md) · [CodePilot](../cases/codepilot-agent/README.md) · [SyncRoom](../cases/syncroom-collaboration/README.md) diff --git "a/tutorial/31-\344\272\221\345\216\237\347\224\237\344\270\216\351\203\250\347\275\262\345\271\263\345\217\260\351\200\211\345\236\213.md" "b/tutorial/31-\344\272\221\345\216\237\347\224\237\344\270\216\351\203\250\347\275\262\345\271\263\345\217\260\351\200\211\345\236\213.md" new file mode 100644 index 0000000..8404a4d --- /dev/null +++ "b/tutorial/31-\344\272\221\345\216\237\347\224\237\344\270\216\351\203\250\347\275\262\345\271\263\345\217\260\351\200\211\345\236\213.md" @@ -0,0 +1,191 @@ +# 31 · 云原生与部署平台选型 + +> 一句话点题:**云原生(Cloud Native)不是「上 Kubernetes」,而是把部署、扩缩、回滚、观测、故障恢复做成一套可重复的工程能力。部署平台选型,本质是在问:你的团队现在值不值得为这些能力支付复杂度。** + +--- + +> **🧰 技术栈选型篇第 5 章 · 本章只练一件事** +> +> [30 章](30-API与服务通信选型.md) 讲服务之间怎么通信。本章往下看一层:服务到底跑在哪里,怎么发布、怎么扩容、怎么回滚、谁来值班。小系统要的是少操心,大系统要的是可控和自治;选错平台,每天都在交复杂度税。 + +--- + +## 开场:你选的不是机器,是运维模型 + +新人常问: + +``` + 用虚拟机,还是容器? + 用 Serverless,还是 Kubernetes? + 自建,还是上云? +``` + +架构师会先问另一组问题: + +- 谁负责发布? +- 谁负责扩容? +- 谁负责告警? +- 新版本坏了谁能回滚? +- 证书、密钥、配置、日志、指标、权限谁管? +- 出事时团队看不看得懂平台? 
+ +所谓部署平台,不是一台机器或一个控制台,而是**从代码变成线上服务的整条路径**:构建、配置、密钥、发布、健康检查、流量切换、自动扩缩、日志指标、故障回滚。 + +> **判断要点:**平台越省心,通常越少控制权、越可能有厂商绑定;平台越可控,通常越吃团队运维能力。没有高级低级,只有这笔复杂度现在值不值。 + +--- + +## 一、四个台阶:不是成熟度排名,是复杂度价格表 + +可以把常见部署形态粗略看成四层: + +``` + PaaS(平台即服务) / 托管应用平台 + → 最省心,适合 MVP / 小团队 / 标准 Web 应用 + + 托管容器平台(Managed Containers) + → 仍然省心,但有容器和服务边界 + + Serverless(无服务器计算) + → 适合事件驱动、突发流量、后台任务 + + Kubernetes / K8s(容器编排平台) + → 控制力最强,也最吃平台能力 +``` + +不要把这四层理解成「越往后越高级」。一个三人团队用 PaaS 稳稳上线,比自建 K8s 天天修集群更健康。反过来,如果你已经有几十个服务、多个团队、复杂流量治理,继续把所有东西塞进简单平台,也会变成瓶颈。 + +> **架构智慧:**选部署平台像选交通工具。去楼下买菜,自行车最好;跨城搬家,卡车最好。问题不在卡车是不是更先进,而在你是不是正在搬家。 + +--- + +## 二、什么时候 Kubernetes 值得上 + +Kubernetes 解决的不是「怎么运行一个容器」,而是**怎么管理一群不断变化的容器**: + +- 调度:把容器放到合适节点。 +- 扩缩:按负载增减副本。 +- 服务发现:服务实例变了,调用方仍能找到它。 +- 滚动发布:逐步替换版本。 +- 自愈:实例挂了自动拉起。 +- 资源隔离:限制 CPU、内存、权限。 +- 声明式配置:描述目标状态,平台负责逼近目标。 + +它适合的信号通常是: + +| 信号 | 说明 | +|---|---| +| 服务数量多 | 需要统一调度、发布、资源治理 | +| 多团队独立部署 | 团队之间不能互相排队发版 | +| 复杂流量治理 | 灰度、金丝雀、蓝绿、区域路由变成常态 | +| 需要混合云/私有化 | 要跨环境保持部署模型一致 | +| 有平台团队 | 有人把 K8s 封装成内部开发者平台,而不是让所有业务团队啃配置 | + +如果你只是一个标准 Web 应用 + 一个数据库 + 一个队列,K8s 多半不是收益,而是负担。你会提前买下证书、Ingress(入口流量)、网络策略、镜像仓库、集群升级、权限控制、节点资源、可观测性这一整套复杂度。 + +--- + +## 三、Serverless:不是不用管服务器,而是换了一组限制 + +Serverless(无服务器计算)的价值是: + +- 按需扩缩,低流量时成本低。 +- 适合事件触发,比如上传文件后转码、定时任务、Webhook 处理。 +- 团队少管机器,更多关注函数逻辑。 + +它的代价也明确: + +- 冷启动:长时间不用后首次请求可能慢。 +- 运行时限制:执行时间、内存、网络、依赖包大小都有边界。 +- 可观测性和本地调试更难。 +- 厂商绑定更强。 + +所以 Serverless 特别适合「短、散、事件驱动」的任务,不适合所有东西。把一个复杂长流程硬拆成几十个函数,没有工作流编排、trace 和重试纪律,会变成另一种难维护。 + +--- + +## 四、部署策略本身就是架构 + +部署平台不能只看「能不能跑」,还要看坏版本上线时怎么收场: + +| 能力 | 要回答的问题 | +|---|---| +| **健康检查** | 平台怎么知道实例真的能接流量? | +| **回滚** | 新版本坏了,能不能快速回到旧版本? | +| **渐进发布** | 能不能灰度 / 金丝雀(Canary) / 蓝绿(Blue-Green)切流量? | +| **配置与密钥** | 配置、密码、证书是否和代码分离,并可审计? | +| **基础设施即代码**(IaC) | 线上资源是否可版本化、可审查、可重建? 
| + +这就是为什么 GitOps(以 Git 作为运维事实源)重要:它把「线上应该长什么样」从人工点控制台,变成版本化、可审查、可回滚的声明。对小团队,这可能只是一份简单配置;对大组织,会变成平台工程的黄金路径。 + +> **架构智慧:**部署不是最后一步,部署是架构的一部分。一个不能快速回滚、不能定位版本、不能解释配置来源的平台,会把每次上线都变成赌博。 + +--- + +## 五、最稳的演进路径 + +``` + MVP / 单体阶段: + 托管应用平台 + 托管数据库 + 简单 CI/CD + + 多服务阶段: + 容器化 + 托管容器平台 + 标准日志/指标/密钥 + + 多团队阶段: + 托管 K8s + 平台团队 + GitOps + 服务目录 + 权限治理 + + 强监管 / 私有化 / 混合云: + K8s 或私有云平台,但必须接受更高运维成本 +``` + +这条路径和 [04 章](04-十大核心架构模式.md) 的「单体优先」、[15 章](15-组织即架构.md) 的「平台工程」是同一件事:**先让业务跑起来,再把反复出现的运维复杂度收敛成平台能力。** + +--- + +## 六、部署平台选型表 + +| 判断信号 | 更倾向的选择 | 为什么 | 警惕代价 | +|---|---|---|---| +| 1–5 人团队,MVP,标准 Web 应用 | PaaS / 托管应用平台 | 少操心,发布链路短,把时间留给业务验证 | 控制权少,迁移成本可能较高 | +| 单体或少量服务,需要环境一致性 | 托管容器平台 | 解决「我电脑能跑线上不能跑」,但不吞下 K8s 全套复杂度 | 仍要处理镜像、配置、日志、健康检查 | +| 多服务、多团队、独立部署 | 托管 Kubernetes | 统一调度、扩缩、发布、隔离和生态 | 没平台团队时,复杂度压到业务团队 | +| 事件驱动、突发流量、后台任务 | Serverless + 队列 | 按需扩缩,不用长期养机器 | 冷启动、限制、可观测性和厂商绑定 | +| 强监管、私有化、混合云 | 私有云 / 自管 K8s | 满足数据边界、合规、可移植性 | 运维成本最高,需要专职平台能力 | + +--- + +## 🎯 随堂检验 + + + + + +--- + +## 本章小结 + +- **云原生不是工具清单**:核心是自动化、弹性、可恢复、可观测、可重复发布。 +- **部署平台选的是运维模型**:省心 vs 控制权,简单 vs 可治理,低门槛 vs 平台能力。 +- **Kubernetes 不是默认答案**:它适合多服务、多团队、复杂治理;小团队过早上 K8s 往往是过度设计。 +- **部署策略是架构的一部分**:健康检查、回滚、灰度、密钥、IaC/GitOps 决定出事时能不能收场。 +- **从简单开始,按信号升级**:先托管、再容器、再平台化,让真实痛点决定复杂度投入。 + +> **承上启下**:部署平台负责让系统跑起来、发得出去、坏了能退。下一章 [32 · 可观测性与可靠性技术栈选型](32-可观测性与可靠性技术栈选型.md),我们补上另一半:系统跑起来之后,你能不能看得见、叫得准、救得回。 + +--- + +## 相关链接 + +- 方法论本体:[04 · 十大核心架构模式](04-十大核心架构模式.md) · [06 · 质量属性与取舍](06-质量属性与取舍.md) · [12 · 为失败而设计](12-为失败而设计.md) +- 组织配套:[15 · 组织即架构](15-组织即架构.md) · [24 · 审查清单](24-审查清单AI产出默认缺什么.md) +- 案例对照:[PatchDesk](../cases/patchdesk-saas/README.md) · [CodePilot](../cases/codepilot-agent/README.md) diff --git "a/tutorial/32-\345\217\257\350\247\202\346\265\213\346\200\247\344\270\216\345\217\257\351\235\240\346\200\247\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" 
"b/tutorial/32-\345\217\257\350\247\202\346\265\213\346\200\247\344\270\216\345\217\257\351\235\240\346\200\247\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" new file mode 100644 index 0000000..7c5dbc1 --- /dev/null +++ "b/tutorial/32-\345\217\257\350\247\202\346\265\213\346\200\247\344\270\216\345\217\257\351\235\240\346\200\247\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" @@ -0,0 +1,171 @@ +# 32 · 可观测性与可靠性技术栈选型 + +> 一句话点题:**可观测性(Observability)不是装一堆监控大屏,而是在系统出问题时,你能不能用证据回答:谁受影响、哪里变慢、为什么变慢、该不该叫醒人。可靠性技术栈选型,是在给线上系统装神经系统和免疫系统。** + +--- + +> **🧰 技术栈选型篇第 6 章 · 本章只练一件事** +> +> [31 章](31-云原生与部署平台选型.md) 讲怎么部署。本章讲上线以后怎么知道系统是不是健康,以及不健康时怎么收场。不要从 Prometheus、Grafana、ELK、OpenTelemetry 这些名字开始,要从 SLO(服务等级目标)开始。 + +--- + +## 开场:监控和可观测性不是一回事 + +Monitoring(监控)回答的是: + +> 我提前知道要看什么,现在它有没有越线? + +例如 CPU 超过 90%、接口错误率超过 5%、队列积压超过 1 万。 + +Observability(可观测性)回答的是: + +> 我不知道问题会从哪里冒出来,但系统留下的证据够不够我追下去? + +例如某个用户下单慢,请求穿过网关、订单、库存、支付、第三方接口,到底卡在哪一跳?是某个租户、某个版本、某个可用区,还是某个数据库索引? + +> **判断要点:**小系统靠监控也能活;分布式系统只靠监控会瞎。服务越多、调用链越长、版本越频繁,越需要可观测性,而不只是更多仪表盘。 + +--- + +## 一、从 SLO 倒推,别从工具倒推 + +可靠性技术栈的第一步是定义三件事: + +``` + SLI(Service Level Indicator,服务等级指标) + → 你测什么:成功率、P99 延迟、错误率、可用性? + + SLO(Service Level Objective,服务等级目标) + → 你内部承诺做到多少:99.9% 请求 < 300ms? 
+ + Error Budget(错误预算) + → 允许失败的额度:预算没烧光,可以继续发版;烧光了,先修稳定性 +``` + +这套语言的价值,是把「系统稳不稳」从感觉变成可讨论的数字。告警也应该从 SLO 倒推: + +> 用户真的受影响了,才值得叫醒人。 + +CPU 高、内存高、线程多,都只是原因候选。如果它们没有影响用户旅程,就不该直接变成半夜电话。 + +--- + +## 二、三类证据:指标、日志、链路 + +可观测性常说三类信号: + +| 信号 | 适合回答 | 代价 | +|---|---|---| +| **Metrics 指标** | 系统整体趋势:QPS、错误率、P95/P99 延迟、队列深度 | 便宜、适合告警,但细节少 | +| **Logs 日志** | 单个事件细节:订单为什么失败、鉴权为什么拒绝 | 细节多,但成本和噪音高 | +| **Traces 链路追踪** | 一个请求跨服务的完整路径和每跳耗时 | 对分布式定位很强,但采样和上下文传播要设计 | + +OpenTelemetry / OTel(开放遥测标准与工具集)的价值在这里:它尽量把埋点和后端存储/查询解耦。你先用相对标准的方式生成遥测数据,以后后端从开源换到商业、从 A 厂商换到 B 厂商,迁移成本会小一些。 + +> **判断要点:**工具可以换,埋点习惯很难换。先把 trace id、结构化日志、关键业务指标这些「证据格式」打好,比先纠结大屏配色重要。 + +--- + +## 三、可靠性不止看见,还要收场 + +很多团队买了可观测性工具,可靠性却没有变好,原因是: + +> 看见问题不等于能处理问题。 + +可靠性还需要响应链路: + +``` + 告警(Alerting) → 只叫醒能行动的人 + 值班(On-call) → 明确谁负责响应 + Runbook(处置手册) → 告警来了第一步做什么 + 事故管理(Incident) → 分级、沟通、升级、复盘 + 发布治理 → 灰度、回滚、功能开关、熔断降级 +``` + +所以本章的选型不能只选「监控后端」,还要选**事故流程**。严肃系统至少要做到:告警可行动、服务有 owner(负责人)、关键告警有 runbook、事故后有复盘,复盘产出能回写到告警、代码或平台。 + +--- + +## 四、告警:少而准,别制造噪音 + +低质量告警长这样: + +- CPU 高了。 +- 内存高了。 +- 磁盘用了 80%。 +- 线程数变多了。 + +这些都是线索,不一定是事故。更好的告警围绕用户症状: + +| 用户症状 | 可行动告警 | +|---|---| +| 登录失败 | 登录成功率低于 SLO | +| 下单慢 | 下单 P99 延迟连续 10 分钟超过目标 | +| 消息发不出去 | 队列积压导致通知延迟超过承诺 | +| 搜索不可用 | 搜索错误率和空结果率异常 | + +> **架构智慧:**别告警「机器不舒服」,要告警「用户正在受伤」。否则你会养出一套噪音系统,最后大家对真正事故也麻木。 + +--- + +## 五、按成熟度选栈 + +| 阶段 | 倾向的栈 | 目标 | +|---|---|---| +| **MVP / 小团队** | 托管日志 + 错误追踪 + uptime 探测 + 少量核心指标 | 出事有人知道,能找到大概原因 | +| **标准线上系统** | 指标 + 结构化日志 + 关键链路追踪 + SLO 告警 + runbook | 用户受影响时能定位、能响应、能回滚 | +| **多服务 / 多团队** | OpenTelemetry + 指标/日志/链路统一关联 + 服务目录 + owner | 跨团队协作时不靠喊人破案 | +| **高可靠关键链路** | SLO 平台 + 金丝雀分析 + 合成监控 + 事故演练 | 提前发现退化,限制爆炸半径,缩短 MTTR(平均恢复时间) | + +注意成本:日志全量保留、trace 全量采样、高基数标签(取值种类极多的标签)都会迅速烧钱。可观测性不是采得越多越好,而是**为关键问题留下足够证据**。 + +--- + +## 六、选型表 + +| 场景信号 | 更倾向的栈 | 选它的理由 | 警惕代价 | +|---|---|---|---| +| MVP / 内部工具 | 托管日志 + 错误追踪 + uptime 探测 | 快速闭环,出事能知道 | 不要一开始自建全套平台 | +| 标准 Web 应用 | 指标监控 + 结构化日志 + SLO 告警 | 能看错误率、延迟、核心业务是否受影响 | 告警规则要少而准 | +| 微服务 / 多团队 | OTel +
Metrics/Logs/Traces 统一关联 | 跨服务定位问题,减少找人时间 | 埋点规范、采样、owner 维护都要治理 | +| 支付 / 交易 / 核心链路 | SLO + 错误预算 + 金丝雀发布 + runbook + 事故管理 | 把可靠性变成可度量、可响应、可复盘 | 成本高,需要值班纪律和组织承诺 | +| 数据量巨大 / 成本敏感 | 分层存储 + 采样 + 聚合指标 + 限制高基数标签 | 保留排障能力,控制账单 | 采样过度会丢关键证据 | + +--- + +## 🎯 随堂检验 + + + + + +--- + +## 本章小结 + +- **可观测性不是大屏**:它是系统留下足够证据,让你能追问未知问题。 +- **从 SLO 倒推技术栈**:先定义用户眼里的好,再决定指标、日志、链路和告警。 +- **三类证据各有用处**:Metrics 看趋势和告警,Logs 看细节,Traces 看跨服务路径。 +- **可靠性还需要响应流程**:告警、值班、runbook、事故分级、复盘、回滚,缺一块都会断。 +- **按成熟度投入**:小团队先闭环,大系统再统一标准和平台化;采集不是越多越好,证据要够用且付得起。 + +> **承上启下**:通用系统需要看得见和救得回,AI 系统还多了模型、上下文、检索质量、成本和评测这些新变量。下一章 [33 · AI 基础设施技术栈选型](33-AI基础设施技术栈选型.md),我们把技术栈选型推进到 LLM 时代。 + +--- + +## 相关链接 + +- 方法论本体:[06 · 质量属性与取舍](06-质量属性与取舍.md) · [12 · 为失败而设计](12-为失败而设计.md) · [13 · 规模化的力学](13-规模化的力学.md) +- AI 协同:[24 · 审查清单](24-审查清单AI产出默认缺什么.md) · [25 · 评测驱动](25-评测驱动把够好写进架构.md) +- 案例对照:[StarArena](../cases/stararena-ticketing/README.md) · [SyncRoom](../cases/syncroom-collaboration/README.md) · [CodePilot](../cases/codepilot-agent/README.md) diff --git "a/tutorial/33-AI\345\237\272\347\241\200\350\256\276\346\226\275\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" "b/tutorial/33-AI\345\237\272\347\241\200\350\256\276\346\226\275\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" new file mode 100644 index 0000000..f8c267e --- /dev/null +++ "b/tutorial/33-AI\345\237\272\347\241\200\350\256\276\346\226\275\346\212\200\346\234\257\346\240\210\351\200\211\345\236\213.md" @@ -0,0 +1,182 @@ +# 33 · AI 基础设施技术栈选型 + +> 一句话点题:**AI 基础设施选型不是把热门工具全接一遍,而是先看清你的稀缺资源是什么:模型能力、GPU、上下文、检索质量、成本,还是可控性。选型的核心不是工具名,而是用最少组件把 AI 系统的风险关进笼子里。** + +--- + +> **🧰 技术栈选型篇第 7 章 · 本章只练一件事** +> +> [17 章](17-大模型时代的架构判断.md) 讲 AI 系统多了非确定性、上下文、Agentic 三个硬骨头;[22 章](22-AI原生系统设计.md) 讲 AI 原生系统怎么设计。本章专门回答:这些系统背后的 AI 技术栈该怎么选,什么时候托管 API 就够,什么时候才值得下沉到模型网关、自建推理、GPU、评测平台。 + +--- + +## 开场:你到底在自建什么 + +一听 AI Infrastructure(AI 基础设施),很多人脑子里马上冒出: + +- GPU(图形处理器,训练/推理常用计算资源) +- 向量数据库 +- Agent 框架 +- 模型网关 +- 推理引擎 +- 评测平台 + 
+但架构判断的第一步不是列清单,而是问: + +> 你是真的在建基础设施,还是只是在做一个 AI 应用? + +如果你只是做早期产品,MVP 阶段最合理的默认答案通常是:调用托管模型 API,加最小日志和成本监控,先把业务闭环跑通。只有当你遇到明确触发信号,比如成本失控、数据不能出域、延迟不达标、供应商单点不可接受、模型需要深度定制,才值得往下沉到网关、自建推理、GPU 池这些重型组件。 + +> **架构智慧:**AI 基础设施不是越底层越高级。越底层,你拿回的控制权越多,同时也接走了成本、容量规划、故障恢复、安全隔离和团队运维能力的账单。 + +--- + +## 一、AI 栈的四层 + +一套生产级 AI 系统,可以先粗略拆成四层: + +``` + 入口治理层: + AI Gateway(模型网关) / 鉴权 / 限流 / 成本记录 / 模型路由 + + 上下文层: + RAG(检索增强生成) / 向量库 / 文档权限 / 重排 / 引用 + + 推理层: + Inference Serving(推理服务) / GPU / KV Cache(生成中间状态缓存) / 批处理 + + 守门层: + Observability(可观测性) / Eval(评测) / Trace(调用链追踪) / 人工审批 +``` + +这四层不是一开始都要有。正确顺序是:**先把不可省的风险看见,再补对应的层。** + +- 看不见成本,先补网关或用量日志。 +- 检索质量决定答案上限,先补 RAG eval。 +- 多个团队都在调模型,再补统一模型网关。 +- GPU 成本超过 API 成本,再考虑自建推理。 +- Agent 能执行副作用,必须补权限、人审和审计。 + +--- + +## 二、API 还是自建推理:这是成本账,不是面子账 + +| 方式 | 优势 | 代价 | +|---|---|---| +| **托管模型 API** | 快、稳定、少运维、模型更新快 | 供应商锁定、数据路径受限、规模上来后单位成本可能贵 | +| **自建推理服务** | 控制模型、数据、成本结构和部署环境 | 要管理 GPU、显存、批处理、扩缩容、故障、容量 | +| **混合路由** | 简单任务走便宜模型,复杂任务走强模型 | 路由策略、评测、回退和成本核算更复杂 | + +这个分叉不要问「哪个更先进」,要问四个问题: + +1. 数据能不能出域? +2. 延迟目标托管 API 能不能满足? +3. 调用量大到自建更便宜了吗? +4. 团队会不会运营 GPU 服务? + +四个问题里只中一个,通常还不足以上自建;同时命中两三个,才说明基础设施下沉有意义。 + +--- + +## 三、RAG、长上下文、微调:先分清你在补知识还是补行为 + +RAG、Long Context(长上下文)、Fine-tuning(微调)经常被混着比较,但它们解决的不是同一个问题: + +| 路线 | 解决什么 | 适合 | +|---|---|---| +| **RAG** | 回答时检索外部知识 | 资料多、常更新、要引用来源、要权限过滤 | +| **长上下文** | 一次塞入大量材料 | 材料不多、临时任务、上下文放得下 | +| **微调** | 改变稳定行为、风格、格式 | 输出格式固定、领域风格稳定、样本质量高 | + +小白最容易犯的错是:检索没做好就怪模型笨,或者把本该用 RAG 解决的「知识更新」问题拿去微调。 + +> **判断句:**知识用检索,行为才谈微调。要引用、要更新、要权限,优先把 RAG 做对。 + +--- + +## 四、Agent 框架:先工作流,再自主 + +Agent 框架看起来很强,但第一个问题仍然是 [22 章](22-AI原生系统设计.md) 的判断: + +``` + 能用确定工作流解决? + ├─ 能 → 工作流优先 + └─ 不能 → Agent,但必须配权限、预算、人审、trace、eval +``` + +如果流程固定,例如「检索订单 → 判断是否符合退款规则 → 调退款接口 → 发通知」,不要急着上自主 Agent。工作流更可预测、可测试、可审计。只有当步骤开放、工具多、任务目标需要动态规划,Agent 才开始有意义。 + +Agent 选型要重点看: + +- 工具权限能否分级? +- 是否支持人工审批? +- 是否能记录每一步 trace? +- 是否能限制预算和最大步数? +- 是否能做上下文压缩和任务恢复? 
+ +--- + +## 五、守门层不是锦上添花,是生产门槛 + +AI 系统和传统系统最大的不同,是输出不稳定、质量会漂移。上线后只看接口成功率不够,还要看: + +- prompt 和上下文是什么? +- 检索到了哪些片段? +- 模型调用成本多少? +- 工具调用有没有越权? +- 最终答案有没有引用、有没有胡编? +- 换模型或改提示后质量有没有退化? + +这就是 [25 章](25-评测驱动把够好写进架构.md) eval 的价值。如果系统已经碰钱、碰用户数据、碰自动操作,eval 不是「以后再补」,而是架构的一部分。没有 eval,每次换模型、改提示、换检索策略,本质上都是闭眼上线。 + +--- + +## 六、AI 技术栈选型表 + +| 判断问题 | 起步选择 | 触发升级 | 代价提醒 | +|---|---|---|---| +| 只是验证 AI 产品吗? | 托管模型 API + 基础日志 | 多应用共用、成本看不清、供应商故障影响大 | 先别自建 GPU | +| 多模型 / 多团队调用吗? | AI Gateway | 需要统一鉴权、限流、计费、故障转移 | 网关在关键路径上,必须高可用 | +| 需要私有知识作答吗? | RAG + 简单向量检索 | 检索质量不稳、权限复杂、知识库变大 | RAG 上限是检索质量 | +| 向量规模小吗? | pgvector / 单机向量检索 | 百万级以上、过滤复杂、延迟吃紧 | 专用向量库是新运维对象 | +| 模型调用成本高吗? | 模型路由 + 缓存 + 配额 | API 成本高于自建总成本、数据不能出域 | 自建推理要管理 GPU 和显存 | +| 需要自动行动吗? | 确定性 Workflow | 步骤开放、必须动态规划 | Agent 要配权限、预算、人审 | +| 要稳定迭代吗? | Trace + 小 eval 集 | 生产级、碰钱、换模型频繁 | 没有评测就没有可靠升级 | + +--- + +## 🎯 随堂检验 + + + + + +--- + +## 本章小结 + +- **AI 基础设施选型先看稀缺资源**:模型能力、GPU、上下文、检索质量、成本、可控性,哪一个最紧,就先围绕它设计。 +- **默认从托管 API 起步**:除非数据、成本、延迟、定制化或可用性触发升级,否则不要一开始自建推理。 +- **四层拆解更清楚**:入口治理、上下文、推理、守门。缺什么补什么,不要一次性全上。 +- **知识用 RAG,行为才谈微调**:别把检索问题误判成模型问题。 +- **生产级 AI 必须有守门层**:trace 看得见链路,eval 守得住质量。 + +> **承上启下**:到这里,技术栈的主要拼图都看过了:语言、数据、中间层、API、部署、观测、AI 基础设施。最后一章 [34 · 技术选型决策树](34-技术选型决策树.md),把它们收成一棵能照着走的判断树。 + +--- + +## 相关链接 + +- AI 方法论:[17 · 大模型时代的架构判断](17-大模型时代的架构判断.md) · [22 · AI 原生系统设计](22-AI原生系统设计.md) · [25 · 评测驱动](25-评测驱动把够好写进架构.md) +- 模板对照:[AI 网关](../templates/ai-gateway/README.md) · [模型推理服务](../templates/inference-serving/README.md) · [RAG 知识库](../templates/rag-knowledge-base/README.md) · [向量数据库](../templates/vector-database/README.md) +- 案例对照:[DocuMind](../cases/documind-rag/README.md) · [CodePilot](../cases/codepilot-agent/README.md) diff --git "a/tutorial/34-\346\212\200\346\234\257\351\200\211\345\236\213\345\206\263\347\255\226\346\240\221.md" "b/tutorial/34-\346\212\200\346\234\257\351\200\211\345\236\213\345\206\263\347\255\226\346\240\221.md" new file mode 100644 index 0000000..6047868 --- /dev/null +++ 
"b/tutorial/34-\346\212\200\346\234\257\351\200\211\345\236\213\345\206\263\347\255\226\346\240\221.md" @@ -0,0 +1,196 @@ +# 34 · 技术选型决策树 + +> 一句话点题:**技术选型不是在一堆工具里挑最强的,而是沿着需求、约束、阶段、团队能力和退出成本一路剪枝。成熟的选型,不是证明某个技术好,而是证明在当前约束下,它的收益配得上代价。** + +--- + +> **🧰 技术栈选型篇第 8 章 · 本篇收束** +> +> 前 7 章分别讲了语言、数据库、缓存队列、API、部署、观测、AI 基础设施。这一章不再新增工具,而是给你一棵统一的决策树。以后遇到任何「要不要上 X」,都按这棵树走一遍。 + +--- + +## 开场:选型的根节点不是 A 还是 B + +技术选型的第一个问题,不是: + +``` + PostgreSQL 还是 MongoDB? + REST 还是 gRPC? + PaaS 还是 K8s? + API 还是自建推理? +``` + +而是: + +> 我们真的需要引入新技术吗? + +如果现有技术栈能在目标性能、成本、可靠性和交付周期内解决问题,默认沿用现有栈。任何新技术都会带来学习成本、集成成本、运维成本和未来迁移成本。 + +这和前面反复说的克制是一脉相承的:能单体就别微服务,能工作流就别 Agent,能托管 API 起步就别自建 GPU。架构师把选型理解成**为一个明确问题支付一笔明确成本**。 + +--- + +## 一、第一刀:现在处在哪个阶段 + +同一个系统,不同阶段答案完全不同: + +| 阶段 | 最缺什么 | 选型倾向 | +|---|---|---| +| **MVP** | 验证速度 | 少组件、主流栈、托管优先、低迁移成本 | +| **成长期** | 可控增长 | 可观测、灰度、边界清晰、能局部扩展 | +| **规模期** | 效率与成本 | 深度优化、平台化、单位成本、自动化治理 | +| **关键期** | 稳定与合规 | 审计、隔离、容灾、SLO、事故流程 | + +一个技术在规模期是正确答案,在 MVP 可能就是过度设计。验证需求时,优先少组件;支撑增长时,优先可控;优化规模时,才值得为单位成本、吞吐和深度定制付出复杂度。 + +--- + +## 二、第二刀:系统会先死在哪里 + +当确认现有栈不够,不要马上找工具,先定位失败模式: + +| 失败模式 | 优先看 | +|---|---| +| 数据错、状态对不上 | 数据模型、事务边界、幂等、Outbox、对账 | +| 读热点打爆主库 | 缓存、读模型、CDN、限流 | +| 写洪峰压垮后端 | 队列、背压、削峰、异步状态 | +| P99 被扇出放大 | API 边界、超时预算、降级、trace | +| 发布容易出事故 | 部署平台、灰度、回滚、配置治理 | +| 出事定位困难 | 指标、日志、链路追踪、SLO 告警 | +| AI 质量漂移 | eval、trace、RAG 评测、模型路由 | +| 团队协作卡住 | 模块边界、平台工程、服务 ownership | + +> **判断句**:工具只是答案的外壳,失败模式才是选型的题目。 + +--- + +## 三、第三刀:团队能不能养得起 + +很多技术在 benchmark(基准测试)里很好看,但你的团队不一定养得起。养得起包括: + +- 会不会部署? +- 会不会排障? +- 有没有监控? +- 线上坏了谁能修? +- 版本升级会不会炸? +- 有没有足够多的人理解它? + +一个「性能更强但没人会修」的系统,在生产里常常输给「性能够用但团队熟悉」的系统。技术选型不是实验室比赛,而是长期运营合同。要把可运维性和关键人员风险写进判断,否则选中的不是技术,是未来事故。 + +--- + +## 四、第四刀:能不能退出 + +成熟的选型一定有退出方案: + +| 技术 | 退出问题 | +|---|---| +| 新数据库 | 数据怎么迁移?双写如何校验?回滚到哪里? | +| 模型供应商 | API 能否适配?提示和 eval 能否复用? | +| 框架 | 业务逻辑是否被框架吞掉?能否分层隔离? | +| 消息系统 | topic / schema / 消费位点如何迁移? | +| 云平台 | 镜像、配置、密钥、存储、网络能否移走? 
| + +没有退出路线的选型,就是把未来绑死。重要技术进入生产前,至少要有 Spike(小实验)、灰度计划、回滚方案和 ADR。 + +--- + +## 五、统一决策树 + +``` +要不要引入新技术? + │ + ├─ 现有栈能满足目标? ── 是 ─▶ 沿用 + 局部优化 + │ + └─ 否 + │ + ├─ 现在是 MVP? ── 是 ─▶ 选最少组件、最快验证、低迁移成本 + │ + └─ 否 + │ + ├─ 失败模式是什么? + │ ├─ 数据/一致性 → 先看存储与事务边界 + │ ├─ 延迟/吞吐 → 先看缓存、批处理、扩展方式 + │ ├─ 可用性/失败 → 先看冗余、降级、隔离 + │ ├─ AI 质量 → 先看 eval、RAG、模型路由 + │ └─ 团队协作 → 先看模块边界、平台能力 + │ + └─ 候选方案能否被团队养住、能否退出? + ├─ 不能 → 换轻一点的方案 + └─ 能 → Spike 验证 → 写 ADR → 灰度采用 +``` + +--- + +## 六、技术选型 ADR 模板 + +```md +### ADR-034:引入 OpenTelemetry 统一链路追踪 + +- 背景:订单请求跨 7 个服务,P99 偶发超过 2s,只有各服务日志,定位一次问题平均 3 小时。 +- 目标:把跨服务请求路径和每跳耗时串起来,把 MTTR(平均恢复时间)降到 30 分钟以内。 +- 候选: + - 继续加日志:成本低,但无法稳定还原调用路径。 + - 引入私有追踪方案:定制强,但未来迁移困难。 + - 使用 OpenTelemetry:埋点标准化,后端可替换。 +- 选择:使用 OpenTelemetry 采集 trace,先覆盖订单、库存、支付三条关键链路。 +- 放弃:短期增加埋点工作和采样治理成本。 +- 换来:慢请求能跨服务定位,后续可接不同观测后端。 +- 复审条件:采样成本超过预算,或关键链路覆盖率低于 90%,重新评估采样策略和埋点规范。 +- 退出方案:保留标准 trace context,观测后端可替换;业务代码不绑定某个厂商 SDK。 +``` + +ADR 的重点不是格式,而是把「为什么选」和「选错怎么退」写清楚。 + +--- + +## 七、总表:每章的核心判断 + +| 章节 | 不要先问 | 先问 | +|---|---|---| +| [27](27-编程语言与后端框架选型.md) 语言/框架 | 哪个语言更先进 | 团队、生态、运行时、业务复杂度匹配吗 | +| [28](28-数据库与存储选型.md) 数据库/存储 | 哪个数据库最强 | 事实源是谁,查询形态是什么 | +| [29](29-缓存消息队列与事件系统选型.md) 缓存/队列/事件 | 要不要上 Kafka | 是读热点、时间错配,还是业务事实广播 | +| [30](30-API与服务通信选型.md) API/通信 | REST 还是 gRPC | 同步/异步、内部/外部、契约强度是什么 | +| [31](31-云原生与部署平台选型.md) 部署平台 | 要不要 K8s | 团队是否需要并养得起平台能力 | +| [32](32-可观测性与可靠性技术栈选型.md) 观测/可靠性 | 用哪个监控工具 | 用户 SLO 是什么,事故怎么收场 | +| [33](33-AI基础设施技术栈选型.md) AI 基础设施 | 要不要自建 GPU | 稀缺资源是模型、上下文、成本、质量还是可控性 | + +--- + +## 🎯 随堂检验 + + + + + +--- + +## 本章小结 + +- **选型的根节点是「要不要新技术」**:现有栈能满足目标,默认沿用。 +- **阶段决定答案**:MVP 买速度,成长期买可控,规模期买效率,关键期买稳定与合规。 +- **先定位失败模式,再比较工具**:数据、延迟、成本、质量、协作,对应的是不同问题。 +- **团队养得起才算选得上**:可运维性比纸面性能更接近生产真相。 +- **好选型必须能退出**:Spike 验证、ADR 记录、灰度采用、保留迁移路线。 + +> **技术栈选型篇收束**:这 8 章不是让你记住更多技术名,而是训练一句话:**先看约束,再选技术;先承认代价,再享受收益。** 接下来读 `templates/` 和 `cases/` 时,你可以反过来问每一个系统:它为什么是这套技术栈?如果约束换了,答案会不会变? 
+ +--- + +## 相关链接 + +- 方法论本体:[02 · 架构师的思考框架](02-架构师的思考框架.md) · [06 · 质量属性与取舍](06-质量属性与取舍.md) · [08 · 架构决策记录与演进](08-架构决策记录与演进.md) · [09 · 架构品味](09-架构品味.md) +- 练习入口:[templates/README](../templates/README.md) · [cases/README](../cases/README.md) +- 本篇回看:[27](27-编程语言与后端框架选型.md) · [28](28-数据库与存储选型.md) · [29](29-缓存消息队列与事件系统选型.md) · [30](30-API与服务通信选型.md) · [31](31-云原生与部署平台选型.md) · [32](32-可观测性与可靠性技术栈选型.md) · [33](33-AI基础设施技术栈选型.md) diff --git a/tutorial/README.md b/tutorial/README.md index eedbffd..280c3f1 100644 --- a/tutorial/README.md +++ b/tutorial/README.md @@ -18,7 +18,7 @@ ## 学习路径 -教程分六段,**建议按顺序读**,但每一章也能独立看。 +教程按连续篇章展开,**建议按顺序读**,但每一章也能独立看。 ### 第一段:建立思维(01–03)—— 换一个看系统的视角 @@ -95,6 +95,23 @@ --- +## 🧰 技术栈选型篇(27–34)—— 把「用什么技术」变成架构判断 + +**前置:入门篇 + 进阶篇。** 这一篇不做框架教程,也不做工具排行榜,而是把 [02](02-架构师的思考框架.md) 的「需求 → 约束 → 质量属性 → 取舍」落到语言、数据库、缓存、API、部署、观测和 AI 基础设施选型上。 + +| 章节 | 一句话 | +|---|---| +| [27 · 编程语言与后端框架选型](27-编程语言与后端框架选型.md) | 语言和框架不是信仰题,而是运行时、生态、团队和维护成本的组合选择。 | +| [28 · 数据库与存储选型](28-数据库与存储选型.md) | 先画数据生命周期,再决定事实源、读模型、搜索、对象存储和向量库。 | +| [29 · 缓存、消息队列与事件系统选型](29-缓存消息队列与事件系统选型.md) | 分清读热点、时间错配和业务事实广播,别把 Redis / Kafka 当银弹。 | +| [30 · API 与服务通信选型](30-API与服务通信选型.md) | 先判断同步/异步、内部/外部、契约强度,再谈 REST、gRPC、GraphQL。 | +| [31 · 云原生与部署平台选型](31-云原生与部署平台选型.md) | 云原生不是上 K8s,而是选择团队养得起的部署、扩缩、回滚模型。 | +| [32 · 可观测性与可靠性技术栈选型](32-可观测性与可靠性技术栈选型.md) | 从用户 SLO 倒推指标、日志、链路、告警、值班和事故流程。 | +| [33 · AI 基础设施技术栈选型](33-AI基础设施技术栈选型.md) | 先看稀缺资源是模型、GPU、上下文、检索质量、成本还是可控性。 | +| [34 · 技术选型决策树](34-技术选型决策树.md) | 用一棵树把「是否引入新技术、阶段、失败模式、团队能力、退出方案」串起来。 | + +--- + ## 读完这套教程,你应该能做到 - [ ] 拿到一个模糊需求,能问出对的问题,把它拆成明确的约束和质量目标。 @@ -104,6 +121,7 @@ - [ ] 做架构决策时,不再凭感觉或跟风,而是基于约束和取舍,并把理由写下来。 - [ ] *(实战篇)* 对着陌生系统或模板,能完整走一遍「读懂 → 设计 → 演进 → 迁移」。 - [ ] *(AI 协同篇)* 能把架构约束写给 AI、审查其产出,并在原型与生产之间选对协作方式。 +- [ ] *(技术栈选型篇)* 能把「用什么技术」写成可复盘的 ADR,而不是凭热度或喜好拍板。 ---