sparkplug604 · sparkplug604 · Jun 15, 2026 · Jun 15, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -116,5 +116,8 @@ praxis = "praxis.cli:main"
 [tool.setuptools]
 package-dir = {"" = "src"}
 
+[tool.setuptools.package-data]
+praxis = ["demo_data/stackoverflow_developer_survey/*"]
+
 [tool.setuptools.packages.find]
 where = ["src"]
diff --git a/setup.py b/setup.py
@@ -14,5 +14,7 @@
     package_dir={"": "src"},
     packages=find_packages(where="src"),
     python_requires=">=3.9",
+    include_package_data=True,
+    package_data={"praxis": ["demo_data/stackoverflow_developer_survey/*"]},
     entry_points={"console_scripts": ["praxis=praxis.cli:main"]},
 )
diff --git a/src/praxis/commands/demo.py b/src/praxis/commands/demo.py
@@ -4,37 +4,23 @@
 from __future__ import annotations
 
 import argparse
+import shutil
+from importlib import resources
 from pathlib import Path
 
 from praxis.cli import main as praxis_main
 from praxis.paths import default_root, research_dir
 
 
 MODULES = {
-    "core": "Capture one local source, index it, and search it with explanations.",
+    "core": "Ingest a bundled Stack Overflow survey aggregate, index it, and search it with explanations.",
     "reach": "Create one fixture GTM client, produce evidence, and build a context pack.",
     "agency": "Create two fixture clients and run one workflow across both.",
     "all": "Run the Core, Reach, and Agency demos.",
 }
 
-CORE_SOURCE = """# Praxis Core Demo Source
-
-Praxis Core turns source material into searchable, source-traceable agent knowledge.
-
-The important idea is not just retrieval. Praxis keeps raw evidence, summaries,
-chunks, source IDs, hashes, graph links, confidence metadata, conflict warnings,
-and rollback records attached to the knowledge agents use.
-
-This demo source exists so a new user can see the Core loop without needing a
-web page, API key, or private corpus.
-
-## What An Agent Can Reuse
-
-- Capture useful sources during real work.
-- Search them later with semantic, keyword, and graph signals.
-- Inspect why a result matched before trusting it.
-- Promote selected knowledge into reusable instructions or references.
-"""
+CORE_DEMO_DATA = ("demo_data", "stackoverflow_developer_survey")
+CORE_DEMO_SOURCE_ID = "src:stackoverflow-dev-survey-ai-tooling-mini"
 
 
 def run_step(root: Path, label: str, args: list[str]) -> int:
@@ -46,17 +32,24 @@ def run_step(root: Path, label: str, args: list[str]) -> int:
     return int(code)
 
 
-def write_core_demo_source(root: Path) -> Path:
-    path = research_dir(root) / "demo_sources" / "praxis-core-demo.md"
-    path.parent.mkdir(parents=True, exist_ok=True)
-    path.write_text(CORE_SOURCE, encoding="utf-8")
-    return path
+def copy_core_demo_dataset(root: Path) -> Path:
+    target = research_dir(root) / "demo_sources" / "stackoverflow_developer_survey"
+    if target.exists():
+        shutil.rmtree(target)
+    target.mkdir(parents=True, exist_ok=True)
+    data_root = resources.files("praxis").joinpath(*CORE_DEMO_DATA)
+    for item in data_root.iterdir():
+        if item.is_file():
+            (target / item.name).write_bytes(item.read_bytes())
+    return target
 
 
 def demo_core(root: Path) -> int:
     print("# Praxis Core demo")
-    print("\nThis demo captures one local source, writes provisional SkillGraph memory, chunks it, embeds it, and searches it.")
-    source_path = write_core_demo_source(root)
+    print(
+        "\nThis demo ingests a bundled Stack Overflow Developer Survey aggregate, writes provisional SkillGraph memory, chunks it, embeds it, and searches it."
+    )
+    source_path = copy_core_demo_dataset(root)
     steps = [
         ("Initialize relational DB", ["init-db"]),
         ("Initialize SkillGraph", ["init-graph"]),
@@ -66,22 +59,22 @@ def demo_core(root: Path) -> int:
                 "ingest",
                 str(source_path),
                 "--title",
-                "Praxis Core Demo Source",
+                "Stack Overflow Developer Survey AI Tooling Mini Dataset",
                 "--source-type",
-                "docs",
+                "survey",
                 "--source-id",
-                "src:praxis-core-demo",
+                CORE_DEMO_SOURCE_ID,
                 "--freshness-window-days",
-                "90",
+                "365",
                 "--notes",
-                "Generated by praxis demo core.",
+                "Bundled aggregate demo dataset derived from official Stack Overflow Developer Survey 2024 and 2025 archive files.",
             ],
         ),
         ("Chunk changed sources", ["chunk", "--changed-only", "--no-runtimes", "--no-skills"]),
         ("Embed chunks locally", ["embed", "--provider", "local-hash"]),
         (
             "Search with explanations",
-            ["search", "how does Praxis keep agent knowledge reusable and traceable?", "--explain", "--limit", "3"],
+            ["search", "Stack Overflow developer survey AI tool adoption trust accuracy developer segments", "--explain", "--limit", "3"],
         ),
     ]
     for label, args in steps:
@@ -93,7 +86,7 @@ def demo_core(root: Path) -> int:
 Core demo complete.
 
 You saw the full source-to-search path:
-  source -> evidence archive -> provisional SkillGraph memory -> chunks -> embeddings -> explained search.
+  bundled aggregate survey data -> evidence archive -> provisional SkillGraph memory -> chunks -> embeddings -> explained search.
 
 Try next:
   praxis changes list

diff --git a/src/praxis/commands/research_common.py b/src/praxis/commands/research_common.py
@@ -88,7 +88,7 @@ def infer_source_type(source: str, text: str) -> str:
 
 
 def credibility_score(source_type: str, metadata: dict[str, Any]) -> int:
-    if source_type in {"repo", "package", "paper", "docs"}:
+    if source_type in {"repo", "package", "paper", "docs", "survey"}:
         return 4
     if source_type == "local":
         return 3

diff --git a/src/praxis/demo_data/stackoverflow_developer_survey/BUILD.md b/src/praxis/demo_data/stackoverflow_developer_survey/BUILD.md
@@ -0,0 +1,42 @@
+# Build Notes
+
+This demo dataset is an aggregate-only excerpt derived from the official Stack Overflow Developer Survey archives for 2024 and 2025.
+
+## Source Inputs
+
+- 2024 schema: `https://media.githubusercontent.com/media/StackExchange/Survey/main/packages/archive/2024/schema.csv`
+- 2024 results: `https://media.githubusercontent.com/media/StackExchange/Survey/main/packages/archive/2024/results.csv`
+- 2025 schema: `https://media.githubusercontent.com/media/StackExchange/Survey/main/packages/archive/2025/schema.csv`
+- 2025 results: `https://media.githubusercontent.com/media/StackExchange/Survey/main/packages/archive/2025/results.csv`
+
+The raw source files are not committed to this repository. Their downloaded byte counts and SHA-256 checksums are recorded in `source_manifest.json`.
+
+## Aggregation
+
+The bundled CSVs are generated from fields that appear in both the 2024 and 2025 surveys, covering AI attitudes, developer type, and tooling:
+
+- `AISelect`
+- `AISent`
+- `AIAcc`
+- `AIComplex`
+- `AIThreat`
+- `DevType`
+- `LanguageHaveWorkedWith`
+- `DatabaseHaveWorkedWith`
+- `PlatformHaveWorkedWith`
+- `WebframeHaveWorkedWith`
+
+Generated files include:
+
+- `ai_attitudes_by_year.csv`: response counts and percentages for shared AI attitude fields.
+- `ai_developer_segments.csv`: developer-segment rollups for adoption, favorability, trust, complex-task handling, and job-threat perception.
+- `developer_tooling_top_items.csv`: top tooling items by year, drawn from the `HaveWorkedWith` columns shared by the 2024 and 2025 surveys.
+- `shared_schema_fields.csv`: selected schema fields and 2024/2025 result-column mappings.
+
+## Privacy Boundary
+
+Only aggregate rows are bundled. Raw respondent-level rows, free-text answers, and survey PDFs are excluded from the repository.
+
+## License
+
+The source survey database is distributed by Stack Overflow under the Open Database License (ODbL) 1.0, with database contents under the Database Contents License (DbCL) 1.0. Preserve attribution and license terms if redistributing or modifying this derived dataset.
diff --git a/src/praxis/demo_data/stackoverflow_developer_survey/README.md b/src/praxis/demo_data/stackoverflow_developer_survey/README.md
@@ -0,0 +1,34 @@
+# Stack Overflow Developer Survey AI Tooling Mini Dataset
+
+This bundled Praxis Core demo source is a small aggregate excerpt derived from the official Stack Overflow Developer Survey archives for 2024 and 2025.
+
+It is designed for a first-run Praxis demo: no credentials, no large download, no respondent-level records. The raw survey files stay outside the repo; only aggregate counts, percentages, schema field notes, and source provenance are bundled.
+
+## What Praxis Can Retrieve
+
+- AI adoption by year from `AISelect`.
+- Developer favorability toward AI tools from `AISent`.
+- Trust in AI output accuracy from `AIAcc`.
+- Perceived ability to handle complex tasks from `AIComplex`.
+- Job-threat perception from `AIThreat`.
+- Developer-segment rollups by `DevType`.
+- Top tooling items from the language, database, platform, and web framework columns shared by both survey years.
+
+## Example Aggregate Findings
+
+- In the 2024 survey excerpt, 37,662 of 65,437 respondents answered that they used AI tools in their development process.
+- In the 2025 survey excerpt, 26,469 of 49,191 respondents reported daily, weekly, monthly, or infrequent AI tool use.
+- The segment file lets Praxis compare AI use, favorability, trust, complex-task handling, and job-threat perception across common developer types.
+- The tooling file lets Praxis retrieve common language, database, platform, and web framework signals alongside AI attitudes.
+
+## Files
+
+- `ai_attitudes_by_year.csv`: response counts and percentages for shared AI attitude fields.
+- `ai_developer_segments.csv`: aggregate AI metrics by developer segment.
+- `developer_tooling_top_items.csv`: top tooling items from shared HaveWorkedWith columns.
+- `shared_schema_fields.csv`: selected shared schema fields and result-column mappings.
+- `source_manifest.json`: source URLs, access dates, licenses, downloaded byte counts, and SHA-256 checksums.
+
+## License And Attribution
+
+This aggregate excerpt is derived from Stack Overflow Developer Survey data. The source survey database is distributed by Stack Overflow under the Open Database License (ODbL) 1.0, with database contents under the Database Contents License (DbCL) 1.0. Preserve attribution and source-license terms if redistributing or modifying this derived dataset.