diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index bd587969a..9296619ef 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -51,7 +51,7 @@ "name": "snowflake", "source": "./plugins/snowflake", "description": "Snowflake data analysis commands for engineering metrics and reports", - "version": "0.4.0" + "version": "0.5.0" }, { "name": "sosreport", diff --git a/docs/data.json b/docs/data.json index fb62d802e..624c88dd5 100644 --- a/docs/data.json +++ b/docs/data.json @@ -797,7 +797,7 @@ "name": "Setup Snowflake" } ], - "version": "0.4.0" + "version": "0.5.0" }, { "commands": [ diff --git a/plugins/snowflake/.claude-plugin/plugin.json b/plugins/snowflake/.claude-plugin/plugin.json index 6c9610e1f..9648a3366 100644 --- a/plugins/snowflake/.claude-plugin/plugin.json +++ b/plugins/snowflake/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "snowflake", "description": "Snowflake data analysis commands for engineering metrics and reports", - "version": "0.4.0", + "version": "0.5.0", "author": { "name": "github.com/openshift-eng" } diff --git a/plugins/snowflake/README.md b/plugins/snowflake/README.md index 97da3dd7f..cbebe5299 100644 --- a/plugins/snowflake/README.md +++ b/plugins/snowflake/README.md @@ -4,7 +4,7 @@ Snowflake data analysis commands for engineering metrics and reports. Uses the [ ## Prerequisites -1. **Snowflake access** -- You need an account on your organization's Snowflake instance with the appropriate role (e.g., `JIRA_CLOUDMARTS_GROUP` for Jira data). See [the data platform documentation](https://dataverse.pages.redhat.com/data-docs/data-users/) for access provisioning. +1. **Snowflake access** -- You need an account on your organization's Snowflake instance with the `PUBLIC` role. See [the data platform documentation](https://dataverse.pages.redhat.com/data-docs/data-users/) for access provisioning. 2. **Python 3** -- Required for report generation. 
Most systems have this pre-installed. diff --git a/plugins/snowflake/commands/activity-type-report.md b/plugins/snowflake/commands/activity-type-report.md index b2c797ee3..51f963e72 100644 --- a/plugins/snowflake/commands/activity-type-report.md +++ b/plugins/snowflake/commands/activity-type-report.md @@ -13,6 +13,8 @@ snowflake:activity-type-report /snowflake:activity-type-report [months] --todo /snowflake:activity-type-report [months] --all /snowflake:activity-type-report [months] --uncategorized +/snowflake:activity-type-report [months] --uncategorized --todo +/snowflake:activity-type-report [months] --uncategorized --all /snowflake:activity-type-report [months] --uncategorized --sample [N] ``` @@ -35,7 +37,7 @@ Activity type categories: ### Phase 1: Verify Snowflake Connection -Read and follow the `setup-snowflake` skill. This checks for the Snowflake MCP server, guides the user through setup if needed, and sets the session context (`JIRA_CLOUDMARTS_GROUP` role, `JIRA_DB.CLOUDRHAI_MARTS` schema). +Read and follow the `setup-snowflake` skill. This checks for the Snowflake MCP server, guides the user through setup if needed, and sets the session context (`PUBLIC` role, `JIRA_DB.CLOUDRHAI_MARTS` schema). If setup fails, abort with the guidance message from the skill. Do not proceed without a working Snowflake connection. 
@@ -79,15 +81,28 @@ And `DPTP --uncategorized` means projects=DPTP, months=6 (default), closed issue Core query pattern (adapt based on available columns/views): ```sql +WITH bot_issues AS ( + SELECT DISTINCT ISSUE + FROM JIRA_LABEL_RHAI + WHERE LABEL IN ( + 'auto-created', 'bot-created', 'ai-generated', 'ai-generated-jira', + 'cloud-automated-jira', 'on-call-bot', 'automated', 'team:automatic_rule', + 'bot-duplicate', + 'art:image-build-failure', 'art:reconciliation', + 'acs-generated', 'triaged-test-automation' + ) +) SELECT ji.ISSUE_KEY AS ISSUEKEY, ji.PROJECT AS PROJECT_KEY, ji.SUMMARY, SUBSTR(ji.DESCRIPTION, 1, 2000) AS DESCRIPTION_EXCERPT, ji.CREATED, + CASE WHEN bi.ISSUE IS NOT NULL THEN TRUE ELSE FALSE END AS IS_BOT, -- join for issue type name: jit.PNAME AS ISSUE_TYPE -- join for status name: js.PNAME AS STATUS FROM JIRA_ISSUE_NON_PII ji +LEFT JOIN bot_issues bi ON bi.ISSUE = ji.ID LEFT JOIN JIRA_ISSUETYPE_RHAI jit ON jit.ID = ji.ISSUETYPE LEFT JOIN JIRA_ISSUESTATUS_RHAI js ON js.ID = ji.ISSUESTATUS_ID -- If --uncategorized: LEFT JOIN JIRA_CUSTOMFIELDVALUE_NON_PII cfv @@ -107,13 +122,23 @@ WHERE ji.PROJECT IN ('DPTP', 'TRT', ...) ORDER BY ji.CREATED DESC ``` -If `JIRA_NODEASSOCIATION_RHAI` and `JIRA_COMPONENT_RHAI` views exist, also fetch components: +The `bot_issues` CTE identifies issues filed by automation bots via labels in `JIRA_LABEL_RHAI`. These labels were verified across 48 HP projects — they reliably distinguish bot-filed tickets (e.g., ART image-build-failure, ACM auto-created CVEs) from human engineering work. Labels describing automation *work* by humans (e.g., `automation`, `qe-automation`, `auto-closed`) are intentionally excluded. 
+ +If `JIRA_NODEASSOCIATION_RHAI` and `JIRA_COMPONENT_RHAI` views exist, also fetch components (reuse the same `bot_issues` CTE from the query above — identical label list): ```sql +WITH bot_issues AS ( + -- Same CTE as main query above — keep label list in sync + SELECT DISTINCT ISSUE + FROM JIRA_LABEL_RHAI + WHERE LABEL IN (<same label list as the main query above>) +) SELECT ji.ISSUE_KEY AS ISSUEKEY, - LISTAGG(c.CNAME, ', ') WITHIN GROUP (ORDER BY c.CNAME) AS COMPONENTS + LISTAGG(c.CNAME, ', ') WITHIN GROUP (ORDER BY c.CNAME) AS COMPONENTS, + MAX(CASE WHEN bi.ISSUE IS NOT NULL THEN TRUE ELSE FALSE END) AS IS_BOT FROM JIRA_ISSUE_NON_PII ji +LEFT JOIN bot_issues bi ON bi.ISSUE = ji.ID LEFT JOIN JIRA_NODEASSOCIATION_RHAI na ON na.SOURCE_NODE_ID = ji.ID AND na.ASSOCIATION_TYPE = 'IssueComponent' LEFT JOIN JIRA_COMPONENT_RHAI c ON c.ID = na.SINK_NODE_ID @@ -177,7 +202,7 @@ All subsequent phases write to `$RUN_DIR/`. **Cache check**: If `$RUN_DIR/classified_issues.json` already exists (full mode) or `$RUN_DIR/estimates.json` already exists (sample mode), skip classification entirely and go directly to Phase 5. Tell the user: "Found existing classification in `$RUN_DIR/` — skipping Vertex AI API call to save tokens. Delete the directory to force re-classification." -Otherwise, write the fetched issues to `$RUN_DIR/issues.json` as a JSON array. Each object should include: `ISSUEKEY`, `PROJECT_KEY`, `SUMMARY`, `DESCRIPTION_EXCERPT`, `CREATED`, `ISSUE_TYPE`, `STATUS`, and `COMPONENTS` (if available). +Otherwise, write the fetched issues to `$RUN_DIR/issues.json` as a JSON array. Each object should include: `ISSUEKEY`, `PROJECT_KEY`, `SUMMARY`, `DESCRIPTION_EXCERPT`, `CREATED`, `ISSUE_TYPE`, `STATUS`, `COMPONENTS` (if available), and `IS_BOT`. 
Find the scripts directory: ```bash @@ -203,7 +228,7 @@ python3 "$SCRIPT_DIR/sample_and_estimate.py" \ --draw-sample $RUN_DIR/sample_to_classify.json \ --sample-size ${N:-0} ``` -(0 = auto-recommend based on ±2.5% target precision, typically ~400 issues) +(0 = auto-recommend based on ±2.5% target precision, typically ~400 issues. Stratifies by (project, is_bot) to ensure both human and bot populations are represented in the sample.) **Step 2: Classify only the sample** ```bash @@ -281,10 +306,37 @@ Include the status filter in the summary header. When `--uncategorized` is activ #### Full mode summary: +When bot issues are detected (any issue has `IS_BOT=true`), show separate human and bot distributions. The human distribution is the primary output — it shows what engineers are actually working on. The bot distribution is secondary context. + ``` Activity Type Report: $RUN_DIR/activity-type-report.html -54,478 issues across 52 projects (2025-10-02 to 2026-04-07) +3,114 closed issues across 1 project (2026-01-22 to 2026-04-22) + Human: 38 (1.2%) | Automated/Bot: 3,076 (98.8%) + +Human Work — Activity Type Distribution: + Product / Portfolio Work 15 (39.5%) + Quality / Stability / Reliability 8 (21.1%) + Future Sustainability 6 (15.8%) + Incidents & Support 4 (10.5%) + Security & Compliance 3 (7.9%) + Associate Wellness & Development 1 (2.6%) + Uncategorized 1 (2.6%) + +Automated/Bot Work — Activity Type Distribution: + Quality / Stability / Reliability 3,050 (99.2%) + Product / Portfolio Work 15 (0.5%) + Uncategorized 11 (0.4%) + +Classification cost: 86,313 input + 13,008 output = 99,321 tokens, $0.45 +``` + +When zero bot issues are detected, omit the human/bot split and show the current format: + +``` +Activity Type Report: $RUN_DIR/activity-type-report.html + +247 closed issues across 1 project (2025-10-02 to 2026-04-07) Activity Type Distribution: Quality / Stability / Reliability 98 (39.7%) @@ -300,7 +352,32 @@ Classification cost: 86,313 input + 13,008 output 
= 99,321 tokens, $0.45 #### Sample mode summary: -Include credible intervals and sample metadata: +Include credible intervals and sample metadata. When bot issues are detected, show separate human and bot distributions with their own credible intervals. + +``` +Activity Type Report (Sampled Estimate): $RUN_DIR/activity-type-report.html + +4,338 issues across 1 project (2025-10-02 to 2026-04-07) + Human: 1,237 (28.5%) | Automated/Bot: 3,101 (71.5%) +Sample: 369 classified (8.5%) — 25 API calls, $0.45 + +Human Work — Activity Type Distribution (95% Credible Intervals): + Product / Portfolio Work 32.1% [25.4% — 39.2%] + Quality / Stability / Reliability 22.8% [17.0% — 29.3%] + Future Sustainability 16.5% [11.4% — 22.4%] + Incidents & Support 12.3% [ 7.9% — 17.5%] + Security & Compliance 8.7% [ 5.1% — 13.3%] + Associate Wellness & Development 4.2% [ 1.8% — 7.8%] + Uncategorized 3.4% [ 1.3% — 6.6%] + +Automated/Bot Work — Activity Type Distribution (95% Credible Intervals): + Quality / Stability / Reliability 96.2% [94.1% — 97.8%] + Product / Portfolio Work 1.5% [ 0.5% — 3.1%] + Uncategorized 1.3% [ 0.4% — 2.8%] + ... +``` + +When zero bot issues are detected, omit the split and show the original format: ``` Activity Type Report (Sampled Estimate): $RUN_DIR/activity-type-report.html @@ -318,7 +395,7 @@ Activity Type Distribution (95% Credible Intervals): Incidents & Support 3.5% [ 1.9% — 5.5%] ``` -Read the estimates from `$RUN_DIR/estimates.json` (field: `overall.estimates[]`, each with `category`, `posterior_mean`, `ci_low`, `ci_high`) and the usage from `$RUN_DIR/classified_sample_usage.txt` (or `classified_issues_usage.txt` in full mode). +Read the estimates from `$RUN_DIR/estimates.json`. For the overall distribution, use `overall.estimates[]` (each with `category`, `posterior_mean`, `ci_low`, `ci_high`). When the `human` and `bot` values in the JSON are non-null (both keys are always written, but are `null` unless the classified sample contains both human and bot issues), use `human.estimates[]` and `bot.estimates[]` for the separate distributions. 
Read usage from `$RUN_DIR/classified_sample_usage.txt` (or `classified_issues_usage.txt` in full mode). After the summary, tell the user the HTML report is available at the path shown and can be opened directly in a browser from their host filesystem. @@ -340,7 +417,7 @@ After the summary, tell the user the HTML report is available at the path shown - N = sample size (default: auto-recommended for ±2.5% precision, typically ~400) - The report shows posterior means with 95% credible intervals instead of exact counts - Dramatically reduces API cost and time for large datasets (e.g., 27 API calls vs. 1,000+) - - Uses stratified sampling by project to ensure all projects are represented + - Uses stratified sampling by (project, is_bot) to ensure all projects and both human/bot populations are represented - **--todo** (optional) - Analyze only open/backlog issues (non-closed statuses: New, In Progress, To Do, Refinement, etc.) @@ -366,8 +443,9 @@ Each run produces a directory under `.work/snowflake/reports/` containing the ra The report includes: - Sankey diagram: Project to Activity Type flows -- Summary statistics -- Searchable, paginated detail table with direct Jira links per issue +- Human/All/Bot toggle (when bot issues are detected) to view distributions separately +- Summary statistics with human/bot counts +- Searchable, paginated detail table with direct Jira links per issue and Source column (Human/Bot) - CSV export capability ## Examples @@ -417,7 +495,17 @@ The report includes: /snowflake:activity-type-report DPTP --uncategorized ``` -10. **Uncategorized with sampling:** +10. **Uncategorized open/backlog issues:** + ```bash + /snowflake:activity-type-report DPTP,TRT 6 --uncategorized --todo + ``` + +11. **Uncategorized across all statuses:** + ```bash + /snowflake:activity-type-report DPTP,TRT 6 --uncategorized --all + ``` + +12. 
**Uncategorized with sampling:** ```bash /snowflake:activity-type-report DPTP,TRT,ART 6 --uncategorized --sample ``` @@ -436,5 +524,6 @@ The report includes: - **Self-contained output**: The HTML report works offline after generation -- no server needed. - **Cached classifications**: Re-running the same projects and date range skips the Vertex AI API call and reuses the existing `classified_issues.json` (or `estimates.json` in sample mode). Delete the run directory to force re-classification. - **Completed work by default**: By default, only closed issues (ISSUESTATUS_ID=6) with work-completed resolutions (RESOLUTION IN (10000, 10041) i.e. Done/Done-Errata, or NULL) are analyzed — this excludes no-work closures like Duplicate, Won't Do, Obsolete, Not a Bug, Can't Do, Cannot Reproduce, and MirrorOrphan (~25% of closed issues globally). Use `--todo` for open/backlog work, or `--all` for everything. -- **Sampling mode**: For large datasets (thousands of issues), `--sample` uses Bayesian inference to estimate the activity type distribution from a small classified sample. Uses a Dirichlet-Multinomial conjugate model with uninformative priors — implemented entirely with Python stdlib (`random.gammavariate`). The report clearly labels results as estimates and shows credible intervals. +- **Bot detection**: Issues filed by automation bots are identified via labels in `JIRA_LABEL_RHAI` (e.g., `auto-created`, `art:image-build-failure`, `ai-generated-jira`). The SQL CTE uses 13 verified bot labels covering general bot patterns and project-specific automation (ART, ACM, OCM, SREP, etc.). When bot issues are detected, the report shows a Human/All/Bot toggle and separate distributions. Labels describing automation *work* by humans (e.g., `automation`, `qe-automation`, `auto-closed`) are intentionally excluded. Projects with no bot issues show the standard single-view report. 
+- **Sampling mode**: For large datasets (thousands of issues), `--sample` uses Bayesian inference to estimate the activity type distribution from a small classified sample. Uses a Dirichlet-Multinomial conjugate model with uninformative priors — implemented entirely with Python stdlib (`random.gammavariate`). Stratifies by (project, is_bot) to ensure both human and bot populations are represented. The report clearly labels results as estimates and shows credible intervals, with separate human/bot estimates when applicable. - **Uncategorized filter**: The `--uncategorized` flag uses `customfield_10464` (Activity Type) from the `JIRA_CUSTOMFIELDVALUE_NON_PII` view. **This custom field ID is specific to Red Hat JIRA instances.** The typical workflow is: run with `--uncategorized` to find and classify issues missing their Activity Type, review the report, then use `/jira:categorize-activity-type` to apply the classifications back to Jira. diff --git a/plugins/snowflake/scripts/classify_issues.py b/plugins/snowflake/scripts/classify_issues.py index 39451f683..32321174e 100644 --- a/plugins/snowflake/scripts/classify_issues.py +++ b/plugins/snowflake/scripts/classify_issues.py @@ -61,6 +61,14 @@ VALID_CATEGORIES = set(ACTIVITY_TYPE_DEFINITIONS.keys()) +def _get_is_bot(issue): + """Extract bot flag from an issue, handling both Snowflake and processed formats.""" + val = issue.get("IS_BOT", issue.get("is_bot", False)) + if isinstance(val, str): + return val.lower() in ("true", "1", "yes") + return bool(val) + + def build_prompt(batch): """Build the classification prompt for a batch of issues.""" defs_text = "\n\n".join( @@ -283,6 +291,7 @@ def main(): "status": issue.get("STATUS", issue.get("status", "")), "components": issue.get("COMPONENTS", issue.get("components", "")), "created": issue.get("CREATED", issue.get("created", "")), + "is_bot": _get_is_bot(issue), }) # Write output @@ -303,6 +312,14 @@ def main(): pct = count / len(output) * 100 print(f" {cat:<45s} 
{count:>4d} ({pct:.1f}%)") + # Print bot/human split + bot_count = sum(1 for item in output if item.get("is_bot")) + human_count = len(output) - bot_count + if bot_count > 0: + print(f"\nBot/Human Split:") + print(f" Human: {human_count:>6,} ({human_count/len(output)*100:.1f}%)") + print(f" Automated: {bot_count:>6,} ({bot_count/len(output)*100:.1f}%)") + # Print cost summary total_tokens = total_input_tokens + total_output_tokens print(f"\nAPI Usage:") diff --git a/plugins/snowflake/scripts/generate_sankey.py b/plugins/snowflake/scripts/generate_sankey.py index 7abb57ed3..a32425222 100644 --- a/plugins/snowflake/scripts/generate_sankey.py +++ b/plugins/snowflake/scripts/generate_sankey.py @@ -41,8 +41,19 @@ } -def generate_d3_sankey(data): - """Generate a pure-JS D3 sankey when Plotly is not installed.""" +def _get_is_bot(issue): + """Extract bot flag from an issue.""" + val = issue.get("IS_BOT", issue.get("is_bot", False)) + if isinstance(val, str): + return val.lower() in ("true", "1", "yes") + return bool(val) + + +def generate_d3_sankey(data, container_id="d3-sankey"): + """Generate a pure-JS D3 sankey diagram.""" + if not data: + return f'
No issues in this view
' + flow_counts = Counter() for issue in data: flow_counts[(issue["project_key"], issue["activity_type"])] += 1 @@ -72,14 +83,18 @@ def generate_d3_sankey(data): ) return f""" -
- - +
+ +

Project → Activity Type

- {sankey_html} + {f'''
{sankey_human}
+ + ''' if has_bots else f'''
{sankey_all}
'''}
@@ -593,13 +667,18 @@ def generate_html(data, title, projects_str, months, usage_info=None, "var JQL_URL_KEY_LIMIT = 100;\n" "var ACTIVITY_COLORS = " + json.dumps(ACTIVITY_COLORS) + ";\n" "var ACTIVITY_TYPES = " + json.dumps(sorted(ACTIVITY_COLORS.keys())) + ";\n" + "var HAS_BOTS = " + json.dumps(has_bots) + ";\n" ) app_js += r""" var COLUMNS = [ {key: "issue_key", label: "Issue Key", width: "120px"}, {key: "project_key", label: "Project", width: "80px"}, {key: "activity_type", label: "Activity Type", width: "200px"}, - {key: "summary", label: "Summary", width: ""}, + {key: "summary", label: "Summary", width: ""}""" + if has_bots: + app_js += r""", + {key: "is_bot", label: "Source", width: "80px"}""" + app_js += r""", {key: "issue_type", label: "Type", width: "90px"}, {key: "status", label: "Status", width: "100px"}, {key: "components", label: "Components", width: "140px"}, @@ -610,15 +689,59 @@ def generate_html(data, title, projects_str, months, usage_info=None, var currentPage = 1, pageSize = 50; var filteredData = TABLE_DATA.slice(); var colFilters = {}; +var currentViewFilter = HAS_BOTS ? "human" : "all"; function escapeHtml(s) { if (s == null) return ""; return String(s).replace(/&/g,"&").replace(//g,">").replace(/"/g,"""); } +function switchView(view) { + currentViewFilter = view; + // Update toggle buttons + var btns = document.querySelectorAll('.toggle-btn'); + for (var i = 0; i < btns.length; i++) { + var btn = btns[i]; + if (btn.getAttribute('data-view') === view) { + btn.classList.add('active'); + } else { + btn.classList.remove('active'); + } + } + // Show/hide sankeys and summaries + var views = ['human', 'bot', 'all']; + for (var v = 0; v < views.length; v++) { + var vName = views[v]; + var sankey = document.getElementById('sankey-' + vName); + var summary = document.getElementById('summary-' + vName); + if (sankey) { + sankey.style.display = vName === view ? 
'block' : 'none'; + // Render pending sankey on first show + if (vName === view) { + var pending = sankey.querySelector('[data-pending="true"]'); + if (pending) { + pending.removeAttribute('data-pending'); + // Re-render by re-running the sankey script + var scripts = sankey.querySelectorAll('script'); + for (var s = 0; s < scripts.length; s++) { + var newScript = document.createElement('script'); + newScript.textContent = scripts[s].textContent; + scripts[s].parentNode.replaceChild(newScript, scripts[s]); + } + } + } + } + if (summary) summary.style.display = vName === view ? 'block' : 'none'; + } + applyFilters(); +} + function applyFilters() { var globalTerm = document.getElementById("global-search").value.toLowerCase(); filteredData = TABLE_DATA.filter(function(row) { + // View filter (human/bot/all) + if (currentViewFilter === "human" && row.is_bot) return false; + if (currentViewFilter === "bot" && !row.is_bot) return false; // Global search if (globalTerm) { var match = false; @@ -632,8 +755,13 @@ def generate_html(data, title, projects_str, months, usage_info=None, // Column filters for (var col in colFilters) { if (!colFilters[col]) continue; - var val = String(row[col] || "").toLowerCase(); - if (val.indexOf(colFilters[col].toLowerCase()) < 0) return false; + if (col === "is_bot") { + var expected = colFilters[col].toLowerCase() === "bot"; + if (row.is_bot !== expected) return false; + } else { + var val = String(row[col] || "").toLowerCase(); + if (val.indexOf(colFilters[col].toLowerCase()) < 0) return false; + } } return true; }); @@ -677,6 +805,10 @@ def generate_html(data, title, projects_str, months, usage_info=None, } else if (key === "activity_type") { var color = ACTIVITY_COLORS[val] || "#9E9E9E"; html += '' + escapeHtml(val) + ''; + } else if (key === "is_bot") { + var label = val ? "Bot" : "Human"; + var cssClass = val ? 
"source-bot" : "source-human"; + html += '' + label + ''; } else { html += '' + escapeHtml(val) + ''; } @@ -751,6 +883,12 @@ def generate_html(data, title, projects_str, months, usage_info=None, html += ''; } html += ''; + } else if (c.key === "is_bot") { + html += ''; } else if (c.key === "summary") { // Skip — global search covers this continue; @@ -773,11 +911,15 @@ def generate_html(data, title, projects_str, months, usage_info=None, } function updateCount() { - var total = TABLE_DATA.length; + var viewTotal = TABLE_DATA.filter(function(r) { + if (currentViewFilter === "human" && r.is_bot) return false; + if (currentViewFilter === "bot" && !r.is_bot) return false; + return true; + }).length; var count = filteredData.length; - var isFiltered = count !== total; + var isFiltered = count !== viewTotal; document.getElementById("row-count").textContent = - isFiltered ? count + " of " + total + " issues" : total + " issues"; + isFiltered ? count + " of " + viewTotal + " issues" : viewTotal + " issues"; var hint = document.getElementById("jql-hint"); if (!isFiltered && count > JQL_URL_KEY_LIMIT) { hint.textContent = "Filter the table first, or use Copy JQL for large sets"; diff --git a/plugins/snowflake/scripts/sample_and_estimate.py b/plugins/snowflake/scripts/sample_and_estimate.py index c4578404b..9c2b23701 100644 --- a/plugins/snowflake/scripts/sample_and_estimate.py +++ b/plugins/snowflake/scripts/sample_and_estimate.py @@ -46,70 +46,82 @@ ] +def _get_is_bot(issue): + """Extract bot flag from an issue, handling both Snowflake and processed formats.""" + val = issue.get("IS_BOT", issue.get("is_bot", False)) + if isinstance(val, str): + return val.lower() in ("true", "1", "yes") + return bool(val) + + def stratified_sample(issues, sample_size, seed=42): - """Draw a stratified random sample proportional to project size. + """Draw a stratified random sample proportional to stratum size. 
- Ensures every project gets at least 1 issue in the sample (if possible), + Stratifies by (project, is_bot) to ensure both human and bot populations + are represented. Guarantees at least 1 issue per stratum (if possible), then allocates remaining slots proportionally. """ rng = random.Random(seed) - by_project = {} + by_stratum = {} for issue in issues: proj = issue.get("PROJECT_KEY", issue.get("project_key", "UNKNOWN")) - by_project.setdefault(proj, []).append(issue) + is_bot = _get_is_bot(issue) + stratum = (proj, "bot" if is_bot else "human") + by_stratum.setdefault(stratum, []).append(issue) total = len(issues) n = min(sample_size, total) if n >= total: - return list(issues), {p: len(v) for p, v in by_project.items()} + counts = {} + for s, v in by_stratum.items(): + counts[s] = len(v) + return list(issues), counts - # Guarantee at least 1 per project, then proportional allocation + # Guarantee at least 1 per stratum, then proportional allocation allocations = {} remaining = n - for proj, proj_issues in by_project.items(): - allocations[proj] = min(1, len(proj_issues)) - remaining -= allocations[proj] + for stratum, stratum_issues in by_stratum.items(): + allocations[stratum] = min(1, len(stratum_issues)) + remaining -= allocations[stratum] # Distribute remaining proportionally if remaining > 0: proportional = {} - for proj, proj_issues in by_project.items(): - proportional[proj] = len(proj_issues) / total * n - # Subtract already-allocated minimum - for proj in by_project: - proportional[proj] = max(0, proportional[proj] - allocations[proj]) - # Normalize to fill remaining slots + for stratum, stratum_issues in by_stratum.items(): + proportional[stratum] = len(stratum_issues) / total * n + for stratum in by_stratum: + proportional[stratum] = max(0, proportional[stratum] - allocations[stratum]) prop_total = sum(proportional.values()) if prop_total > 0: - for proj in by_project: - extra = int(proportional[proj] / prop_total * remaining) - extra = min(extra, 
len(by_project[proj]) - allocations[proj]) - allocations[proj] += extra + for stratum in by_stratum: + extra = int(proportional[stratum] / prop_total * remaining) + extra = min(extra, len(by_stratum[stratum]) - allocations[stratum]) + allocations[stratum] += extra remaining -= extra - # Distribute any leftover slots to largest projects + # Distribute any leftover slots to largest strata if remaining > 0: - projects_by_size = sorted(by_project.keys(), - key=lambda p: len(by_project[p]), - reverse=True) - for proj in projects_by_size: + strata_by_size = sorted(by_stratum.keys(), + key=lambda s: len(by_stratum[s]), + reverse=True) + for stratum in strata_by_size: if remaining <= 0: break - can_add = len(by_project[proj]) - allocations[proj] + can_add = len(by_stratum[stratum]) - allocations[stratum] add = min(can_add, remaining) - allocations[proj] += add + allocations[stratum] += add remaining -= add # Draw samples sample = [] sample_counts = {} - for proj, count in allocations.items(): - proj_issues = by_project[proj] - drawn = rng.sample(proj_issues, min(count, len(proj_issues))) + for stratum, count in allocations.items(): + stratum_issues = by_stratum[stratum] + drawn = rng.sample(stratum_issues, min(count, len(stratum_issues))) sample.extend(drawn) - sample_counts[proj] = len(drawn) + sample_counts[stratum] = len(drawn) rng.shuffle(sample) return sample, sample_counts @@ -261,6 +273,9 @@ def main(): with open(args.input) as f: all_issues = json.load(f) total = len(all_issues) + if total == 0: + print("No issues to process.", file=sys.stderr) + sys.exit(1) print(f"Total issues: {total}") # Auto-recommend sample size @@ -287,14 +302,35 @@ def main(): print(f"\nSample drawn: {len(sample)} of {total} issues " f"({len(sample)/total*100:.1f}%)") + + # Aggregate stratum counts to project-level for display + proj_sample = {} + for (proj, bot_status), count in sample_counts.items(): + proj_sample.setdefault(proj, {"human": 0, "bot": 0}) + proj_sample[proj][bot_status] = 
count + + proj_totals = {} + for i in all_issues: + proj = i.get("PROJECT_KEY", i.get("project_key", "UNKNOWN")) + proj_totals[proj] = proj_totals.get(proj, 0) + 1 + + has_bots = any(v.get("bot", 0) > 0 for v in proj_sample.values()) + print("Stratification by project:") - for proj in sorted(sample_counts.keys()): - proj_total = sum(1 for i in all_issues - if (i.get("PROJECT_KEY", i.get("project_key")) - == proj)) - pct = (sample_counts[proj] / proj_total * 100) if proj_total else 0.0 - print(f" {proj:<20s} {sample_counts[proj]:>4d} of {proj_total:>5d} " - f"({pct:.1f}%)") + for proj in sorted(proj_sample.keys()): + proj_total = proj_totals.get(proj, 0) + sampled = proj_sample[proj]["human"] + proj_sample[proj]["bot"] + pct = (sampled / proj_total * 100) if proj_total else 0.0 + bot_info = "" + if has_bots and proj_sample[proj]["bot"] > 0: + bot_info = f" (human: {proj_sample[proj]['human']}, bot: {proj_sample[proj]['bot']})" + print(f" {proj:<20s} {sampled:>4d} of {proj_total:>5d} " + f"({pct:.1f}%){bot_info}") + + if has_bots: + total_bot = sum(v["bot"] for v in proj_sample.values()) + total_human = sum(v["human"] for v in proj_sample.values()) + print(f"\n Total: {total_human} human + {total_bot} bot = {len(sample)} sampled") print(f"\nSample written to: {args.draw_sample}") print("Next: classify this sample with classify_issues.py, " @@ -308,6 +344,12 @@ def main(): print(f"Classified sample: {len(classified)} issues") + # Split by bot status + human_classified = [i for i in classified if not _get_is_bot(i)] + bot_classified = [i for i in classified if _get_is_bot(i)] + human_total = sum(1 for i in all_issues if not _get_is_bot(i)) + bot_total = total - human_total + # Overall estimates overall = estimate_distribution( classified, confidence=args.confidence, seed=args.seed @@ -318,6 +360,25 @@ def main(): classified, confidence=args.confidence, seed=args.seed ) + # Human-only and bot-only estimates + human_estimates = None + bot_estimates = None + if 
human_classified and bot_classified: + human_estimates = { + "population": human_total, + "sample_size": len(human_classified), + **estimate_distribution( + human_classified, confidence=args.confidence, seed=args.seed + ), + } + bot_estimates = { + "population": bot_total, + "sample_size": len(bot_classified), + **estimate_distribution( + bot_classified, confidence=args.confidence, seed=args.seed + ), + } + result = { "method": "Dirichlet-Multinomial Bayesian estimation", "total_population": total, @@ -326,6 +387,8 @@ def main(): "confidence": args.confidence, "seed": args.seed, "overall": overall, + "human": human_estimates, + "bot": bot_estimates, "by_project": per_project, } @@ -346,6 +409,22 @@ def main(): print(f"{est['category']:<45s} {mean_pct:>5.1f}% " f"[{lo_pct:>5.1f}% — {hi_pct:>5.1f}%]") + if human_estimates and bot_estimates: + print(f"\nBot/Human Split: {human_total} human + {bot_total} bot " + f"= {total} total") + print(f" Sample: {len(human_classified)} human + " + f"{len(bot_classified)} bot = {len(classified)}") + + print(f"\nHuman Work ({len(human_classified)} of {human_total}):") + for est in human_estimates["estimates"]: + mean_pct = est["posterior_mean"] * 100 + print(f" {est['category']:<45s} {mean_pct:>5.1f}%") + + print(f"\nAutomated/Bot Work ({len(bot_classified)} of {bot_total}):") + for est in bot_estimates["estimates"]: + mean_pct = est["posterior_mean"] * 100 + print(f" {est['category']:<45s} {mean_pct:>5.1f}%") + if args.output: os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True) diff --git a/plugins/snowflake/scripts/test_sample_and_estimate.py b/plugins/snowflake/scripts/test_sample_and_estimate.py new file mode 100644 index 000000000..134f53a88 --- /dev/null +++ b/plugins/snowflake/scripts/test_sample_and_estimate.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +"""Tests for _get_is_bot() and stratified_sample() in sample_and_estimate.py.""" + +import unittest +from sample_and_estimate import _get_is_bot, 
stratified_sample + + +class TestGetIsBot(unittest.TestCase): + + def test_bool_true(self): + self.assertTrue(_get_is_bot({"IS_BOT": True})) + + def test_bool_false(self): + self.assertFalse(_get_is_bot({"IS_BOT": False})) + + def test_string_true_lowercase(self): + self.assertTrue(_get_is_bot({"IS_BOT": "true"})) + + def test_string_true_uppercase(self): + self.assertTrue(_get_is_bot({"IS_BOT": "TRUE"})) + + def test_string_true_mixed_case(self): + self.assertTrue(_get_is_bot({"IS_BOT": "True"})) + + def test_string_one(self): + self.assertTrue(_get_is_bot({"IS_BOT": "1"})) + + def test_string_yes(self): + self.assertTrue(_get_is_bot({"IS_BOT": "yes"})) + + def test_string_false(self): + self.assertFalse(_get_is_bot({"IS_BOT": "false"})) + + def test_string_zero(self): + self.assertFalse(_get_is_bot({"IS_BOT": "0"})) + + def test_string_no(self): + self.assertFalse(_get_is_bot({"IS_BOT": "no"})) + + def test_int_one(self): + self.assertTrue(_get_is_bot({"IS_BOT": 1})) + + def test_int_zero(self): + self.assertFalse(_get_is_bot({"IS_BOT": 0})) + + def test_none_value(self): + self.assertFalse(_get_is_bot({"IS_BOT": None})) + + def test_missing_key_defaults_false(self): + self.assertFalse(_get_is_bot({})) + + def test_lowercase_key(self): + self.assertTrue(_get_is_bot({"is_bot": True})) + + def test_lowercase_key_string(self): + self.assertFalse(_get_is_bot({"is_bot": "false"})) + + def test_uppercase_takes_precedence(self): + self.assertTrue(_get_is_bot({"IS_BOT": True, "is_bot": False})) + + +class TestStratifiedSample(unittest.TestCase): + + def _make_issues(self, specs): + """Build issue list from (project, is_bot, count) tuples.""" + issues = [] + for proj, is_bot, count in specs: + for i in range(count): + issues.append({ + "PROJECT_KEY": proj, + "IS_BOT": is_bot, + "ISSUEKEY": f"{proj}-{i}", + }) + return issues + + def test_sample_size_exceeds_total_returns_all(self): + issues = self._make_issues([("A", False, 5)]) + sample, counts = 
stratified_sample(issues, 100) + self.assertEqual(len(sample), 5) + self.assertEqual(counts[("A", "human")], 5) + + def test_sample_equals_total_returns_all(self): + issues = self._make_issues([("A", False, 10)]) + sample, counts = stratified_sample(issues, 10) + self.assertEqual(len(sample), 10) + + def test_every_stratum_gets_at_least_one(self): + issues = self._make_issues([ + ("A", False, 100), + ("A", True, 100), + ("B", False, 5), + ("B", True, 3), + ]) + sample, counts = stratified_sample(issues, 10) + self.assertEqual(len(sample), 10) + self.assertGreaterEqual(counts[("A", "human")], 1) + self.assertGreaterEqual(counts[("A", "bot")], 1) + self.assertGreaterEqual(counts[("B", "human")], 1) + self.assertGreaterEqual(counts[("B", "bot")], 1) + + def test_all_human_no_bot_strata(self): + issues = self._make_issues([ + ("A", False, 50), + ("B", False, 50), + ]) + sample, counts = stratified_sample(issues, 20) + self.assertEqual(len(sample), 20) + self.assertNotIn(("A", "bot"), counts) + self.assertNotIn(("B", "bot"), counts) + self.assertIn(("A", "human"), counts) + self.assertIn(("B", "human"), counts) + + def test_all_bot_no_human_strata(self): + issues = self._make_issues([("A", True, 30)]) + sample, counts = stratified_sample(issues, 10) + self.assertEqual(len(sample), 10) + self.assertIn(("A", "bot"), counts) + self.assertNotIn(("A", "human"), counts) + + def test_proportional_allocation(self): + issues = self._make_issues([ + ("A", False, 900), + ("A", True, 100), + ]) + sample, counts = stratified_sample(issues, 100) + self.assertEqual(len(sample), 100) + self.assertGreater(counts[("A", "human")], counts[("A", "bot")]) + + def test_deterministic_with_seed(self): + issues = self._make_issues([ + ("A", False, 50), + ("A", True, 50), + ]) + s1, c1 = stratified_sample(issues, 20, seed=123) + s2, c2 = stratified_sample(issues, 20, seed=123) + self.assertEqual([i["ISSUEKEY"] for i in s1], + [i["ISSUEKEY"] for i in s2]) + + def test_different_seeds_differ(self): 
+ issues = self._make_issues([ + ("A", False, 100), + ("A", True, 100), + ]) + s1, _ = stratified_sample(issues, 20, seed=1) + s2, _ = stratified_sample(issues, 20, seed=2) + keys1 = set(i["ISSUEKEY"] for i in s1) + keys2 = set(i["ISSUEKEY"] for i in s2) + self.assertNotEqual(keys1, keys2) + + def test_return_counts_use_tuple_keys(self): + issues = self._make_issues([ + ("PROJ", False, 10), + ("PROJ", True, 5), + ]) + _, counts = stratified_sample(issues, 8) + for key in counts: + self.assertIsInstance(key, tuple) + self.assertEqual(len(key), 2) + + def test_lowercase_keys_handled(self): + issues = [ + {"project_key": "X", "is_bot": False, "ISSUEKEY": "X-1"}, + {"project_key": "X", "is_bot": True, "ISSUEKEY": "X-2"}, + ] + sample, counts = stratified_sample(issues, 2) + self.assertEqual(len(sample), 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/plugins/snowflake/skills/setup-snowflake/SKILL.md b/plugins/snowflake/skills/setup-snowflake/SKILL.md index f829961d8..9f1d978cd 100644 --- a/plugins/snowflake/skills/setup-snowflake/SKILL.md +++ b/plugins/snowflake/skills/setup-snowflake/SKILL.md @@ -147,14 +147,14 @@ Then **abort the current command gracefully**. Do not attempt to proceed to Step Once the MCP tool is confirmed available, set the database, schema, and role for the session: ```text -mcp__snowflake__execute_sql(query="USE ROLE JIRA_CLOUDMARTS_GROUP") +mcp__snowflake__execute_sql(query="USE ROLE PUBLIC") mcp__snowflake__execute_sql(query="USE DATABASE JIRA_DB") mcp__snowflake__execute_sql(query="USE SCHEMA CLOUDRHAI_MARTS") ``` If any of these fail (e.g., role not granted), inform the user: -> Your Snowflake account does not have the `JIRA_CLOUDMARTS_GROUP` role. This role is required to access Jira data in Snowflake. Please request this role through the access provisioning process at: +> Your Snowflake account does not have the `PUBLIC` role. 
Please request access to Jira data in Snowflake (`JIRA_DB.CLOUDRHAI_MARTS`) through the access provisioning process at: > > **https://dataverse.pages.redhat.com/data-docs/data-users/**