159 changes: 159 additions & 0 deletions blog-mcp/README.md
@@ -0,0 +1,159 @@
# Blog Content Extractor MCP

MCP for extracting, deduplicating, and summarizing blog content using Firecrawl and Supabase.

## Features

- **Content Extraction**: Uses Firecrawl to extract title, body, author, and date from blog URLs
- **Fingerprint Deduplication**: Generates a SHA-256 hash of title + body to identify unique content
- **State Persistence**: Stores records in Supabase to avoid reprocessing
- **Per-Domain Watermarks**: Tracks when each domain was last processed
- **Insight-Focused Summaries**: Generates short summaries by extracting key sentences

## Configuration

### 1. Supabase - Create Tables

Run the following in your Supabase project's SQL Editor:

```sql
-- Processed content table
CREATE TABLE blog_content (
  id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
  url TEXT UNIQUE NOT NULL,
  fingerprint TEXT NOT NULL,
  domain TEXT NOT NULL,
  title TEXT NOT NULL,
  first_seen_at TIMESTAMPTZ NOT NULL,
  last_seen_at TIMESTAMPTZ NOT NULL,
  updated_count INTEGER DEFAULT 0,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for performance
CREATE INDEX idx_blog_content_domain ON blog_content(domain);
CREATE INDEX idx_blog_content_fingerprint ON blog_content(fingerprint);
CREATE INDEX idx_blog_content_url ON blog_content(url);

-- Per-domain watermark table
CREATE TABLE blog_watermarks (
  domain TEXT PRIMARY KEY,
  last_processed_at TIMESTAMPTZ NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW()
);
```

### 2. Firecrawl API Key

Get your API key at https://firecrawl.dev

### 3. Install the MCP

When installing, fill in (these values are consumed as sketched below):
- `firecrawlApiKey`: Your Firecrawl API key
- `supabaseUrl`: Your Supabase project URL (e.g., https://xxx.supabase.co)
- `supabaseKey`: Service role key, or an anon key with RLS configured
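
A sketch of how the Supabase values are typically consumed (hypothetical wiring; the actual client setup lives in `server/lib/supabase.ts` and may differ):

```typescript
import { createClient } from "@supabase/supabase-js";

// Hypothetical wiring: both values come from the MCP configuration above.
declare const supabaseUrl: string; // e.g. "https://xxx.supabase.co"
declare const supabaseKey: string; // service role or anon key

const supabase = createClient(supabaseUrl, supabaseKey);
```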

## Available Tools

### `process_urls`

Processes a list of blog URLs:
- Extracts clean content using Firecrawl
- Generates a unique fingerprint (SHA-256 of normalized title + body)
- Checks whether the URL already exists in Supabase
- Saves new content, or updates the record if the fingerprint changed
- Returns an insight-focused summary

**Input:**
```json
{
  "urls": ["https://blog.example.com/post-1", "https://blog.example.com/post-2"],
  "generateSummaries": true
}
}
```

**Output:**
```json
{
  "processed": [
    {
      "url": "https://blog.example.com/post-1",
      "status": "new",
      "title": "Post Title",
      "summary": "Key insights from the post...",
      "fingerprint": "abc123...",
      "domain": "blog.example.com"
    }
  ],
  "stats": {
    "total": 2,
    "new": 1,
    "updated": 0,
    "unchanged": 1,
    "errors": 0
  }
}
```

### `check_updates`

Checks the status of previously processed URLs without re-extracting:

**Input:**
```json
{
"domain": "blog.example.com"
}
```
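
A hedged sketch of the lookup this performs, assuming the `supabase` client from the configuration sketch above and the `blog_content` table from step 1; no Firecrawl calls are made:

```typescript
// Read stored records for the domain; nothing is re-extracted.
const { data, error } = await supabase
  .from("blog_content")
  .select("url, title, fingerprint, last_seen_at, updated_count")
  .eq("domain", "blog.example.com");
```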

### `get_watermarks`

Gets the watermark (last processed time) for a domain:

**Input:**
```json
{
"domain": "blog.example.com"
}
```
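
Likewise a hedged sketch, assuming the same `supabase` client and the `blog_watermarks` table from step 1:

```typescript
// Read the per-domain watermark written after each processing run.
const { data } = await supabase
  .from("blog_watermarks")
  .select("domain, last_processed_at")
  .eq("domain", "blog.example.com");
```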

## Deduplication Logic

1. **Normalization**: title and body are normalized (lowercased, whitespace collapsed, Unicode normalized)
2. **Fingerprint**: SHA-256 of the normalized `title|body` text (sketched below)
3. **Check**:
   - If the URL does not exist → content is **new**
   - If the URL exists but the fingerprint differs → **update**
   - If the URL exists and the fingerprint is identical → **skip**
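
The fingerprint step mirrors `generateFingerprint` in `server/lib/content.ts`; a minimal sketch using the Web Crypto API available in Cloudflare Workers:

```typescript
// Normalize both fields, then SHA-256 the "title|body" string (hex-encoded).
const normalize = (s: string) =>
  s.normalize("NFKC").replace(/\s+/g, " ").trim().toLowerCase();

async function fingerprint(title: string, body: string): Promise<string> {
  const data = new TextEncoder().encode(`${normalize(title)}|${normalize(body)}`);
  const digest = await crypto.subtle.digest("SHA-256", data);
  return Array.from(new Uint8Array(digest))
    .map((b) => b.toString(16).padStart(2, "0"))
    .join("");
}
```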

## Development

```bash
cd blog-mcp
bun install
bun run dev    # Local development
bun run deploy # Deploy to production
```

## Architecture

```
blog-mcp/
├── server/
│   ├── main.ts           # Entry point and StateSchema
│   ├── lib/
│   │   ├── firecrawl.ts  # Firecrawl API client
│   │   ├── supabase.ts   # Supabase client for persistence
│   │   ├── content.ts    # Normalization, fingerprinting, summaries
│   │   └── types.ts      # Shared types
│   └── tools/
│       ├── index.ts      # Exports all tools
│       └── blog.ts       # Content-processing tools
├── shared/
│   └── deco.gen.ts       # Generated types
├── package.json
├── wrangler.toml
└── tsconfig.json
```

36 changes: 36 additions & 0 deletions blog-mcp/package.json
@@ -0,0 +1,36 @@
{
  "name": "mcp-blog",
  "version": "1.0.0",
  "description": "Blog content extraction and deduplication MCP using Firecrawl and Supabase",
  "private": true,
  "type": "module",
  "scripts": {
    "dev": "deco dev --vite",
    "configure": "deco configure",
    "gen": "deco gen --output=shared/deco.gen.ts",
    "deploy": "npm run build && deco deploy ./dist/server",
    "check": "tsc --noEmit",
    "build": "bun --bun vite build"
  },
  "dependencies": {
    "@decocms/runtime": "0.25.1",
    "@supabase/supabase-js": "^2.49.0",
    "zod": "^3.24.3"
  },
  "devDependencies": {
    "@cloudflare/vite-plugin": "^1.13.4",
    "@cloudflare/workers-types": "^4.20251014.0",
    "@decocms/mcps-shared": "1.0.0",
    "@mastra/core": "^0.24.0",
    "@modelcontextprotocol/sdk": "^1.21.0",
    "@types/mime-db": "^1.43.6",
    "deco-cli": "^0.26.0",
    "typescript": "^5.7.2",
    "vite": "7.2.0",
    "wrangler": "^4.28.0"
  },
  "engines": {
    "node": ">=22.0.0"
  }
}

119 changes: 119 additions & 0 deletions blog-mcp/server/lib/content.ts
@@ -0,0 +1,119 @@
/**
* Content processing utilities.
* Handles normalization, fingerprinting, and summary generation.
*/

/**
* Normalize text content by:
* - Trimming whitespace
* - Normalizing Unicode
* - Removing excessive whitespace
* - Converting to lowercase for comparison
*/
export function normalizeText(text: string): string {
  return text
    .normalize("NFKC") // Normalize Unicode
    .replace(/\s+/g, " ") // Collapse whitespace
    .trim()
    .toLowerCase();
}

/**
* Generate a fingerprint (hash) from title + body.
* Uses a simple but effective hash for deduplication.
*/
export async function generateFingerprint(
  title: string,
  body: string,
): Promise<string> {
  const normalized = normalizeText(title) + "|" + normalizeText(body);

  // Use Web Crypto API (available in Cloudflare Workers)
  const encoder = new TextEncoder();
  const data = encoder.encode(normalized);
  const hashBuffer = await crypto.subtle.digest("SHA-256", data);
  const hashArray = Array.from(new Uint8Array(hashBuffer));
  return hashArray.map((b) => b.toString(16).padStart(2, "0")).join("");
}

/**
* Extract key sentences for summary generation.
* Simple extractive approach - first N sentences that are meaningful.
*/
function extractKeySentences(text: string, maxSentences = 3): string[] {
  // Split by sentence-ending punctuation
  const sentences = text
    .split(/[.!?]+/)
    .map((s) => s.trim())
    .filter((s) => s.length > 20); // Only meaningful sentences

  return sentences.slice(0, maxSentences);
}

/**
* Generate a short summary focused on insights.
* Uses extractive summarization for simplicity.
*/
export function generateSummary(
  _title: string,
  body: string,
  maxLength = 300,
): string {
  const keySentences = extractKeySentences(body);

  if (keySentences.length === 0) {
    // Fallback to first N characters of body
    return (
      body.substring(0, maxLength).trim() +
      (body.length > maxLength ? "..." : "")
    );
  }

  const summary = keySentences.join(". ") + ".";

  if (summary.length <= maxLength) {
    return summary;
  }

  return summary.substring(0, maxLength - 3).trim() + "...";
}

/**
* Process extracted content into a normalized format
*/
export interface ProcessedContent {
  url: string;
  domain: string;
  title: string;
  body: string;
  author: string | null;
  publishedDate: string | null;
  fingerprint: string;
  summary: string;
  normalizedTitle: string;
}

export async function processContent(
  url: string,
  domain: string,
  title: string,
  body: string,
  author: string | null,
  publishedDate: string | null,
): Promise<ProcessedContent> {
  const fingerprint = await generateFingerprint(title, body);
  const summary = generateSummary(title, body);
  const normalizedTitle = normalizeText(title);

  return {
    url,
    domain,
    title,
    body,
    author,
    publishedDate,
    fingerprint,
    summary,
    normalizedTitle,
  };
}