50 changes: 49 additions & 1 deletion bun.lock

Some generated files are not rendered by default.

158 changes: 158 additions & 0 deletions content-scraper/README.md
@@ -0,0 +1,158 @@
# Content Scraper MCP

MCP for web content extraction, deduplication, and summarization using Firecrawl and Supabase.

## Features

- **Content Extraction**: Uses Firecrawl to extract the title, body, author, and date from URLs
- **Fingerprint Deduplication**: Generates a SHA-256 hash of title + body to identify unique content
- **State Persistence**: Stores records in Supabase to avoid reprocessing
- **Per-Domain Watermarks**: Tracks the last time each domain was processed
- **Insight-Focused Summaries**: Generates short summaries by extracting key phrases

## Configuration

### 1. Supabase - Create Tables

Run the following in your Supabase project's SQL Editor:

```sql
-- Processed content table
CREATE TABLE scraped_content (
  id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
  url TEXT UNIQUE NOT NULL,
  fingerprint TEXT NOT NULL,
  domain TEXT NOT NULL,
  title TEXT NOT NULL,
  first_seen_at TIMESTAMPTZ NOT NULL,
  last_seen_at TIMESTAMPTZ NOT NULL,
  updated_count INTEGER DEFAULT 0,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for query performance
CREATE INDEX idx_scraped_content_domain ON scraped_content(domain);
CREATE INDEX idx_scraped_content_fingerprint ON scraped_content(fingerprint);
CREATE INDEX idx_scraped_content_url ON scraped_content(url);

-- Per-domain watermark table
CREATE TABLE scraper_watermarks (
  domain TEXT PRIMARY KEY,
  last_processed_at TIMESTAMPTZ NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW()
);
```
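
For reference, here is a minimal sketch of how these tables might be written to with supabase-js. Column values follow the DDL above, but the actual persistence code is described as living in `server/lib/supabase.ts`, which is not part of this diff, so the client setup and calls shown here are illustrative:

```typescript
import { createClient } from "@supabase/supabase-js";

// Assumed values: in the MCP these come from the install-time configuration
// (supabaseUrl / supabaseKey, see step 3 below).
const supabase = createClient("https://xxx.supabase.co", "SERVICE_ROLE_KEY");

const now = new Date().toISOString();

// Insert a scraped record, or refresh it when the unique url already exists.
await supabase.from("scraped_content").upsert(
  {
    url: "https://example.com/article-1",
    fingerprint: "abc123...",
    domain: "example.com",
    title: "Article Title",
    first_seen_at: now,
    last_seen_at: now,
  },
  { onConflict: "url" },
);

// Advance the per-domain watermark (domain is the primary key).
await supabase.from("scraper_watermarks").upsert({
  domain: "example.com",
  last_processed_at: now,
});
```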

### 2. Firecrawl API Key

Get your API key at https://firecrawl.dev

### 3. Install the MCP

When installing, fill in:
- `firecrawlApiKey`: Your Firecrawl API key
- `supabaseUrl`: Your Supabase project URL (e.g. https://xxx.supabase.co)
- `supabaseKey`: Service role key, or an anon key with RLS configured

## Available Tools

### `process_urls`

Processes a list of URLs:
- Extracts clean content using Firecrawl
- Generates a unique fingerprint (SHA-256 of the normalized title + body)
- Checks whether the content already exists in Supabase (see the status sketch after the output example)
- Saves new content, or updates the record if the fingerprint changed
- Returns an insight-focused summary

**Input:**
```json
{
  "urls": ["https://example.com/article-1", "https://example.com/article-2"],
  "generateSummaries": true
}
```

**Output:**
```json
{
  "processed": [
    {
      "url": "https://example.com/article-1",
      "status": "new",
      "title": "Article Title",
      "summary": "Key insights from the content...",
      "fingerprint": "abc123...",
      "domain": "example.com"
    }
  ],
  "stats": {
    "total": 2,
    "new": 1,
    "updated": 0,
    "unchanged": 1,
    "errors": 0
  }
}
```
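
The `status` values above (`new`, `updated`, `unchanged`) follow the rules described in the "Deduplication Logic" section below. A rough sketch of that decision, assuming a supabase-js client and a helper name (`classify`) that are not taken from the actual code:

```typescript
import type { SupabaseClient } from "@supabase/supabase-js";

type Status = "new" | "updated" | "unchanged";

// Compare a freshly computed fingerprint against the stored record for a URL.
async function classify(
  supabase: SupabaseClient,
  url: string,
  fingerprint: string,
): Promise<Status> {
  const { data, error } = await supabase
    .from("scraped_content")
    .select("fingerprint")
    .eq("url", url)
    .maybeSingle();

  if (error) throw error;
  if (!data) return "new";
  return data.fingerprint === fingerprint ? "unchanged" : "updated";
}
```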

### `check_updates`

Checks the status of previously processed URLs without re-scraping them:

**Input:**
```json
{
  "domain": "example.com"
}
```

### `get_watermarks`

Gets the watermarks (last processed time) per domain:

**Input:**
```json
{
  "domain": "example.com"
}
```

## Deduplication Logic

1. **Normalization**: title and body are normalized (lowercased, whitespace collapsed, Unicode normalized)
2. **Fingerprint**: SHA-256 of the normalized `title|body` text (see the sketch below)
3. **Check**:
   - URL not seen before → content is **new**
   - URL exists but the fingerprint differs → **update**
   - URL exists and the fingerprint matches → **skip**
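
A minimal sketch of the fingerprinting step, using the Web Crypto API available in Workers and Bun. The actual helpers live in `server/lib/content.ts`, which is not part of this diff, so the function names here are illustrative:

```typescript
// Lowercase, collapse whitespace, and Unicode-normalize before hashing.
// (NFKC is an assumption; the README only says the text is Unicode-normalized.)
const normalize = (text: string): string =>
  text.normalize("NFKC").toLowerCase().replace(/\s+/g, " ").trim();

// SHA-256 over the normalized "title|body" pair, hex-encoded.
export async function fingerprint(title: string, body: string): Promise<string> {
  const input = `${normalize(title)}|${normalize(body)}`;
  const digest = await crypto.subtle.digest(
    "SHA-256",
    new TextEncoder().encode(input),
  );
  return [...new Uint8Array(digest)]
    .map((byte) => byte.toString(16).padStart(2, "0"))
    .join("");
}
```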

## Development

```bash
cd content-scraper
bun install
bun run dev     # Local development
bun run deploy  # Deploy to production
```

## Architecture

```
content-scraper/
├── server/
│   ├── main.ts              # Entry point and StateSchema
│   ├── lib/
│   │   ├── firecrawl.ts     # Firecrawl API client
│   │   ├── supabase.ts      # Supabase client for persistence
│   │   ├── content.ts       # Normalization, fingerprint, summary
│   │   └── types.ts         # Shared types
│   └── tools/
│       ├── index.ts         # Exports all tools
│       └── scraper.ts       # Processing tools
├── shared/
│   └── deco.gen.ts          # Generated types
├── package.json
├── wrangler.toml
└── tsconfig.json
```
36 changes: 36 additions & 0 deletions content-scraper/package.json
@@ -0,0 +1,36 @@
{
  "name": "mcp-content-scraper",
  "version": "1.0.0",
  "description": "Content extraction, deduplication and summarization MCP using Firecrawl and Supabase",
  "private": true,
  "type": "module",
  "scripts": {
    "dev": "deco dev --vite",
    "configure": "deco configure",
    "gen": "deco gen --output=shared/deco.gen.ts",
    "deploy": "npm run build && deco deploy ./dist/server",
    "check": "tsc --noEmit",
    "build": "bun --bun vite build"
  },
  "dependencies": {
    "@decocms/runtime": "0.25.1",
    "@supabase/supabase-js": "^2.49.0",
    "zod": "^3.24.3"
  },
  "devDependencies": {
    "@cloudflare/vite-plugin": "^1.13.4",
    "@cloudflare/workers-types": "^4.20251014.0",
    "@decocms/mcps-shared": "1.0.0",
    "@mastra/core": "^0.24.0",
    "@modelcontextprotocol/sdk": "^1.21.0",
    "@types/mime-db": "^1.43.6",
    "deco-cli": "^0.26.0",
    "typescript": "^5.7.2",
    "vite": "7.2.0",
    "wrangler": "^4.28.0"
  },
  "engines": {
    "node": ">=22.0.0"
  }
}

27 changes: 27 additions & 0 deletions content-scraper/server/main.ts
@@ -0,0 +1,27 @@
/**
 * Content Scraper MCP
 *
 * Simple MCP that scrapes web content via an n8n webhook.
 */
import { DefaultEnv, withRuntime } from "@decocms/runtime";
import { type Env as DecoEnv, StateSchema } from "../shared/deco.gen.ts";

import { tools } from "./tools/index.ts";

/**
 * This Env type is the main context object that is passed to
 * all of your application's tools.
 */
export type Env = DefaultEnv & DecoEnv;

const runtime = withRuntime<Env, typeof StateSchema>({
  oauth: {
    scopes: [],
    state: StateSchema,
  },
  tools,
  fetch: (req: Request, env: Env) =>
    (env as Env & { ASSETS: { fetch: typeof fetch } }).ASSETS.fetch(req),
});

export default runtime;
12 changes: 12 additions & 0 deletions content-scraper/server/tools/index.ts
@@ -0,0 +1,12 @@
/**
 * Central export point for all tools.
 */
import { userTools } from "@decocms/mcps-shared/tools/user";
import { scraperTools } from "./scraper.ts";

// Export all tools
export const tools = [...userTools, ...scraperTools];

// Re-export domain-specific tools for direct access if needed
export { userTools } from "@decocms/mcps-shared/tools/user";
export { scraperTools } from "./scraper.ts";
60 changes: 60 additions & 0 deletions content-scraper/server/tools/scraper.ts
@@ -0,0 +1,60 @@
/**
 * Content scraping tool via n8n webhook.
 */
import { z } from "zod";
import { createTool } from "@decocms/runtime/mastra";
import type { Env } from "../main.ts";

const N8N_WEBHOOK_URL =
  "https://ventura29.app.n8n.cloud/webhook-test/get-content-scrape";

Review comment from @cubic-dev-ai (bot), Jan 7, 2026, on scraper.ts line 8:
P1: The URL contains `webhook-test`, which is n8n's test endpoint path. Test webhooks in n8n are only active while the workflow editor is open. For production, use the production webhook path (`webhook` instead of `webhook-test`).
✅ Addressed in da4b640

Review comment from @cubic-dev-ai (bot), Jan 7, 2026, on scraper.ts line 9:
P1: The hardcoded webhook URL should be moved to environment configuration. External service endpoints should come from `env` (which is already passed but unused) to support different environments and avoid exposing infrastructure details in source code.
✅ Addressed in da4b640

/**
 * Call the n8n webhook to scrape content from a URL.
 */
export const scrapeContentTool = (env: Env) =>
  createTool({
    id: "scrape_content",
    description:
      "Scrape content from a URL using the n8n workflow. " +
      "Extracts and processes web content through an automated pipeline.",
    inputSchema: z.object({
      url: z.string().url().describe("The URL to scrape content from"),
    }),
    outputSchema: z.object({
      success: z.boolean(),
      data: z.unknown().optional(),
      error: z.string().optional(),
    }),
    execute: async ({ context: input }) => {
      try {
        const url = new URL(N8N_WEBHOOK_URL);
        url.searchParams.set("url", input.url);

        const response = await fetch(url.toString());

        if (!response.ok) {
          return {
            success: false,
            error: `Webhook returned ${response.status}: ${response.statusText}`,
          };
        }

        const data = await response.json();

        return {
          success: true,
          data,
        };
      } catch (error) {
        return {
          success: false,
          error: error instanceof Error ? error.message : "Unknown error",
        };
      }
    },
  });

/**
 * Export all scraper tools.
 */
export const scraperTools = [scrapeContentTool];
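
Regarding the two review comments above: the fix they reference (da4b640) is not included in this diff, so the snippet below is only one possible way to address them, pointing at n8n's production `webhook` path and letting an environment value override the hardcoded URL. The `N8N_WEBHOOK_URL` binding name is an assumption, not something defined in this PR:

```typescript
import type { Env } from "../main.ts";

// Prefer an env-provided endpoint; fall back to the production webhook path
// (not the editor-only "webhook-test" path). The env property is hypothetical.
const resolveWebhookUrl = (env: Env): string =>
  (env as Env & { N8N_WEBHOOK_URL?: string }).N8N_WEBHOOK_URL ??
  "https://ventura29.app.n8n.cloud/webhook/get-content-scrape";

// Inside the tool: const url = new URL(resolveWebhookUrl(env));
```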