Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
14 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion bun.lock
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
{
"lockfileVersion": 1,
"configVersion": 1,
"workspaces": {
"": {
"name": "@decocms/mcps",
Expand Down Expand Up @@ -34,6 +33,27 @@
"wrangler": "^4.28.0",
},
},
"blog-mcp": {
"name": "mcp-blog",
"version": "1.0.0",
"dependencies": {
"@decocms/runtime": "0.25.1",
"@supabase/supabase-js": "^2.49.0",
"zod": "^3.24.3",
},
"devDependencies": {
"@cloudflare/vite-plugin": "^1.13.4",
"@cloudflare/workers-types": "^4.20251014.0",
"@decocms/mcps-shared": "1.0.0",
"@mastra/core": "^0.24.0",
"@modelcontextprotocol/sdk": "^1.21.0",
"@types/mime-db": "^1.43.6",
"deco-cli": "^0.26.0",
"typescript": "^5.7.2",
"vite": "7.2.0",
"wrangler": "^4.28.0",
},
},
"data-for-seo": {
"name": "data-for-seo",
"version": "1.0.0",
Expand Down Expand Up @@ -1978,6 +1998,8 @@

"math-intrinsics": ["[email protected]", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="],

"mcp-blog": ["mcp-blog@workspace:blog-mcp"],

"mcp-studio": ["mcp-studio@workspace:mcp-studio"],

"mcp-template-minimal": ["mcp-template-minimal@workspace:template-minimal"],
Expand Down Expand Up @@ -2790,6 +2812,12 @@

"log-symbols/chalk": ["[email protected]", "", { "dependencies": { "ansi-styles": "^4.1.0", "supports-color": "^7.1.0" } }, "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA=="],

"mcp-blog/@modelcontextprotocol/sdk": ["@modelcontextprotocol/[email protected]", "", { "dependencies": { "@hono/node-server": "^1.19.7", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "content-type": "^1.0.5", "cors": "^2.8.5", "cross-spawn": "^7.0.5", "eventsource": "^3.0.2", "eventsource-parser": "^3.0.0", "express": "^5.0.1", "express-rate-limit": "^7.5.0", "jose": "^6.1.1", "json-schema-typed": "^8.0.2", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", "zod": "^3.25 || ^4.0", "zod-to-json-schema": "^3.25.0" }, "peerDependencies": { "@cfworker/json-schema": "^4.1.1" }, "optionalPeers": ["@cfworker/json-schema"] }, "sha512-yO28oVFFC7EBoiKdAn+VqRm+plcfv4v0xp6osG/VsCB0NlPZWi87ajbCZZ8f/RvOFLEu7//rSRmuZZ7lMoe3gQ=="],

"mcp-blog/deco-cli": ["[email protected]", "", { "dependencies": { "@deco-cx/warp-node": "0.3.16", "@modelcontextprotocol/sdk": "^1.19.1", "@supabase/ssr": "0.6.1", "@supabase/supabase-js": "2.50.0", "chalk": "^5.3.0", "commander": "^12.0.0", "glob": "^10.3.10", "ignore": "^7.0.5", "inquirer": "^9.2.15", "inquirer-search-checkbox": "^1.0.0", "inquirer-search-list": "^1.2.6", "jose": "^6.0.11", "json-schema-to-typescript": "^15.0.4", "object-hash": "^3.0.0", "prettier": "^3.6.2", "semver": "^7.6.0", "smol-toml": "^1.3.4", "ws": "^8.16.0", "zod": "^3.25.76" }, "bin": { "deco": "dist/cli.js", "deconfig": "dist/deconfig.js" } }, "sha512-fkYKYO81cK3NE4hb3zcPdMksKJiYM2mon0lKGBuvEOruVUfbhK0I7V777NZDrmaxVQXxDx0fa9i6fARjxT7muQ=="],

"mcp-blog/zod": ["[email protected]", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="],

"mcp-studio/@decocms/runtime": ["@decocms/[email protected]", "", { "dependencies": { "@ai-sdk/provider": "^2.0.0", "@cloudflare/workers-types": "^4.20250617.0", "@decocms/bindings": "1.0.3", "@modelcontextprotocol/sdk": "1.25.1", "hono": "^4.10.7", "jose": "^6.0.11", "zod": "^4.0.0" } }, "sha512-+kacx94Oj1zNetWkg6aRDdAUaAIqXufP1T6j6JqnDRjRCpZeSkW8GU1Sp2mfCw4KDo/XbeB5jPzFKSHfUKH8JQ=="],

"mcp-studio/@modelcontextprotocol/sdk": ["@modelcontextprotocol/[email protected]", "", { "dependencies": { "@hono/node-server": "^1.19.7", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "content-type": "^1.0.5", "cors": "^2.8.5", "cross-spawn": "^7.0.5", "eventsource": "^3.0.2", "eventsource-parser": "^3.0.0", "express": "^5.0.1", "express-rate-limit": "^7.5.0", "jose": "^6.1.1", "json-schema-typed": "^8.0.2", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", "zod": "^3.25 || ^4.0", "zod-to-json-schema": "^3.25.0" }, "peerDependencies": { "@cfworker/json-schema": "^4.1.1" }, "optionalPeers": ["@cfworker/json-schema"] }, "sha512-yO28oVFFC7EBoiKdAn+VqRm+plcfv4v0xp6osG/VsCB0NlPZWi87ajbCZZ8f/RvOFLEu7//rSRmuZZ7lMoe3gQ=="],
Expand Down Expand Up @@ -3118,6 +3146,12 @@

"log-symbols/chalk/supports-color": ["[email protected]", "", { "dependencies": { "has-flag": "^4.0.0" } }, "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw=="],

"mcp-blog/@modelcontextprotocol/sdk/ajv": ["[email protected]", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g=="],

"mcp-blog/@modelcontextprotocol/sdk/zod": ["[email protected]", "", {}, "sha512-k7Nwx6vuWx1IJ9Bjuf4Zt1PEllcwe7cls3VNzm4CQ1/hgtFUK2bRNG3rvnpPUhFjmqJKAKtjV576KnUkHocg/g=="],

"mcp-blog/deco-cli/@supabase/supabase-js": ["@supabase/[email protected]", "", { "dependencies": { "@supabase/auth-js": "2.70.0", "@supabase/functions-js": "2.4.4", "@supabase/node-fetch": "2.6.15", "@supabase/postgrest-js": "1.19.4", "@supabase/realtime-js": "2.11.10", "@supabase/storage-js": "2.7.1" } }, "sha512-M1Gd5tPaaghYZ9OjeO1iORRqbTWFEz/cF3pPubRnMPzA+A8SiUsXXWDP+DWsASZcjEcVEcVQIAF38i5wrijYOg=="],

"mcp-studio/@decocms/runtime/@ai-sdk/provider": ["@ai-sdk/[email protected]", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="],

"mcp-studio/@decocms/runtime/@decocms/bindings": ["@decocms/[email protected]", "", { "dependencies": { "@modelcontextprotocol/sdk": "1.25.1", "zod": "^3.25.76", "zod-from-json-schema": "^0.0.5" } }, "sha512-0qGrAcH74Td9Ruhx7SI31o9mvKlMeQGtiRf5BzDcSgG0cvgJhaMMSvz72tvbUVl77GLu93v02NlKupui8yeiMw=="],
Expand Down Expand Up @@ -3392,6 +3426,18 @@

"inquirer-search-list/inquirer/strip-ansi/ansi-regex": ["[email protected]", "", {}, "sha512-+O9Jct8wf++lXxxFc4hc8LsjaSq0HFzzL7cVsw8pRDIPdjKD2mT4ytDZlLuSBZ4cLKZFXIrMGO7DbQCtMJJMKw=="],

"mcp-blog/@modelcontextprotocol/sdk/ajv/json-schema-traverse": ["[email protected]", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="],

"mcp-blog/deco-cli/@supabase/supabase-js/@supabase/auth-js": ["@supabase/[email protected]", "", { "dependencies": { "@supabase/node-fetch": "^2.6.14" } }, "sha512-BaAK/tOAZFJtzF1sE3gJ2FwTjLf4ky3PSvcvLGEgEmO4BSBkwWKu8l67rLLIBZPDnCyV7Owk2uPyKHa0kj5QGg=="],

"mcp-blog/deco-cli/@supabase/supabase-js/@supabase/functions-js": ["@supabase/[email protected]", "", { "dependencies": { "@supabase/node-fetch": "^2.6.14" } }, "sha512-WL2p6r4AXNGwop7iwvul2BvOtuJ1YQy8EbOd0dhG1oN1q8el/BIRSFCFnWAMM/vJJlHWLi4ad22sKbKr9mvjoA=="],

"mcp-blog/deco-cli/@supabase/supabase-js/@supabase/postgrest-js": ["@supabase/[email protected]", "", { "dependencies": { "@supabase/node-fetch": "^2.6.14" } }, "sha512-O4soKqKtZIW3olqmbXXbKugUtByD2jPa8kL2m2c1oozAO11uCcGrRhkZL0kVxjBLrXHE0mdSkFsMj7jDSfyNpw=="],

"mcp-blog/deco-cli/@supabase/supabase-js/@supabase/realtime-js": ["@supabase/[email protected]", "", { "dependencies": { "@supabase/node-fetch": "^2.6.13", "@types/phoenix": "^1.6.6", "@types/ws": "^8.18.1", "ws": "^8.18.2" } }, "sha512-SJKVa7EejnuyfImrbzx+HaD9i6T784khuw1zP+MBD7BmJYChegGxYigPzkKX8CK8nGuDntmeSD3fvriaH0EGZA=="],

"mcp-blog/deco-cli/@supabase/supabase-js/@supabase/storage-js": ["@supabase/[email protected]", "", { "dependencies": { "@supabase/node-fetch": "^2.6.14" } }, "sha512-asYHcyDR1fKqrMpytAS1zjyEfvxuOIp1CIXX7ji4lHHcJKqyk+sLl/Vxgm4sN6u8zvuUtae9e4kDxQP2qrwWBA=="],

"mcp-studio/@decocms/runtime/@decocms/bindings/zod": ["[email protected]", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="],

"mcp-studio/@modelcontextprotocol/sdk/ajv/json-schema-traverse": ["[email protected]", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="],
Expand Down Expand Up @@ -3602,6 +3648,8 @@

"inquirer-search-list/inquirer/cli-cursor/restore-cursor/signal-exit": ["[email protected]", "", {}, "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ=="],

"mcp-blog/deco-cli/@supabase/supabase-js/@supabase/realtime-js/ws": ["[email protected]", "", { "peerDependencies": { "bufferutil": "^4.0.1", "utf-8-validate": ">=5.0.2" }, "optionalPeers": ["bufferutil", "utf-8-validate"] }, "sha512-blAT2mjOEIi0ZzruJfIhb3nps74PRWTCz1IjglWEEpQl5XS/UNama6u2/rjFkDDouqr4L67ry+1aGIALViWjDg=="],

"perplexity/@decocms/runtime/@mastra/core/@ai-sdk/anthropic-v5/@ai-sdk/provider": ["@ai-sdk/[email protected]", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="],

"perplexity/@decocms/runtime/@mastra/core/@ai-sdk/anthropic-v5/@ai-sdk/provider-utils": ["@ai-sdk/[email protected]", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-T1gZ76gEIwffep6MWI0QNy9jgoybUHE7TRaHB5k54K8mF91ciGFlbtCGxDYhMH3nCRergKwYFIDeFF0hJSIQHQ=="],
Expand Down
158 changes: 158 additions & 0 deletions content-scraper/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Content Scraper MCP

MCP para extração, deduplicação e sumarização de conteúdo web usando Firecrawl e Supabase.

## Funcionalidades

- **Extração de Conteúdo**: Usa Firecrawl para extrair title, body, author e date de URLs
- **Deduplicação por Fingerprint**: Gera hash SHA-256 de title + body para identificar conteúdo único
- **Persistência de Estado**: Armazena registros no Supabase para evitar reprocessamento
- **Watermarks por Domínio**: Rastreia última vez que cada domínio foi processado
- **Resumos Focados em Insights**: Gera resumos curtos extraindo frases-chave

## Configuração

### 1. Supabase - Criar Tabelas

Execute no SQL Editor do seu projeto Supabase:

```sql
-- Processed-content table: one row per scraped URL (url is UNIQUE).
-- IF NOT EXISTS makes the script safe to re-run in the SQL Editor.
CREATE TABLE IF NOT EXISTS scraped_content (
  id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
  -- The UNIQUE constraint automatically creates a unique index on url,
  -- so no separate CREATE INDEX on this column is needed.
  url TEXT UNIQUE NOT NULL,
  -- SHA-256 of the normalized `title|body` text, used for deduplication.
  fingerprint TEXT NOT NULL,
  domain TEXT NOT NULL,
  title TEXT NOT NULL,
  first_seen_at TIMESTAMPTZ NOT NULL,
  last_seen_at TIMESTAMPTZ NOT NULL,
  -- Presumably incremented each time the fingerprint changes; verify against the tool code.
  updated_count INTEGER DEFAULT 0,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for lookups by domain and by fingerprint.
-- (The former idx_scraped_content_url was redundant: it duplicated the
-- index already backing the UNIQUE constraint on url.)
CREATE INDEX IF NOT EXISTS idx_scraped_content_domain ON scraped_content(domain);
CREATE INDEX IF NOT EXISTS idx_scraped_content_fingerprint ON scraped_content(fingerprint);

-- Per-domain watermark table: when each domain was last processed.
CREATE TABLE IF NOT EXISTS scraper_watermarks (
  domain TEXT PRIMARY KEY,
  last_processed_at TIMESTAMPTZ NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW()
);
```

### 2. Firecrawl API Key

Obtenha sua API key em https://firecrawl.dev

### 3. Instalar o MCP

Ao instalar, preencha:
- `firecrawlApiKey`: Sua chave de API do Firecrawl
- `supabaseUrl`: URL do seu projeto Supabase (ex: https://xxx.supabase.co)
- `supabaseKey`: Service role key ou anon key com RLS configurado

## Tools Disponíveis

### `process_urls`

Processa uma lista de URLs:
- Extrai conteúdo limpo usando Firecrawl
- Gera fingerprint único (SHA-256 de title + body normalizado)
- Verifica se já existe no Supabase
- Salva novo conteúdo ou atualiza se fingerprint mudou
- Retorna resumo focado em insights

**Input:**
```json
{
"urls": ["https://example.com/article-1", "https://example.com/article-2"],
"generateSummaries": true
}
```

**Output:**
```json
{
"processed": [
{
"url": "https://example.com/article-1",
"status": "new",
"title": "Article Title",
"summary": "Key insights from the content...",
"fingerprint": "abc123...",
"domain": "example.com"
}
],
"stats": {
"total": 2,
"new": 1,
"updated": 0,
"unchanged": 1,
"errors": 0
}
}
```

### `check_updates`

Verifica status de URLs processadas anteriormente sem re-extrair:

**Input:**
```json
{
"domain": "example.com"
}
```

### `get_watermarks`

Obtém watermarks (última vez processada) por domínio:

**Input:**
```json
{
"domain": "example.com"
}
```

## Lógica de Deduplicação

1. **Normalização**: title e body são normalizados (lowercase, whitespace colapsado, Unicode normalizado)
2. **Fingerprint**: SHA-256 do texto normalizado `title|body`
3. **Verificação**:
- Se URL não existe → conteúdo **novo**
- Se URL existe mas fingerprint diferente → **update**
- Se URL existe e fingerprint igual → **ignorar**

## Desenvolvimento

```bash
cd content-scraper
bun install
bun run dev # Desenvolvimento local
bun run deploy # Deploy para produção
```

## Arquitetura

```
content-scraper/
├── server/
│ ├── main.ts # Entry point e StateSchema
│ ├── lib/
│ │ ├── firecrawl.ts # Cliente Firecrawl API
│ │ ├── supabase.ts # Cliente Supabase para persistência
│ │ ├── content.ts # Normalização, fingerprint, resumo
│ │ └── types.ts # Tipos compartilhados
│ └── tools/
│ ├── index.ts # Exporta todas as tools
│ └── scraper.ts # Tools de processamento
├── shared/
│ └── deco.gen.ts # Tipos gerados
├── package.json
├── wrangler.toml
└── tsconfig.json
```
36 changes: 36 additions & 0 deletions content-scraper/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"name": "mcp-content-scraper",
"version": "1.0.0",
"description": "Content extraction, deduplication and summarization MCP using Firecrawl and Supabase",
"private": true,
"type": "module",
"scripts": {
"dev": "deco dev --vite",
"configure": "deco configure",
"gen": "deco gen --output=shared/deco.gen.ts",
"deploy": "npm run build && deco deploy ./dist/server",
"check": "tsc --noEmit",
"build": "bun --bun vite build"
},
"dependencies": {
"@decocms/runtime": "0.25.1",
"@supabase/supabase-js": "^2.49.0",
"zod": "^3.24.3"
},
"devDependencies": {
"@cloudflare/vite-plugin": "^1.13.4",
"@cloudflare/workers-types": "^4.20251014.0",
"@decocms/mcps-shared": "1.0.0",
"@mastra/core": "^0.24.0",
"@modelcontextprotocol/sdk": "^1.21.0",
"@types/mime-db": "^1.43.6",
"deco-cli": "^0.26.0",
"typescript": "^5.7.2",
"vite": "7.2.0",
"wrangler": "^4.28.0"
},
"engines": {
"node": ">=22.0.0"
}
}

Loading
Loading