Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# Meilisearch Configuration
VITE_MEILISEARCH_HOST=https://search.bettergov.ph
VITE_MEILISEARCH_PORT=443
VITE_MEILISEARCH_SEARCH_API_KEY= # Meilisearch Search API Key
VITE_MEILISEARCH_SEARCH_API_KEY= # Meilisearch Search API Key

# Cloudflare Configuration
BETTERGOV_ACCOUNT_ID= # Cloudflare Account ID
BETTERGOV_BROWSER_KV_ID= # Browser KV Namespace ID
BETTERGOV_FOREX_KV_ID= # Forex KV Namespace ID
BETTERGOV_WEATHER_KV_ID= # Weather KV Namespace ID

# API Keys
WEATHER_API_KEY= # Weather API key
OPENWEATHERMAP_API_KEY= # OpenWeatherMap API key
FOREX_API_KEY= # Forex API key
JINA_API_KEY= # Jina.ai API key
CF_ACCOUNT_ID= # Cloudflare Account ID (for Browser API)
CF_API_TOKEN= # Cloudflare API Token (for Browser API)
251 changes: 231 additions & 20 deletions functions/api/crawl.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,104 @@
import { Env } from '../types';
import { fetchAndSaveContent, setDefaultCrawler } from '../lib/crawler';

/**
 * Gatekeeper for crawl targets, used to reduce SSRF exposure.
 *
 * A URL is accepted only when all of the following hold:
 *  - it parses as an absolute URL,
 *  - its scheme is http or https,
 *  - its host is not localhost, 127.0.0.1 or 0.0.0.0,
 *  - its host is not a dotted-quad IPv4 literal,
 *  - its host ends with ".gov.ph".
 *
 * @param url The URL to validate
 * @returns true when the URL passes every check, false otherwise
 *          (including when it cannot be parsed)
 */
function isValidUrl(url: string): boolean {
  try {
    const { protocol, hostname: host } = new URL(url);

    // Scheme allow-list: plain web traffic only.
    const schemeAllowed = protocol === 'http:' || protocol === 'https:';

    // Explicit loopback / unspecified hosts.
    const isLocalHost =
      host === 'localhost' || host === '127.0.0.1' || host === '0.0.0.0';

    // Raw IPv4 literals are refused; only domain names may be crawled.
    const isIPv4Literal = /^(\d{1,3}\.){3}\d{1,3}$/.test(host);

    // Government services live under the .gov.ph suffix.
    const isGovPhHost = host.endsWith('.gov.ph');

    return schemeAllowed && !isLocalHost && !isIPv4Literal && isGovPhHost;
  } catch {
    // new URL() throws on unparseable input.
    return false;
  }
}

/**
 * Simple fixed-window rate limiting using KV storage.
 *
 * Each client IP gets one entry under `rate_limit:<ip>` holding the request
 * count and the epoch-millisecond time at which the current window ends.
 * Up to 10 requests are allowed per 1-minute window.
 *
 * Every write carries an `expirationTtl` so stale entries are eventually
 * removed from the namespace. A stored entry that is missing, unparseable or
 * of the wrong shape is treated as "no window yet" and is overwritten.
 *
 * The limiter deliberately fails open: if the KV read or write throws, the
 * error is logged and the request is allowed.
 *
 * NOTE(review): the read-modify-write below is not atomic, so concurrent
 * requests from one IP may be undercounted - confirm this is acceptable.
 *
 * @param env Environment variables (reads and writes `env.BROWSER_KV`)
 * @param clientIP Client IP address used as the rate-limit key
 * @returns Whether request is allowed
 */
async function checkRateLimit(env: Env, clientIP: string): Promise<boolean> {
  const rateLimitKey = `rate_limit:${clientIP}`;
  const now = Date.now();
  const windowMs = 60 * 1000; // 1 minute window
  const maxRequests = 10; // Max 10 requests per minute
  // Workers KV documents 60 seconds as the smallest accepted expirationTtl.
  const minTtlSeconds = 60;

  try {
    const kv = env.BROWSER_KV;
    const existing = await kv.get(rateLimitKey);

    // Defaults describe "no active window"; resetTime 0 is always in the past.
    let count = 0;
    let resetTime = 0;

    if (existing) {
      try {
        const parsed: unknown = JSON.parse(existing);
        if (typeof parsed === 'object' && parsed !== null) {
          const record = parsed as { count?: unknown; resetTime?: unknown };
          if (
            typeof record.count === 'number' &&
            typeof record.resetTime === 'number'
          ) {
            count = record.count;
            resetTime = record.resetTime;
          }
        }
      } catch {
        // Malformed entry: keep the defaults so a fresh window replaces it.
      }
    }

    if (now > resetTime) {
      // First request from this IP, expired window, or unusable entry.
      await kv.put(
        rateLimitKey,
        JSON.stringify({
          count: 1,
          resetTime: now + windowMs,
        }),
        {
          expirationTtl: Math.max(minTtlSeconds, Math.ceil(windowMs / 1000)),
        }
      );
      return true;
    }

    if (count >= maxRequests) {
      return false; // Rate limited
    }

    // Increment count. The TTL is re-applied here because a write without
    // one would leave the entry with no expiration at all.
    await kv.put(
      rateLimitKey,
      JSON.stringify({
        count: count + 1,
        resetTime,
      }),
      {
        expirationTtl: Math.max(
          minTtlSeconds,
          Math.ceil((resetTime - now) / 1000)
        ),
      }
    );
    return true;
  } catch (error) {
    console.error('Rate limit check failed:', error);
    return true; // Allow request if rate limit fails
  }
}

/**
* Handler for HTTP requests to the web crawling endpoint
* This is a generic interface for crawling web content, currently using Jina.ai
Expand All @@ -15,6 +113,7 @@ export async function onRequest(context: {
// Handle CORS preflight requests
if (request.method === 'OPTIONS') {
return new Response(null, {
status: 204,
headers: {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET, OPTIONS',
Expand Down Expand Up @@ -60,49 +159,161 @@ export async function onRequest(context: {
);
}

// Validate URL before processing
if (!isValidUrl(targetUrl)) {
return new Response(
JSON.stringify({
error: 'Invalid URL',
message: 'Only .gov.ph domains are allowed for crawling',
}),
{
status: 400,
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
},
}
);
}

// Check rate limit
const clientIP = request.headers.get('CF-Connecting-IP') || 'unknown';
const rateLimitAllowed = await checkRateLimit(env, clientIP);

if (!rateLimitAllowed) {
return new Response(
JSON.stringify({
error: 'Rate limit exceeded',
message: 'Maximum 10 requests per minute per IP address',
}),
{
status: 429,
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
'Retry-After': '60', // Retry after 60 seconds
},
}
);
}

// If force update is requested, fetch it
if (forceUpdate) {
const result = await fetchAndSaveContent(env, targetUrl, crawler);
try {
const result = await fetchAndSaveContent(env, targetUrl, crawler);

if (!result.success) {
// Return the response with CORS headers
return new Response(
JSON.stringify({
...result,
crawler: crawler || 'default',
}),
{
status: 500,
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
},
}
);
}

if (!result.success) {
// Return the response with CORS headers
return new Response(
JSON.stringify({
...result,
...result.data,
source: 'crawler',
crawler: crawler || 'default',
}),
{
status: 500,
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
},
}
);
} catch (error) {
console.error('Crawl error:', error);
return new Response(
JSON.stringify({
error: 'Crawl operation failed',
message: error instanceof Error ? error.message : 'Unknown error',
details:
'The crawl service may be temporarily unavailable or misconfigured. Please try again later.',
}),
{
status: 503,
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
},
}
);
}
} else {
// Try to get existing content from database
try {
const existingContent = await getContentByUrl(env, targetUrl, crawler);

return new Response(
JSON.stringify({
...result.data,
source: 'crawler',
crawler: crawler || 'default',
}),
{
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
},
if (existingContent) {
return new Response(
JSON.stringify({
...existingContent,
source: 'database',
crawler: crawler || 'default',
}),
{
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
},
}
);
} else {
return new Response(
JSON.stringify({
error: 'Content not found',
message:
'No cached content available for this URL. Use ?force=true to crawl the content.',
url: targetUrl,
}),
{
status: 404,
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
},
}
);
}
);
} catch (error) {
console.error('Database error:', error);
return new Response(
JSON.stringify({
error: 'Database operation failed',
message: error instanceof Error ? error.message : 'Unknown error',
details: 'The database service may be temporarily unavailable.',
}),
{
status: 503,
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
},
}
);
}
}
} catch (error) {
console.error('Crawl endpoint error:', error);
return new Response(
JSON.stringify({
error: (error as Error).message,
status: 'error',
error: 'Request processing failed',
message: error instanceof Error ? error.message : 'Unknown error',
details: 'Please check the URL parameter and try again.',
}),
{
status: 500,
status: 400,
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
Expand Down
1 change: 1 addition & 0 deletions functions/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ export default {
// Handle OPTIONS requests for CORS
if (request.method === 'OPTIONS') {
return new Response(null, {
status: 204,
headers: corsHeaders,
});
}
Expand Down
Loading