feat: add crawling endpoint #40

Merged
merged 2 commits on Jul 1, 2025
99 changes: 99 additions & 0 deletions scrapegraph-js/README.md
@@ -151,6 +151,105 @@ const prompt = 'What is the latest version of Python and what are its main featu
})();
```

### Crawl API

Start a crawl job to extract structured data from a website and its linked pages, using a custom schema.

```javascript
import { crawl, getCrawlRequest } from 'scrapegraph-js';
import 'dotenv/config';

const apiKey = process.env.SGAI_APIKEY;
const url = 'https://scrapegraphai.com/';
const prompt = 'What does the company do? I also need the text content from their privacy and terms pages.';

const schema = {
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "ScrapeGraphAI Website Content",
"type": "object",
"properties": {
"company": {
"type": "object",
"properties": {
"name": { "type": "string" },
"description": { "type": "string" },
"features": { "type": "array", "items": { "type": "string" } },
"contact_email": { "type": "string", "format": "email" },
"social_links": {
"type": "object",
"properties": {
"github": { "type": "string", "format": "uri" },
"linkedin": { "type": "string", "format": "uri" },
"twitter": { "type": "string", "format": "uri" }
},
"additionalProperties": false
}
},
"required": ["name", "description"]
},
"services": {
"type": "array",
"items": {
"type": "object",
"properties": {
"service_name": { "type": "string" },
"description": { "type": "string" },
"features": { "type": "array", "items": { "type": "string" } }
},
"required": ["service_name", "description"]
}
},
"legal": {
"type": "object",
"properties": {
"privacy_policy": { "type": "string" },
"terms_of_service": { "type": "string" }
},
"required": ["privacy_policy", "terms_of_service"]
}
},
"required": ["company", "services", "legal"]
};

(async () => {
try {
// Start the crawl job
const crawlResponse = await crawl(apiKey, url, prompt, schema, {
cacheWebsite: true,
depth: 2,
maxPages: 2,
sameDomainOnly: true,
batchSize: 1,
});
console.log('Crawl job started. Response:', crawlResponse);

// If the crawl is asynchronous and returns an ID, fetch the result
const crawlId = crawlResponse.id || crawlResponse.task_id;
if (crawlId) {
for (let i = 0; i < 10; i++) {
await new Promise((resolve) => setTimeout(resolve, 5000));
const result = await getCrawlRequest(apiKey, crawlId);
if (result.status === 'success' && result.result) {
console.log('Crawl completed. Result:', result.result.llm_result);
break;
} else if (result.status === 'failed') {
console.log('Crawl failed. Result:', result);
break;
} else {
console.log(`Status: ${result.status}, waiting...`);
}
}
} else {
console.log('No crawl ID found in response. Synchronous result:', crawlResponse);
}
} catch (error) {
console.error('Error occurred:', error);
}
})();
```

You can use a plain JSON schema or a [Zod](https://www.npmjs.com/package/zod) schema for the `schema` parameter. The crawl API supports options for crawl depth, max pages, domain restriction, and batch size.
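
For example, here is a minimal sketch of the same call with a trimmed-down Zod schema (this assumes `zod` is installed in your project and reuses the `apiKey`, `url`, and `prompt` values from the example above):

```javascript
import { z } from 'zod';
import { crawl } from 'scrapegraph-js';

// A compact Zod schema covering only the company details; when a Zod schema
// is passed, the SDK converts it to JSON Schema before sending the request.
const companySchema = z.object({
  company: z.object({
    name: z.string(),
    description: z.string(),
    features: z.array(z.string()).optional(),
  }),
});

(async () => {
  const response = await crawl(apiKey, url, prompt, companySchema, {
    depth: 1,          // follow links one level deep
    maxPages: 5,       // visit at most five pages
    sameDomainOnly: true,
    batchSize: 1,
  });
  console.log('Crawl job started:', response);
})();
```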

### Scraping local HTML

Extract structured data from local HTML content
105 changes: 105 additions & 0 deletions scrapegraph-js/examples/crawl_example.js
@@ -0,0 +1,105 @@
import { crawl, getCrawlRequest } from '../index.js';
import 'dotenv/config';

// Example .env file:
// SGAI_APIKEY=your_sgai_api_key

const apiKey = process.env.SGAI_APIKEY;

const schema = {
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "ScrapeGraphAI Website Content",
"type": "object",
"properties": {
"company": {
"type": "object",
"properties": {
"name": { "type": "string" },
"description": { "type": "string" },
"features": { "type": "array", "items": { "type": "string" } },
"contact_email": { "type": "string", "format": "email" },
"social_links": {
"type": "object",
"properties": {
"github": { "type": "string", "format": "uri" },
"linkedin": { "type": "string", "format": "uri" },
"twitter": { "type": "string", "format": "uri" }
},
"additionalProperties": false
}
},
"required": ["name", "description"]
},
"services": {
"type": "array",
"items": {
"type": "object",
"properties": {
"service_name": { "type": "string" },
"description": { "type": "string" },
"features": { "type": "array", "items": { "type": "string" } }
},
"required": ["service_name", "description"]
}
},
"legal": {
"type": "object",
"properties": {
"privacy_policy": { "type": "string" },
"terms_of_service": { "type": "string" }
},
"required": ["privacy_policy", "terms_of_service"]
}
},
"required": ["company", "services", "legal"]
};

const url = 'https://scrapegraphai.com/';
const prompt = 'What does the company do? I also need the text content from their privacy and terms pages.';

(async () => {
if (!apiKey) {
console.error('SGAI_APIKEY not found in environment. Please set it in your .env file.');
process.exit(1);
}

try {
// Start the crawl job
console.log(`\nStarting crawl for: ${url}`);
const crawlResponse = await crawl(apiKey, url, prompt, schema, {
cacheWebsite: true,
depth: 2,
maxPages: 2,
sameDomainOnly: true,
batchSize: 1,
});
console.log('\nCrawl job started. Response:');
console.log(JSON.stringify(crawlResponse, null, 2));

// If the crawl is asynchronous and returns an ID, fetch the result
const crawlId = crawlResponse.id || crawlResponse.task_id;
if (crawlId) {
console.log('\nPolling for crawl result...');
for (let i = 0; i < 10; i++) {
await new Promise((resolve) => setTimeout(resolve, 5000));
const result = await getCrawlRequest(apiKey, crawlId);
if (result.status === 'success' && result.result) {
console.log(`\nCrawl completed. Result:`);
console.log(JSON.stringify(result.result.llm_result, null, 2));
break;
} else if (result.status === 'failed') {
console.log('\nCrawl failed. Result:');
console.log(JSON.stringify(result, null, 2));
break;
} else {
console.log(`Status: ${result.status}, waiting...`);
}
}
} else {
console.log('No crawl ID found in response. Synchronous result:');
console.log(JSON.stringify(crawlResponse, null, 2));
}
} catch (error) {
console.error('Error occurred:', error);
}
})();
1 change: 1 addition & 0 deletions scrapegraph-js/index.js
@@ -3,3 +3,4 @@ export { markdownify, getMarkdownifyRequest } from './src/markdownify.js';
export { searchScraper, getSearchScraperRequest } from './src/searchScraper.js';
export { getCredits } from './src/credits.js';
export { sendFeedback } from './src/feedback.js';
export { crawl, getCrawlRequest } from './src/crawl.js';
93 changes: 93 additions & 0 deletions scrapegraph-js/src/crawl.js
@@ -0,0 +1,93 @@
import axios from 'axios';
import handleError from './utils/handleError.js';
import { ZodType } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';

/**
* Start a crawl job using the ScrapeGraphAI API.
*
* @param {string} apiKey - Your ScrapeGraph AI API key
* @param {string} url - The starting URL for the crawl
* @param {string} prompt - The prompt to guide the crawl and extraction
* @param {Object|ZodType} schema - JSON schema or Zod schema defining the structure of the extracted data
* @param {Object} [options] - Optional crawl parameters
* @param {boolean} [options.cacheWebsite=true] - Whether to cache the website content
* @param {number} [options.depth=2] - Maximum depth of the crawl (1-10)
* @param {number} [options.maxPages=2] - Maximum number of pages to crawl (1-100)
* @param {boolean} [options.sameDomainOnly=true] - Whether to only crawl pages from the same domain
* @param {number} [options.batchSize=1] - Batch size for processing pages (1-10)
* @returns {Promise<Object>} The crawl job response
* @throws {Error} Throws an error if the HTTP request fails
*/
export async function crawl(
apiKey,
url,
prompt,
schema,
options = {}
) {
const endpoint = 'https://api.scrapegraphai.com/v1/crawl';
const headers = {
'accept': 'application/json',
'SGAI-APIKEY': apiKey,
'Content-Type': 'application/json',
};

let schemaPayload;
if (schema instanceof ZodType) {
schemaPayload = zodToJsonSchema(schema);
} else if (typeof schema === 'object' && schema !== null) {
schemaPayload = schema;
} else {
throw new Error('The schema must be a Zod schema or a plain object');
}

const {
cacheWebsite = true,
depth = 2,
maxPages = 2,
sameDomainOnly = true,
batchSize = 1,
} = options;

const payload = {
url,
prompt,
schema: schemaPayload,
cache_website: cacheWebsite,
depth,
max_pages: maxPages,
same_domain_only: sameDomainOnly,
batch_size: batchSize,
};

try {
const response = await axios.post(endpoint, payload, { headers });
return response.data;
} catch (error) {
handleError(error);
}
}

/**
* Get the result of a crawl job by ID.
*
* @param {string} apiKey - Your ScrapeGraph AI API key
* @param {string} crawlId - The crawl job ID
* @returns {Promise<Object>} The crawl result
* @throws {Error} Throws an error if the HTTP request fails
*/
export async function getCrawlRequest(apiKey, crawlId) {
const endpoint = `https://api.scrapegraphai.com/v1/crawl/${crawlId}`;
const headers = {
'accept': 'application/json',
'SGAI-APIKEY': apiKey,
};

try {
const response = await axios.get(endpoint, { headers });
return response.data;
} catch (error) {
handleError(error);
}
}
1 change: 1 addition & 0 deletions scrapegraph-py/examples/async/.env.example
@@ -0,0 +1 @@
SGAI_API_KEY="your_sgai_api_key"