-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
95 lines (78 loc) · 2.81 KB
/
app.py
File metadata and controls
95 lines (78 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
Modern web interface for OGscraper
"""
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
import asyncio
import json
from ogscraper.scraper import WebScraper
from ogscraper.models import ScrapingResult
# Flask application instance for the OGscraper web UI/API.
# CORS is enabled for all routes so a browser front-end served from a
# different origin can call the JSON endpoints below.
app = Flask(__name__)
CORS(app)
@app.route('/')
def index():
    """Render the single-page UI from the templates directory."""
    return render_template('index.html')
@app.route('/health')
def health():
    """Liveness probe for the Railway platform.

    Returns a small static JSON payload; a 200 response means the
    process is up and serving requests.
    """
    payload = {
        'status': 'healthy',
        'service': 'ogscraper',
        'version': '1.0.0',
    }
    return jsonify(payload)
@app.route('/api/scrape', methods=['POST'])
def scrape_endpoint():
    """API endpoint for scraping URLs with production optimizations.

    Expects a JSON body of the form::

        {"url": str, "max_items": int (optional, capped at 20),
         "use_browser": bool (optional)}

    Returns the scraped items as JSON on success, a 400 with an error
    message when the URL is missing, or a 500 JSON error on failure.
    """
    import time
    import logging

    start_time = time.time()
    logger = logging.getLogger(__name__)

    try:
        # silent=True makes get_json() return None (instead of raising a
        # 415/400 HTML error) on a missing or malformed body; the `or {}`
        # fallback lets the empty-URL check below produce a clean JSON 400
        # rather than an unhandled AttributeError on None.get(...).
        data = request.get_json(silent=True) or {}
        url = str(data.get('url', '')).strip()

        # Coerce max_items defensively: clients may send it as a string,
        # which would make min() raise a TypeError.
        try:
            max_items = int(data.get('max_items', 10))
        except (TypeError, ValueError):
            max_items = 10
        max_items = min(max_items, 20)  # Cap at 20 for production

        use_browser = bool(data.get('use_browser', False))

        if not url:
            return jsonify({'error': 'URL is required'}), 400

        # Default to HTTPS when no scheme was supplied.
        if not (url.startswith('http://') or url.startswith('https://')):
            url = 'https://' + url

        logger.info(f"Starting scrape request for {url} (max_items={max_items})")

        # Create scraper with production-optimized settings
        scraper = WebScraper(
            url,
            use_browser=use_browser,
            max_concurrent=10,  # Reduced for production stability
            chunk_size=6000     # Smaller chunks for faster processing
        )
        result = scraper.scrape(max_items=max_items)

        processing_time = time.time() - start_time
        logger.info(f"Scraping completed in {processing_time:.2f}s, found {len(result.items)} items")

        # Convert to dict for JSON response
        response_data = {
            'site': result.site,
            'items': [
                {
                    'title': item.title,
                    'content': item.content[:5000],  # Truncate content for production
                    'content_type': item.content_type,
                    'source_url': item.source_url
                }
                for item in result.items
            ],
            'total_items': len(result.items),
            'processing_time': round(processing_time, 2)
        }
        return jsonify(response_data)

    except Exception as e:
        # Top-level API boundary: report the failure as JSON instead of an
        # HTML 500. logger.exception (vs .error) also records the traceback.
        processing_time = time.time() - start_time
        logger.exception(f"Scraping failed after {processing_time:.2f}s: {str(e)}")
        return jsonify({
            'error': str(e),
            'processing_time': round(processing_time, 2)
        }), 500
if __name__ == '__main__':
    import os

    # Railway injects the listen port via the PORT env var; default to 8080
    # for local runs. Bind to all interfaces so the container is reachable.
    server_port = int(os.environ.get('PORT', 8080))
    app.run(debug=False, host='0.0.0.0', port=server_port)