@@ -140,34 +140,28 @@ results = pathik.crawl_to_r2([
 import pathik
 import uuid
 
-# Generate a session ID to track this batch of streams
+# Generate a session ID for tracking
 session_id = str(uuid.uuid4())
 
-# URLs to crawl and stream
-urls = [
-    "https://www.wikipedia.org",
-    "https://www.github.com",
-    "https://news.ycombinator.com"
-]
+# Stream a single URL to Kafka
+result = pathik.stream_to_kafka("https://example.com", session=session_id)
+print(f"Success: {result['https://example.com']['success']}")
 
-# Stream content to Kafka
+# Stream multiple URLs with custom options
 results = pathik.stream_to_kafka(
-    urls=urls,              # URLs to crawl and stream
-    content_type="both",    # Stream both HTML and Markdown
-    session=session_id,     # Add session ID to messages
-    topic="pathik.crawl",   # Set Kafka topic
-    parallel=True           # Process URLs in parallel
+    urls=["https://example.com", "https://httpbin.org/html"],
+    content_type="html",    # Options: "html", "markdown", or "both"
+    topic="custom_topic",   # Optional custom topic
+    session=session_id,     # Optional session ID
+    parallel=True           # Process URLs in parallel (default)
 )
 
-# Print results
-for url, result in results.items():
-    if result["success"]:
-        print(f"✅ Successfully streamed {url}")
+# Check results
+for url, status in results.items():
+    if status["success"]:
+        print(f"Successfully streamed {url}")
     else:
-        print(f"❌ Failed to stream {url}: {result.get('error', 'Unknown error')}")
-
-# You can use this session ID to filter messages when consuming from Kafka
-print(f"Session ID for filtering: {session_id}")
+        print(f"Failed to stream {url}: {status.get('error')}")
 ```
 
 ### Command Line
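The CLI mirrors the Python options shown above. A sketch with illustrative topic and session values (`-c` sets the content type, `-t` the topic, `--session` the session ID, flags as used in the hunk below):

```bash
# Stream one URL to Kafka from the CLI; topic and session values are illustrative
pathik kafka -c html -t custom_topic --session my-session-id https://example.com
```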
@@ -211,42 +205,6 @@ pathik kafka -c html -t user1_data --session user123 https://example.com
 
 Pathik supports streaming crawled content directly to Kafka. This is useful for real-time processing pipelines.
 
-### Basic Usage
-
-```python
-import pathik
-import uuid
-
-# Generate a session ID to track this batch of streams
-session_id = str(uuid.uuid4())
-
-# URLs to crawl and stream
-urls = [
-    "https://www.wikipedia.org",
-    "https://www.github.com",
-    "https://news.ycombinator.com"
-]
-
-# Stream content to Kafka
-results = pathik.stream_to_kafka(
-    urls=urls,              # URLs to crawl and stream
-    content_type="both",    # Stream both HTML and Markdown
-    session=session_id,     # Add session ID to messages
-    topic="pathik.crawl",   # Set Kafka topic
-    parallel=True           # Process URLs in parallel
-)
-
-# Print results
-for url, result in results.items():
-    if result["success"]:
-        print(f"✅ Successfully streamed {url}")
-    else:
-        print(f"❌ Failed to stream {url}: {result.get('error', 'Unknown error')}")
-
-# You can use this session ID to filter messages when consuming from Kafka
-print(f"Session ID for filtering: {session_id}")
-```
-
 ### Kafka Configuration
 
 Configure Kafka connection details in the `.env` file:
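For a quick local run, these settings can also be loaded into the process environment first. A minimal sketch assuming the `python-dotenv` package (whether pathik reads `.env` on its own isn't stated in this excerpt); the variable names match the block that follows:

```python
# Sketch: load the .env Kafka settings into the current process
# (assumes python-dotenv; KAFKA_* names as listed in the configuration block)
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory
print(os.environ.get("KAFKA_CLIENT_ID"))  # e.g. "pathik-crawler"
print(os.environ.get("KAFKA_USE_TLS"))    # e.g. "false"
```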
@@ -260,24 +218,6 @@ KAFKA_CLIENT_ID=pathik-crawler # Client ID for Kafka
 KAFKA_USE_TLS=false              # Whether to use TLS
 ```
 
-Alternatively, you can configure these settings in your code with the CLI-based approach:
-
-```python
-from pathik.cli import crawl
-
-results = crawl(
-    urls=["https://example.com"],
-    kafka=True,
-    kafka_brokers="localhost:9092",
-    kafka_topic="my.topic",
-    kafka_username="user",
-    kafka_password="pass",
-    kafka_client_id="pathik-client",
-    kafka_use_tls=True,
-    session_id="my-session-id"
-)
-```
-
 ### Kafka Message Format
 
 When streaming to Kafka, Pathik sends two messages per URL:
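Independent of the per-message layout, consuming and filtering by session ID can look like the sketch below (not part of pathik). It assumes the `kafka-python` package; the header name `session` is an assumption:

```python
# Consumer sketch: filter streamed messages by the session ID used when producing
# (assumes kafka-python; the "session" header name is an assumption)
from kafka import KafkaConsumer

session_id = "my-session-id"  # the ID passed to stream_to_kafka

consumer = KafkaConsumer(
    "custom_topic",                      # topic from the Python example above
    bootstrap_servers="localhost:9092",  # adjust to your broker address
    auto_offset_reset="earliest",
)

for message in consumer:
    headers = dict(message.headers or [])
    if headers.get("session", b"").decode() == session_id:
        print(f"offset={message.offset}, {len(message.value)} bytes")
```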