Skip to content

Commit a6fd70b

Browse files
committed
Added new changes that make it work properly with Python
1 parent 91525c8 commit a6fd70b

File tree

7 files changed

+113
-18
lines changed

7 files changed

+113
-18
lines changed

pathik/__init__.py

Lines changed: 69 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,77 @@
2020
from typing import List, Dict, Optional
2121

2222
def crawl(urls: List[str], output_dir: Optional[str] = None, parallel: bool = True) -> Dict[str, Dict[str, str]]:
23-
"""Fallback implementation of crawl function"""
24-
print("Using fallback crawl implementation")
25-
# Basic implementation here
26-
# ...
23+
"""Crawl function that uses the binary to crawl the given URLs"""
24+
from .crawler import get_binary_path
25+
26+
result = {}
27+
temp_dir = None
28+
29+
if not output_dir:
30+
# Create temporary directory if no output_dir specified
31+
temp_dir = tempfile.mkdtemp(prefix="pathik_")
32+
output_dir = temp_dir
33+
print(f"Created temporary directory: {temp_dir}")
34+
35+
try:
36+
binary_path = get_binary_path()
37+
38+
# Prepare command
39+
cmd = [binary_path, "-crawl", f"-outdir={output_dir}"]
40+
if not parallel:
41+
cmd.append("-parallel=false")
42+
cmd.extend(urls)
43+
44+
print(f"crawl() called with urls={urls}, output_dir={output_dir}, parallel={parallel}")
45+
46+
# Run command
47+
subprocess.run(cmd, check=True)
48+
49+
# Collect results
50+
for url in urls:
51+
result[url] = {}
52+
# TODO: Add result collection logic
53+
except Exception as e:
54+
print(f"Error in crawl function: {e}")
55+
for url in urls:
56+
if url not in result:
57+
result[url] = {"error": str(e)}
58+
59+
return result
2760

2861
def crawl_to_r2(urls: List[str], uuid_str: Optional[str] = None, parallel: bool = True) -> Dict[str, Dict[str, str]]:
29-
"""Fallback implementation of crawl_to_r2 function"""
30-
print("Using fallback crawl_to_r2 implementation")
31-
# Basic implementation here
32-
# ...
62+
"""Upload to R2 function that uses the binary to upload the given URLs"""
63+
from .crawler import get_binary_path
64+
65+
result = {}
66+
if not uuid_str:
67+
uuid_str = str(uuid.uuid4())
68+
69+
try:
70+
binary_path = get_binary_path()
71+
72+
# Prepare command
73+
cmd = [binary_path, "-r2", f"-uuid={uuid_str}"]
74+
if not parallel:
75+
cmd.append("-parallel=false")
76+
cmd.extend(urls)
77+
78+
# Run command
79+
subprocess.run(cmd, check=True)
80+
81+
# Collect results
82+
for url in urls:
83+
result[url] = {
84+
"uuid": uuid_str,
85+
# TODO: Add more result details
86+
}
87+
except Exception as e:
88+
print(f"Error in crawl_to_r2 function: {e}")
89+
for url in urls:
90+
if url not in result:
91+
result[url] = {"error": str(e)}
92+
93+
return result
3394

3495
# Export the functions
3596
__all__ = ["crawl", "crawl_to_r2", "__version__"]
1.99 KB
Binary file not shown.
9.61 KB
Binary file not shown.

pathik/bin/darwin_arm64/pathik_bin

1.87 MB
Binary file not shown.

pathik/cli.py

Lines changed: 44 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,27 @@
77
import os
88
import json
99
import subprocess
10-
from . import crawl, crawl_to_r2
11-
from .crawler import get_binary_path
10+
11+
# Use absolute imports instead of relative imports
12+
import pathik
13+
from pathik.crawler import get_binary_path
1214

1315
def main():
1416
"""Main entry point for the CLI"""
15-
parser = argparse.ArgumentParser(description="Pathik - A fast web crawler with Python integration")
17+
parser = argparse.ArgumentParser(
18+
description="Pathik - A fast web crawler with Python integration",
19+
epilog="""
20+
Note: This Python CLI uses subcommands (crawl, r2, kafka, version) rather than flags.
21+
For example:
22+
pathik kafka https://example.com
23+
pathik crawl -o ./output https://example.com
24+
25+
If you prefer flag-style syntax, use the Go binary directly:
26+
./pathik -kafka https://example.com
27+
./pathik -crawl -outdir ./output https://example.com
28+
""",
29+
formatter_class=argparse.RawDescriptionHelpFormatter
30+
)
1631
subparsers = parser.add_subparsers(dest="command", help="Command to run")
1732

1833
# Crawl command
@@ -35,19 +50,33 @@ def main():
3550
kafka_parser.add_argument("-t", "--topic", help="Kafka topic to stream to")
3651
kafka_parser.add_argument("-c", "--content", choices=["html", "markdown", "both"], default="both",
3752
help="Content type to stream (html, markdown, or both)")
53+
kafka_parser.add_argument("--session", help="Session ID to include with messages (for multi-user environments)")
3854

3955
# Version command
4056
version_parser = subparsers.add_parser("version", help="Print version information")
4157

42-
args = parser.parse_args()
58+
try:
59+
args = parser.parse_args()
60+
except SystemExit as e:
61+
# Check if user might be using Go binary syntax with dashes
62+
for i, arg in enumerate(sys.argv[1:]):
63+
if arg.startswith('-') and not arg.startswith('--') and arg not in ['-o', '-s', '-u', '-b', '-t', '-c']:
64+
print("\nError: It seems you're using Go binary syntax with the Python CLI.")
65+
print("The Python CLI uses subcommands instead of flags:")
66+
print(" ✅ Correct: pathik kafka https://example.com")
67+
print(" ❌ Incorrect: pathik -kafka https://example.com")
68+
print("\nAvailable subcommands: crawl, r2, kafka, version")
69+
return 1
70+
# If not caught by our check, let the original error propagate
71+
return e.code
4372

4473
if not args.command:
4574
parser.print_help()
4675
return 1
4776

4877
try:
4978
if args.command == "crawl":
50-
result = crawl(
79+
result = pathik.crawl(
5180
urls=args.urls,
5281
output_dir=args.outdir,
5382
parallel=not args.sequential
@@ -72,7 +101,7 @@ def main():
72101
print(f"\nResults saved to: {results_file}")
73102

74103
elif args.command == "r2":
75-
result = crawl_to_r2(
104+
result = pathik.crawl_to_r2(
76105
urls=args.urls,
77106
uuid_str=args.uuid,
78107
parallel=not args.sequential
@@ -101,11 +130,17 @@ def main():
101130
if args.content and args.content != "both":
102131
cmd.extend(["-content", args.content])
103132

133+
# Add topic if specified
134+
if args.topic:
135+
cmd.extend(["-topic", args.topic])
136+
137+
# Add session ID if provided
138+
if args.session:
139+
cmd.extend(["-session", args.session])
140+
104141
# Add Kafka-specific options if provided
105142
if args.brokers:
106143
os.environ["KAFKA_BROKERS"] = args.brokers
107-
if args.topic:
108-
os.environ["KAFKA_TOPIC"] = args.topic
109144

110145
# Add URLs
111146
cmd.extend(args.urls)
@@ -126,8 +161,7 @@ def main():
126161
return 1
127162

128163
elif args.command == "version":
129-
from . import __version__ # Importing here to avoid circular imports
130-
print(f"Pathik v{__version__}")
164+
print(f"Pathik v{pathik.__version__}")
131165
return 0
132166

133167
return 0

pathik/pathik

0 Bytes
Binary file not shown.

pathik_bin

21.7 MB
Binary file not shown.

0 commit comments

Comments
 (0)