From d9987f5679c7f9661c850b9817d486068287aa06 Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Thu, 7 May 2026 13:39:16 +0530 Subject: [PATCH 01/15] feat(repl): add interactive REPL mode (Phase 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bare `scrapingbee` (no subcommand) now drops into a themed REPL with tab completion, history, and inline command help. Adds: - src/scrapingbee_cli/interactive.py — REPL loop, splash, completer - src/scrapingbee_cli/theme.py — ScrapingBee brand theme + spinner - src/scrapingbee_cli/help_formatter.py — Rich-styled click help - pyproject.toml: prompt_toolkit>=3.0, rich>=13.0 Hooks `cli.py` so the click group is invoke_without_command=True and falls into run_repl() when no subcommand is given. Schedule hint is suppressed inside the REPL to avoid per-command noise. Phase 2 (theme integration in command files for inline spinners during command runs) will follow as a separate commit. --- pyproject.toml | 2 + src/scrapingbee_cli/cli.py | 17 +- src/scrapingbee_cli/help_formatter.py | 156 +++++ src/scrapingbee_cli/interactive.py | 482 +++++++++++++++ src/scrapingbee_cli/theme.py | 825 ++++++++++++++++++++++++++ 5 files changed, 1481 insertions(+), 1 deletion(-) create mode 100644 src/scrapingbee_cli/help_formatter.py create mode 100644 src/scrapingbee_cli/interactive.py create mode 100644 src/scrapingbee_cli/theme.py diff --git a/pyproject.toml b/pyproject.toml index 7bf5f92..eaca5b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,8 @@ dependencies = [ "certifi", "click>=8.0", "click-option-group>=0.5.6", + "prompt_toolkit>=3.0", + "rich>=13.0", "scrapy>=2.11", "scrapy-scrapingbee>=0.0.5", ] diff --git a/src/scrapingbee_cli/cli.py b/src/scrapingbee_cli/cli.py index cb45f93..ff913bc 100644 --- a/src/scrapingbee_cli/cli.py +++ b/src/scrapingbee_cli/cli.py @@ -8,9 +8,15 @@ from .commands import register_commands from .config import load_dotenv +# Guard against REPL re-entry when 
cli.main(args) is called from within REPL +_in_repl = False + def _show_active_schedules_hint() -> None: """If there are active schedules, print a one-line hint to stderr.""" + if _in_repl: + return # Don't show on every REPL command + import json import sys from pathlib import Path @@ -63,7 +69,7 @@ def _show_active_schedules_hint() -> None: ) -@click.group() +@click.group(invoke_without_command=True) @click.version_option(version=__version__) @click.pass_context def cli(ctx: click.Context) -> None: @@ -77,6 +83,15 @@ def cli(ctx: click.Context) -> None: load_dotenv() _show_active_schedules_hint() ctx.ensure_object(dict) + global _in_repl # noqa: PLW0603 + if ctx.invoked_subcommand is None and not _in_repl: + from .interactive import run_repl + + _in_repl = True + try: + run_repl(cli, __version__) + finally: + _in_repl = False register_commands(cli) diff --git a/src/scrapingbee_cli/help_formatter.py b/src/scrapingbee_cli/help_formatter.py new file mode 100644 index 0000000..da2c0b2 --- /dev/null +++ b/src/scrapingbee_cli/help_formatter.py @@ -0,0 +1,156 @@ +"""Custom Rich-powered help formatter for ScrapingBee CLI.""" + +from __future__ import annotations + +import sys +from typing import Any + +import click + +from .theme import BEE_AMBER, BEE_YELLOW, err_console + + +def _should_style() -> bool: + """True when stderr is a real TTY (styled help goes to stderr).""" + return sys.stderr.isatty() + + +class BeeHelpFormatter(click.HelpFormatter): + """Click help formatter that outputs styled text via Rich.""" + + def write(self, string: str) -> None: + """Collect raw text — we'll style it in getvalue().""" + super().write(string) + + +class BeeCommand(click.Command): + """Command subclass that renders help through Rich.""" + + def format_help(self, ctx: click.Context, formatter: click.HelpFormatter) -> None: + """Override to render help with Rich styling.""" + self.format_usage(ctx, formatter) + self.format_help_text(ctx, formatter) + self.format_options(ctx, 
formatter) + self.format_epilog(ctx, formatter) + + def get_help(self, ctx: click.Context) -> str: + """Return plain help AND print styled version to stderr if TTY.""" + formatter = ctx.make_formatter() + self.format_help(ctx, formatter) + plain = formatter.getvalue() + if _should_style(): + _print_styled_help(plain, self.name or "") + return plain + + +class BeeGroup(click.Group): + """Group subclass that renders help through Rich.""" + + def get_help(self, ctx: click.Context) -> str: + formatter = ctx.make_formatter() + self.format_help(ctx, formatter) + plain = formatter.getvalue() + if _should_style(): + _print_styled_help(plain, self.name or "scrapingbee") + return plain + + def format_help(self, ctx: click.Context, formatter: click.HelpFormatter) -> None: + self.format_usage(ctx, formatter) + self.format_help_text(ctx, formatter) + self.format_options(ctx, formatter) + self.format_commands(ctx, formatter) + self.format_epilog(ctx, formatter) + + def command(self, *args: Any, **kwargs: Any) -> Any: + kwargs.setdefault("cls", BeeCommand) + return super().command(*args, **kwargs) + + def group(self, *args: Any, **kwargs: Any) -> Any: + kwargs.setdefault("cls", BeeGroup) + return super().group(*args, **kwargs) + + +def _print_styled_help(plain_help: str, command_name: str) -> None: + """Parse plain Click help text and render it with Rich styling.""" + from rich.text import Text + + lines = plain_help.split("\n") + + # Header + err_console.print() + header = Text() + header.append(f" {command_name}", style=f"bold {BEE_YELLOW}") + err_console.print(header) + err_console.print() + + in_commands = False + + for line in lines: + stripped = line.strip() + + # Skip the "Usage:" line (we already printed header) + if stripped.startswith("Usage:"): + # Print usage in dim + err_console.print(f" [dim]{stripped}[/dim]") + continue + + # Section headers + if stripped.endswith(":") and not stripped.startswith("-") and not stripped.startswith("["): + in_commands = stripped == 
"Commands:" + err_console.print( + f" [bold {BEE_YELLOW}]~~ {stripped[:-1]} ~~{'~' * (36 - len(stripped))}[/]" + ) + continue + + # Option group headers (from click-option-group) + if stripped.endswith(":") and len(stripped) < 40 and not stripped.startswith("-"): + err_console.print( + f" [bold {BEE_YELLOW}]~~ {stripped[:-1]} ~~{'~' * (36 - len(stripped))}[/]" + ) + continue + + # Empty lines + if not stripped: + err_console.print() + continue + + # Description text (not indented or lightly indented, not starting with -) + if not line.startswith(" ") or ( + line.startswith(" ") and not line.startswith(" ") and not stripped.startswith("-") + ): + if stripped and not stripped.startswith("-"): + err_console.print(f" [dim]{stripped}[/dim]") + continue + + # Options: --flag Description + if stripped.startswith("-") or stripped.startswith("["): + # Split on double space to separate flag from description + parts = stripped.split(" ", 1) + if len(parts) == 2: + flag, desc = parts[0].strip(), parts[1].strip() + text = Text() + text.append(f" {flag:<30}", style=f"bold {BEE_AMBER}") + text.append(f" {desc}", style="dim") + err_console.print(text) + else: + err_console.print(f" [{BEE_AMBER}]{stripped}[/]") + continue + + # Commands list + if in_commands and stripped: + parts = stripped.split(" ", 1) + if len(parts) == 2: + cmd, desc = parts[0].strip(), parts[1].strip() + text = Text() + text.append(f" {cmd:<20}", style=f"bold {BEE_YELLOW}") + text.append(f" {desc}", style="dim") + err_console.print(text) + else: + err_console.print(f" [{BEE_YELLOW}]{stripped}[/]") + continue + + # Indented description continuation + if stripped: + err_console.print(f" [dim]{stripped}[/dim]") + + err_console.print() diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py new file mode 100644 index 0000000..d8757bd --- /dev/null +++ b/src/scrapingbee_cli/interactive.py @@ -0,0 +1,482 @@ +"""Interactive REPL mode for ScrapingBee CLI.""" + +from __future__ import 
annotations + +import shlex +import sys +import time + +from rich.text import Text + +from .theme import BEE_DIM, BEE_RED, BEE_YELLOW, err_console + +# Secondary brand colour for accents (footer, dimmed elements) +_BEE_ORANGE = "#FFB13D" + +# --------------------------------------------------------------------------- +# Splash animation +# --------------------------------------------------------------------------- + +_SCRAPINGBEE_LOGO = [ + " ███████╗ ██████╗██████╗ █████╗ ██████╗ ██╗███╗ ██╗ ██████╗ ", + " ██╔════╝██╔════╝██╔══██╗██╔══██╗██╔══██╗██║████╗ ██║██╔════╝ ", + " ███████╗██║ ██████╔╝███████║██████╔╝██║██╔██╗ ██║██║ ███╗", + " ╚════██║██║ ██╔══██╗██╔══██║██╔═══╝ ██║██║╚██╗██║██║ ██║", + " ███████║╚██████╗██║ ██║██║ ██║██║ ██║██║ ╚████║╚██████╔╝", + " ╚══════╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═══╝ ╚═════╝ ", +] + +_BEE_LOGO = [ + " ██████╗ ███████╗███████╗", + " ██╔══██╗██╔════╝██╔════╝", + " ██████╔╝█████╗ █████╗ ", + " ██╔══██╗██╔══╝ ██╔══╝ ", + " ██████╔╝███████╗███████╗", + " ╚═════╝ ╚══════╝╚══════╝", +] + +_BEE_FRAMES = ["\\(o_o)/", "_(o_o)_", "/(o_o)\\", "_(o_o)_"] + + +def play_splash(version: str) -> None: + """Bee accelerates across screen with bounce, then logo reveal.""" + if not sys.stderr.isatty(): + return + + import shutil + + width = shutil.get_terminal_size((80, 24)).columns + max_pos = min(width - 12, 55) + + # Phase 1: Bee accelerates right (ease-in), then bounces back slightly + total_steps = 40 + positions: list[int] = [] + for s in range(total_steps): + # ease-in-out cubic: accelerate → decelerate + t = s / (total_steps - 1) + eased = t * t * (3 - 2 * t) # smoothstep + positions.append(int(eased * max_pos)) + # Add a small bounce at the end + bounce_back = max(0, max_pos - 4) + positions.extend([bounce_back, max_pos - 2, max_pos]) + + for i, pos in enumerate(positions): + bee = _BEE_FRAMES[i % len(_BEE_FRAMES)] + # Fading honeycomb trail + trail_len = min(pos, 25) + trail = Text() + trail.append(" " * (pos - trail_len)) + for t_i in 
range(trail_len): + # Fade trail: older chars dimmer + age = trail_len - t_i + if age > 18: + trail.append("·", style="dim") + elif age > 10: + trail.append("~", style=_BEE_ORANGE) + else: + trail.append("~", style=f"bold {BEE_YELLOW}") + trail.append(bee, style=f"bold {BEE_YELLOW}") + + with err_console.capture() as cap: + err_console.print(trail, end="") + sys.stderr.write("\r\033[K" + cap.get()) + sys.stderr.flush() + # Speed: fast start, slow near end + delay = 0.012 + 0.015 * (i / len(positions)) + time.sleep(delay) + + sys.stderr.write("\r\033[K") + sys.stderr.flush() + time.sleep(0.12) + + # Phase 2: Logo appears line by line + err_console.print() + for logo_line in _SCRAPINGBEE_LOGO: + err_console.print(f"[bold {BEE_YELLOW}]{logo_line}[/]") + time.sleep(0.03) + for logo_line in _BEE_LOGO: + err_console.print(f"[bold white]{logo_line}[/]") + time.sleep(0.03) + + err_console.print() + ver = Text() + ver.append(f" v{version}", style=f"bold {BEE_YELLOW}") + ver.append(" \u2502 ", style="dim") + ver.append("Web scraping from the terminal", style="dim") + err_console.print(ver) + err_console.print() + time.sleep(0.15) + + +# --------------------------------------------------------------------------- +# Command registry & help +# --------------------------------------------------------------------------- + +_COMMANDS = [ + "scrape", + "crawl", + "google", + "fast-search", + "amazon-product", + "amazon-search", + "walmart-product", + "walmart-search", + "youtube-search", + "youtube-metadata", + "chatgpt", + "auth", + "logout", + "usage", + "schedule", + "export", + "docs", +] + +_COMMAND_HELP: dict[str, str] = { + "scrape": "Scrape a web page (single or batch)", + "crawl": "Crawl a site following links", + "google": "Google Search API", + "fast-search": "Fast Search API (sub-second)", + "amazon-product": "Amazon product details", + "amazon-search": "Search Amazon products", + "walmart-product": "Walmart product details", + "walmart-search": "Search Walmart 
products", + "youtube-search": "Search YouTube videos", + "youtube-metadata": "YouTube video metadata", + "chatgpt": "Query ChatGPT API", + "auth": "Save your API key", + "logout": "Remove stored API key", + "usage": "Check credits and concurrency", + "schedule": "Schedule recurring scrapes", + "export": "Merge batch output files", + "docs": "Open ScrapingBee documentation", + "help": "Show this command list", + "clear": "Clear the screen", + "exit": "Quit the REPL", +} + +_COMMON_FLAGS = [ + "--verbose", + "--output-file", + "--retries", + "--backoff", + "--render-js", + "--premium-proxy", + "--stealth-proxy", + "--country-code", + "--return-page-markdown", + "--return-page-text", + "--extract-rules", + "--ai-extract-rules", + "--ai-query", + "--input-file", + "--output-dir", + "--output-format", + "--concurrency", + "--screenshot", + "--json-response", + "--help", +] + + +def _print_repl_help() -> None: + err_console.print() + groups = { + "Scraping": ["scrape", "crawl"], + "Search": ["google", "fast-search"], + "Marketplaces": [ + "amazon-product", + "amazon-search", + "walmart-product", + "walmart-search", + ], + "Media": ["youtube-search", "youtube-metadata"], + "AI": ["chatgpt"], + "Account": ["auth", "logout", "usage", "schedule", "export", "docs"], + } + for group_name, cmds in groups.items(): + err_console.print(f" [{BEE_DIM}]{group_name}[/]") + for cmd in cmds: + err_console.print( + f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{_COMMAND_HELP.get(cmd, '')}[/]" + ) + err_console.print() + err_console.print(f" [{BEE_DIM}]Session[/]") + for cmd in ("help", "clear", "exit"): + err_console.print( + f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{_COMMAND_HELP.get(cmd, '')}[/]" + ) + err_console.print() + + +# --------------------------------------------------------------------------- +# prompt_toolkit setup (ScrapingBee brand theme) +# --------------------------------------------------------------------------- + +_STYLE_DICT = { + # Prompt: powerline arrow tag + 
"prompt.tag": "bg:#FFCD23 #000000 bold", + "prompt.arrow": "#FFCD23 bold", + "prompt.space": "", + # Completion dropdown + "completion-menu": "bg:#1a1400", + "completion-menu.completion": "bg:#1a1400 #FFCD23", + "completion-menu.completion.current": "bg:#FFCD23 #000000 bold", + "completion-menu.meta.completion": "bg:#1a1400 #886600", + "completion-menu.meta.completion.current": "bg:#FFCD23 #000000", + "scrollbar.background": "bg:#1a1400", + "scrollbar.button": "bg:#FFCD23", + # Ghost text + "auto-suggestion": "fg:#554400 italic", + # Hint line (above prompt) + "prompt.hint": "#665500 italic", +} + +_POWERLINE_ARROW = "\ue0b0" + +_STATIC_PROMPT = [ + ( + "class:prompt.hint", + " Tab complete \u2502 \u2191\u2193 history \u2502 \u2192 accept \u2502 Ctrl+C exit\n", + ), + ("", "\n"), + ("class:prompt.tag", " ScrapingBee "), + ("class:prompt.arrow", _POWERLINE_ARROW), + ("class:prompt.space", " "), +] + + +# --------------------------------------------------------------------------- +# Flag value completions +# --------------------------------------------------------------------------- + +_BOOL_FLAGS = frozenset( + { + "--render-js", + "--block-ads", + "--block-resources", + "--premium-proxy", + "--stealth-proxy", + "--forward-headers", + "--forward-headers-pure", + "--json-response", + "--screenshot", + "--screenshot-full-page", + "--return-page-source", + "--return-page-markdown", + "--return-page-text", + "--custom-google", + "--transparent-status-code", + "--add-html", + "--light-request", + "--deduplicate", + "--resume", + "--autothrottle", + } +) + +_CHOICE_FLAGS: dict[str, list[str]] = { + "--device": ["desktop", "mobile"], + "--output-format": ["files", "csv", "ndjson"], + "--method": ["GET", "POST", "PUT"], + "--wait-browser": ["domcontentloaded", "load", "networkidle0", "networkidle2"], + "--sort-by": ["best-match", "price-low", "price-high", "best-seller", "most-recent"], + "--search-type": ["web", "images", "news", "videos", "shopping"], + "--type": 
["video", "channel", "playlist", "movie"],
+    "--duration": ["short", "medium", "long"],
+    "--upload-date": ["today", "last-hour", "this-week", "this-month", "this-year"],
+    "--preset": [
+        "screenshot",
+        "screenshot-and-html",
+        "fetch",
+        "extract-links",
+        "extract-emails",
+        "extract-phones",
+        "scroll-page",
+    ],
+}
+
+
+def _make_completer():
+    """Build the tab-completer used by the REPL prompt.
+
+    prompt_toolkit is imported lazily so that importing this module does not
+    pull it in.
+
+    Returns:
+        A ``Completer`` that completes, in this order of precedence:
+        command names (first word), values for flags listed in
+        ``_BOOL_FLAGS`` / ``_CHOICE_FLAGS``, and flag names from
+        ``_COMMON_FLAGS``.
+    """
+    from prompt_toolkit.completion import Completer, Completion
+
+    class BeeCompleter(Completer):
+        """Completer for command names, common flags and known flag values."""
+
+        def get_completions(self, document, complete_event):
+            """Yield completions for the text to the left of the cursor."""
+            stripped = document.text_before_cursor.lstrip()
+            words = stripped.split()
+            # A trailing space means the previous word is finished and the
+            # cursor sits at the start of a new, still-empty word.
+            after_space = stripped.endswith(" ")
+
+            on_first_word = (not stripped) or (len(words) == 1 and not after_space)
+            if on_first_word:
+                partial = words[0].lower() if words else ""
+                for cmd in sorted(_COMMANDS + ["help", "clear", "exit"]):
+                    if cmd.startswith(partial):
+                        yield Completion(
+                            cmd,
+                            start_position=-len(partial),
+                            display_meta=_COMMAND_HELP.get(cmd, ""),
+                        )
+                return
+
+            last = words[-1] if words else ""
+
+            if after_space:
+                # The flag waiting for its value is the word that was just
+                # finished (words[-1]), not the one before it.
+                if last in _BOOL_FLAGS:
+                    yield Completion("true", display_meta="enable")
+                    yield Completion("false", display_meta="disable")
+                elif last in _CHOICE_FLAGS:
+                    for val in _CHOICE_FLAGS[last]:
+                        yield Completion(val)
+                # Nothing has been typed for the new word yet, so there is no
+                # prefix to match flags against; offering any would overwrite
+                # the word that was just completed.
+                return
+
+            if not last.startswith("-"):
+                # Partially typed value: narrow the choices of the preceding flag.
+                flag = words[-2] if len(words) >= 2 else ""
+                if flag in _BOOL_FLAGS:
+                    candidates = ("true", "false")
+                elif flag in _CHOICE_FLAGS:
+                    candidates = _CHOICE_FLAGS[flag]
+                else:
+                    candidates = ()
+                needle = last.lower()
+                for val in candidates:
+                    # Lower-case both sides: some choices (--method) are upper-case.
+                    if val.lower().startswith(needle):
+                        yield Completion(val, start_position=-len(last))
+                return
+
+            # Partially typed flag name.
+            for flag in _COMMON_FLAGS:
+                if flag.startswith(last):
+                    yield Completion(flag, start_position=-len(last))
+
+    return BeeCompleter()
+
+
+def _make_key_bindings():
+    from prompt_toolkit.filters import has_completions
+    from prompt_toolkit.key_binding import KeyBindings
+
+    kb =
KeyBindings()
+
+    @kb.add("enter", filter=has_completions)
+    def _accept_completion(event):
+        """Enter with completion menu: keep current selection, close menu."""
+        # prompt_toolkit already applies the selected completion as a preview
+        # in the buffer during navigation. Just dismiss the menu.
+        event.current_buffer.complete_state = None
+
+    @kb.add("enter", filter=~has_completions)
+    def _submit_or_ignore(event):
+        """Enter without completion menu: submit if non-empty, else do nothing."""
+        buf = event.current_buffer
+        if buf.text.strip():
+            buf.validate_and_handle()
+        # Empty buffer: do nothing — cursor stays, no duplicate prompt
+
+    return kb
+
+
+def _build_session(history_path: str):
+    """Create the prompt session (history, completer, style, key bindings).
+
+    Args:
+        history_path: File used for persistent command history.
+    """
+    from prompt_toolkit import PromptSession
+    from prompt_toolkit.auto_suggest import AutoSuggestFromHistory
+    from prompt_toolkit.history import FileHistory
+    from prompt_toolkit.styles import Style
+
+    try:
+        history = FileHistory(history_path)
+    except Exception:
+        # Best effort: run without a history file rather than fail to start.
+        history = None  # type: ignore[assignment]
+
+    return PromptSession(
+        history=history,
+        completer=_make_completer(),
+        complete_while_typing=False,
+        auto_suggest=AutoSuggestFromHistory(),
+        style=Style.from_dict(_STYLE_DICT),
+        key_bindings=_make_key_bindings(),
+        mouse_support=False,
+        enable_history_search=False,
+        vi_mode=False,
+    )
+
+
+# ---------------------------------------------------------------------------
+# REPL main loop
+# ---------------------------------------------------------------------------
+
+
+def run_repl(cli_group: object, version: str) -> None:
+    """Run the interactive prompt until the user leaves it.
+
+    Shows the splash and the command list, then reads lines in a loop.
+    ``exit``/``quit``/``q``, ``help``/``?`` and ``clear`` are handled here;
+    every other line is split with ``shlex.split`` and handed to
+    ``cli_group.main(args, standalone_mode=False)`` while a spinner runs.
+    Ctrl+C at the prompt and end-of-input (Ctrl+D, closed stdin) both end
+    the session.
+
+    Args:
+        cli_group: The root click group; only its ``main`` method is used.
+        version: Version string shown on the splash screen.
+    """
+    from pathlib import Path
+
+    import click
+    from rich.markup import escape
+
+    from .theme import MiniBeeSpinner, set_repl_mode
+
+    farewell = f" [bold {BEE_YELLOW}]Buzz off! See you next time.[/]"
+
+    set_repl_mode(True)
+    try:
+        play_splash(version)
+        _print_repl_help()
+
+        history_path = str(Path.home() / ".config" / "scrapingbee-cli" / ".history")
+        Path(history_path).parent.mkdir(parents=True, exist_ok=True)
+
+        session = _build_session(history_path)
+
+        while True:
+            try:
+                line = session.prompt(_STATIC_PROMPT).strip()
+            except (KeyboardInterrupt, EOFError):
+                # EOF must end the loop: once stdin is exhausted every further
+                # prompt() raises EOFError again, so retrying would spin forever.
+                err_console.print()
+                err_console.print(farewell)
+                break
+
+            if not line:
+                continue
+
+            lower = line.lower()
+            if lower in ("exit", "quit", "q"):
+                err_console.print(farewell)
+                break
+
+            if lower in ("help", "?"):
+                _print_repl_help()
+                continue
+
+            if lower == "clear":
+                import shutil
+
+                rows = shutil.get_terminal_size((80, 24)).lines
+                # Print enough blank lines to scroll old content off screen,
+                # then move cursor up a few rows so prompt lands near the bottom
+                # (where the toolbar is) rather than stuck at the very top.
+                sys.stderr.write("\n" * rows)
+                sys.stderr.write(f"\033[{rows}A\033[J")
+                sys.stderr.flush()
+                continue
+
+            # Allow pasting full shell commands: drop a leading program name.
+            if lower.startswith("scrapingbee "):
+                line = line[len("scrapingbee ") :].strip()
+
+            try:
+                args = shlex.split(line)
+            except ValueError as e:
+                # escape(): the message is arbitrary text, not Rich markup.
+                err_console.print(f" [bold {BEE_RED}]Parse error: {escape(str(e))}[/]")
+                continue
+
+            if not args:
+                continue
+
+            # Gap between prompt and command output
+            sys.stderr.write("\n")
+
+            # Run command with bee spinner
+            spinner = MiniBeeSpinner(args[0])
+            spinner.start()
+            try:
+                cli_group.main(args, standalone_mode=False)  # type: ignore[union-attr]
+            except click.ClickException as e:
+                e.show()
+            except click.Abort:
+                # With standalone_mode=False click re-raises Abort (Ctrl+C or
+                # EOF inside a command) instead of printing it; its message
+                # is empty, so report it explicitly.
+                err_console.print(f" [bold {BEE_RED}]Aborted.[/]")
+            except SystemExit:
+                # A command asking to exit must not take the REPL down with it.
+                pass
+            except Exception as e:
+                err_console.print(f" [bold {BEE_RED}]Error: {escape(str(e))}[/]")
+            finally:
+                spinner.stop()
+    finally:
+        # Leave the module-level flag as we found it once the session ends.
+        set_repl_mode(False)
diff --git a/src/scrapingbee_cli/theme.py b/src/scrapingbee_cli/theme.py
new file mode 100644
index 0000000..40909db
--- /dev/null
+++ b/src/scrapingbee_cli/theme.py
@@ -0,0 +1,825 @@
+"""ScrapingBee CLI theme: colors, styled output, and flapping-bee spinner.
+
+The spinner shows a single-line coloured bee with flapping wings and rotating
+fun status messages tailored to each command.
+""" + +from __future__ import annotations + +import os +import sys +import threading + +from rich.console import Console +from rich.text import Text +from rich.theme import Theme + +# -- ScrapingBee brand colours ----------------------------------------------- + +BEE_YELLOW = "#FFCD23" +BEE_DARK = "#0F0F0E" +BEE_WHITE = "#FFFFFF" +BEE_AMBER = "#E5A800" +BEE_GREEN = "#22C55E" +BEE_RED = "#EF4444" +BEE_DIM = "#888888" + +SCRAPINGBEE_THEME = Theme( + { + "bee": f"bold {BEE_YELLOW}", + "bee.dim": BEE_AMBER, + "info": f"bold {BEE_YELLOW}", + "success": f"bold {BEE_GREEN}", + "error": f"bold {BEE_RED}", + "warn": f"bold {BEE_AMBER}", + "dim": BEE_DIM, + "header": f"bold {BEE_WHITE}", + "key": f"bold {BEE_YELLOW}", + "value": BEE_WHITE, + } +) + + +def _want_color() -> bool | None: + if os.environ.get("NO_COLOR"): + return False + if os.environ.get("FORCE_COLOR"): + return True + return None + + +_color = _want_color() + +err_console = Console(stderr=True, theme=SCRAPINGBEE_THEME, highlight=False, force_terminal=_color) +console = Console(theme=SCRAPINGBEE_THEME, highlight=False, force_terminal=_color) + +# -- REPL mode flag ----------------------------------------------------------- +# When True, fancy visuals (panels, honeycomb, personality errors, styled help) +# are enabled. Direct CLI commands (scrapingbee scrape ...) keep plain output. + +_repl_mode = False + + +def set_repl_mode(enabled: bool = True) -> None: + """Enable or disable REPL-mode visuals.""" + global _repl_mode # noqa: PLW0603 + _repl_mode = enabled + + +def is_repl_mode() -> bool: + """Return True when running inside the interactive REPL.""" + return _repl_mode + + +# -- Single-line bee frames -------------------------------------------------- + +# Each frame is a tuple of (segment, style) pairs rendered inline. +# The bee body is yellow, wings are white, and they alternate to create a flap. 
+_BEE_INLINE_FRAMES: list[list[tuple[str, str]]] = [ + [ + ("\\", "bold white"), + ("(", "dim"), + ("◉", f"bold {BEE_YELLOW}"), + ("ω", "dim"), + ("◉", f"bold {BEE_YELLOW}"), + (")", "dim"), + ("/", "bold white"), + ], + [ + ("᎑", "bold white"), + ("(", "dim"), + ("◉", f"bold {BEE_YELLOW}"), + ("ω", "dim"), + ("◉", f"bold {BEE_YELLOW}"), + (")", "dim"), + ("᎑", "bold white"), + ], + [ + ("/", "bold white"), + ("(", "dim"), + ("◉", f"bold {BEE_YELLOW}"), + ("ω", "dim"), + ("◉", f"bold {BEE_YELLOW}"), + (")", "dim"), + ("\\", "bold white"), + ], + [ + ("᎑", "bold white"), + ("(", "dim"), + ("◉", f"bold {BEE_YELLOW}"), + ("ω", "dim"), + ("◉", f"bold {BEE_YELLOW}"), + (")", "dim"), + ("᎑", "bold white"), + ], +] + + +def _render_inline_bee(frame_idx: int) -> Text: + """Return a single-line bee Text for the given frame.""" + parts = _BEE_INLINE_FRAMES[frame_idx % len(_BEE_INLINE_FRAMES)] + text = Text() + for content, style in parts: + text.append(content, style=style) + return text + + +# -- Contextual status messages per command ---------------------------------- + +_BEE_FACTS = [ + "Did you know? Bees can fly up to 15 mph", + "Did you know? A bee visits 50-100 flowers per trip", + "Did you know? Bees have 5 eyes", + "Did you know? Honey never spoils", + "Did you know? Bees communicate by dancing", + "Did you know? A hive can have 60,000 bees", + "Did you know? Bees flap 200 times per second", + "Did you know? Bees can recognize human faces", + "Did you know? One bee makes 1/12 tsp of honey in its life", + "Did you know? 
Bees navigate using the sun", +] + +MESSAGES: dict[str, list[str]] = { + "scrape": [ + "Scraping", + "Extracting honey", + "Buzzing through HTML", + "Parsing the nectar", + "Dodging bot traps", + *_BEE_FACTS[:3], + ], + "google": [ + "Googling", + "Searching the hive", + "Pollinating results", + "Crawling the web", + "Fetching SERPs", + *_BEE_FACTS[3:6], + ], + "fast-search": [ + "Searching", + "Speed-buzzing", + "Zipping through results", + "Lightning fast", + *_BEE_FACTS[6:8], + ], + "crawl": [ + "Crawling", + "Following the trail", + "Exploring links", + "Mapping the web", + "Discovering pages", + *_BEE_FACTS[1:4], + ], + "usage": [ + "Checking the honeypot", + "Counting credits", + "Buzzing to the API", + *_BEE_FACTS[4:6], + ], + "amazon-product": [ + "Fetching product", + "Browsing the jungle", + "Hunting for deals", + "Reading reviews", + *_BEE_FACTS[7:9], + ], + "amazon-search": [ + "Searching Amazon", + "Flying through the jungle", + "Comparing prices", + "Scanning listings", + *_BEE_FACTS[0:2], + ], + "walmart-search": [ + "Searching Walmart", + "Rolling back prices", + "Scanning the shelves", + *_BEE_FACTS[5:7], + ], + "walmart-product": [ + "Fetching product", + "Checking the aisle", + "Reading the label", + *_BEE_FACTS[8:10], + ], + "youtube-search": [ + "Searching YouTube", + "Streaming honey", + "Tuning in", + "Browsing videos", + *_BEE_FACTS[2:4], + ], + "youtube-metadata": [ + "Fetching metadata", + "Reading the description", + "Counting views", + *_BEE_FACTS[9:10], + ], + "chatgpt": [ + "Querying ChatGPT", + "Consulting the hive mind", + "Thinking bee thoughts", + "Processing prompt", + *_BEE_FACTS[4:6], + ], + "sitemap": [ + "Fetching sitemap", + "Reading the map", + "Charting the course", + *_BEE_FACTS[6:8], + ], + "_default": [ + "Working", + "Buzzing", + "zZZzzzZZ", + "Bee patient", + "Almost done", + *_BEE_FACTS[:5], + ], +} + +# How many spinner ticks before rotating to the next message. 
+_MSG_ROTATE_TICKS = 18 # ~0.9s at 50ms per tick + + +# -- Flapping-bee spinner (single-line) -------------------------------------- + + +class MiniBeeSpinner: + """Single-line flapping-bee spinner with rotating contextual messages. + + Usage:: + + with MiniBeeSpinner("scrape"): + await do_request() + + The *message* argument is a command key into ``MESSAGES``. If the key is + not found it is used as a literal first message with ``_default`` extras. + """ + + def __init__(self, message: str = "scrape") -> None: + # Resolve message list. + if message in MESSAGES: + self._messages = MESSAGES[message] + else: + self._messages = [message] + MESSAGES["_default"] + self._messages = self._messages + _time_flavor() + self._stop = threading.Event() + self._thread: threading.Thread | None = None + + def _animate(self) -> None: + idx = 0 + msg_idx = 0 + while not self._stop.is_set(): + # Rotate message every N ticks. + if idx > 0 and idx % _MSG_ROTATE_TICKS == 0: + msg_idx = (msg_idx + 1) % len(self._messages) + + bee = _render_inline_bee(idx) + msg = self._messages[msg_idx] + dots = "." * ((idx % 3) + 1) + + line = Text() + line.append(" ") + line.append_text(bee) + line.append(" ") + line.append(msg, style=f"bold {BEE_YELLOW}") + line.append(dots.ljust(4), style="dim") + + with err_console.capture() as capture: + err_console.print(line, end="") + sys.stderr.write("\r\033[K" + capture.get()) + sys.stderr.flush() + + idx += 1 + self._stop.wait(0.05) + + # Clear the spinner line. 
+        sys.stderr.write("\r\033[K")
+        sys.stderr.flush()
+
+    def start(self) -> None:
+        if not sys.stderr.isatty():
+            return
+        self._thread = threading.Thread(target=self._animate, daemon=True)
+        self._thread.start()
+
+    def stop(self) -> None:
+        self._stop.set()
+        if self._thread is not None:
+            self._thread.join(timeout=1)
+
+    def __enter__(self) -> MiniBeeSpinner:
+        self.start()
+        return self
+
+    def __exit__(self, *_: object) -> None:
+        self.stop()
+
+
+# -- Styled output helpers ---------------------------------------------------
+
+
+def print_banner() -> None:
+    """Print the ScrapingBee CLI banner to stderr."""
+    banner = Text()
+    bee = _render_inline_bee(0)
+    banner.append(" ")
+    banner.append_text(bee)
+    banner.append(" ScrapingBee", style=f"bold {BEE_YELLOW}")
+    banner.append(" CLI", style="bold white")
+    err_console.print(banner)
+
+
+def styled_echo(message: str, *, style: str = "info", err: bool = True) -> None:
+    """Print *message* wrapped in the *style* tag, to stderr unless ``err`` is False."""
+    c = err_console if err else console
+    c.print(f"[{style}]{message}[/{style}]")
+
+
+def echo_success(message: str) -> None:
+    """Print *message* to stderr in the ``success`` style."""
+    err_console.print(f"[success]{message}[/success]")
+
+
+def echo_error(message: str) -> None:
+    """Print *message* to stderr in the ``error`` style."""
+    err_console.print(f"[error]{message}[/error]")
+
+
+def echo_warning(message: str) -> None:
+    """Print *message* to stderr in the ``warn`` style."""
+    err_console.print(f"[warn]{message}[/warn]")
+
+
+def echo_key_value(key: str, value: str) -> None:
+    """Print a ``key: value`` pair to stderr with the key highlighted."""
+    text = Text()
+    text.append(f" {key}: ", style=f"bold {BEE_YELLOW}")
+    text.append(value, style="white")
+    err_console.print(text)
+
+
+def echo_separator() -> None:
+    """Print a dim 40-character horizontal rule to stderr."""
+    err_console.print(f"[dim]{'─' * 40}[/dim]")
+
+
+def format_progress_line(
+    completed: int,
+    total: int,
+    *,
+    rps: float | None = None,
+    eta: str | None = None,
+    failure_pct: float | None = None,
+) -> Text:
+    """Build a one-line progress bar with optional rate, ETA and failure share.
+
+    Args:
+        completed: Number of finished items.
+        total: Number of items expected; a value <= 0 gives an empty bar.
+        rps: Requests per second, shown as a whole number when given.
+        eta: Pre-formatted time-remaining string, shown verbatim when given.
+        failure_pct: Failure percentage; shown only when greater than zero.
+
+    Returns:
+        A Rich ``Text`` whose bar is always exactly 20 cells wide.
+    """
+    width = 20
+    if total > 0:
+        # Clamp to the bar width: a count outside 0..total would otherwise
+        # make the bar longer than `width` cells.
+        filled = min(width, max(0, int(width * completed / total)))
+    else:
+        filled = 0
+    bar = "█" * filled + "░" * (width - filled)
+
+    text = Text()
+    text.append(" ")
+    text.append(bar, style=f"bold {BEE_YELLOW}")
+    text.append(f" {completed}/{total}", style="bold white")
+    if rps is not None:
+        text.append(f" {rps:.0f} req/s", style="dim")
+    if eta is not None:
+        text.append(f" ETA {eta}", style="dim")
+    if failure_pct is not None and failure_pct > 0:
+        text.append(f" Failures: {failure_pct:.0f}%", style=f"bold {BEE_RED}")
+    return text
+
+
+# -- Live credit tracker (polls usage API during batch/crawl) ----------------
+
+
+class LiveCreditTracker:
+    """Background thread that polls the usage API every 20 seconds and prints
+    an updating honeycomb credit line to stderr. Only active in REPL mode.
+
+    Usage::
+
+        with LiveCreditTracker(api_key, initial_remaining=33_000_000, total=50_000_000):
+            run_batch(...)
+    """
+
+    _POLL_INTERVAL = 20  # seconds (safe: 3× per minute, limit is 6×)
+
+    def __init__(
+        self,
+        api_key: str,
+        *,
+        initial_remaining: int | None = None,
+        total: int | None = None,
+    ) -> None:
+        self._api_key = api_key
+        self._remaining = initial_remaining
+        self._total = total
+        self._start_remaining = initial_remaining
+        self._stop = threading.Event()
+        self._thread: threading.Thread | None = None
+
+    # -- internal ------------------------------------------------------------
+
+    def _fetch(self) -> tuple[int, int] | None:
+        """Return (remaining, total) or None on error."""
+        import asyncio
+        import json as _json
+
+        from .client import Client
+        from .config import BASE_URL
+
+        try:
+            async def _go() -> tuple[int, int] | None:
+                async with Client(self._api_key, BASE_URL, timeout=10) as c:
+                    body, _, code = await c.usage()
+                    if code == 200:
+                        raw = _json.loads(body)
+                        used = raw.get("used_api_credit", 0) or 0
+                        total = raw.get("max_api_credit", 0) or 0
+                        return total - used, total
+                    return None
+
+            return asyncio.run(_go())
+        except Exception:
+            return None
+
+    def _print_meter(self) -> None:
+        if self._remaining is None or self._total is None:
+            return
+        line = Text()
+        line.append(" ⬡ Credits: ", style=f"bold {BEE_YELLOW}")
+        line.append_text(format_honeycomb_meter(
+            self._total - self._remaining, self._total
+        ))
if self._start_remaining is not None: + consumed = self._start_remaining - self._remaining + if consumed > 0: + line.append(f" (−{consumed:,} this session)", style="dim") + err_console.print(line) + + def _run(self) -> None: + while not self._stop.wait(self._POLL_INTERVAL): + if self._stop.is_set(): + break + result = self._fetch() + if result: + self._remaining, self._total = result + self._print_meter() + + # -- public -------------------------------------------------------------- + + def start(self) -> None: + if not _repl_mode: + return + # Print initial meter immediately if we have data + if self._remaining is not None: + self._print_meter() + self._thread = threading.Thread(target=self._run, daemon=True) + self._thread.start() + + def stop(self) -> None: + self._stop.set() + if self._thread is not None: + self._thread.join(timeout=2) + + def __enter__(self) -> LiveCreditTracker: + self.start() + return self + + def __exit__(self, *_: object) -> None: + self.stop() + + +# -- Honeycomb credit meter -------------------------------------------------- + + +def format_honeycomb_meter(used: int, total: int) -> Text: + """Render a honeycomb-style credit meter. 
⬡ = used, ⬢ = remaining.""" + width = 20 + if total <= 0: + pct = 0.0 + else: + pct = (total - used) / total + remaining = total - used + filled = int(width * pct) # remaining portion (yellow) + empty = width - filled # used portion (dim) + + text = Text() + text.append(" ") + text.append("⬡" * filled, style=f"bold {BEE_YELLOW}") + text.append("⬢" * empty, style="dim") + text.append(f" {remaining:,} / {total:,} credits remaining", style="bold white") + + # Color the percentage based on health + pct_val = pct * 100 + if pct_val > 50: + pct_style = f"bold {BEE_GREEN}" + elif pct_val > 20: + pct_style = f"bold {BEE_AMBER}" + else: + pct_style = f"bold {BEE_RED}" + text.append(f" ({pct_val:.0f}%)", style=pct_style) + return text + + +# -- Completion summary panel ------------------------------------------------ + + +def print_completion_summary( + *, + succeeded: int, + failed: int, + duration_s: float | None = None, + credits_used: int | None = None, + output_path: str | None = None, + is_crawl: bool = False, +) -> None: + """Print a styled completion summary panel to stderr.""" + from rich.panel import Panel + from rich.table import Table + + total = succeeded + failed + table = Table(show_header=False, box=None, padding=(0, 1)) + table.add_column(style=f"bold {BEE_YELLOW}", min_width=12) + table.add_column(style="bold white") + + # Status line + if failed == 0: + status = Text() + status.append(" \\(◉ω◉)/ ", style=f"bold {BEE_YELLOW}") + status.append("Mission accomplished!", style=f"bold {BEE_GREEN}") + else: + status = Text() + status.append(" /(◉ω◉)\\ ", style=f"bold {BEE_YELLOW}") + status.append(f"{succeeded} succeeded, {failed} failed", style=f"bold {BEE_AMBER}") + + table.add_row( + "Items", + f"{succeeded}/{total} succeeded" + (f" ({failed} failed)" if failed else ""), + ) + if credits_used is not None: + table.add_row("Credits", f"{credits_used:,} used") + if duration_s is not None: + if duration_s < 60: + dur_str = f"{duration_s:.1f}s" + else: + m, s = 
divmod(int(duration_s), 60) + dur_str = f"{m}m {s}s" + table.add_row("Duration", dur_str) + if total > 0 and duration_s > 0: + table.add_row("Avg speed", f"{total / duration_s:.1f} req/s") + if output_path: + table.add_row("Output", output_path) + if failed > 0: + tip = ( + "Tip: Retry failures with --resume" + if not is_crawl + else "Tip: Re-run with --resume to retry" + ) + table.add_row("", Text(tip, style="dim")) + + title = "Crawl Complete" if is_crawl else "Batch Complete" + panel = Panel( + table, + title=f"[bold {BEE_YELLOW}]{title}[/]", + subtitle=str(status), + border_style=BEE_YELLOW, + padding=(1, 2), + ) + err_console.print(panel) + + +# -- Honeycomb trail progress ------------------------------------------------ + + +def format_honeycomb_trail( + completed: int, + total: int, + *, + rps: float | None = None, + eta: str | None = None, + failure_pct: float | None = None, +) -> Text: + """Bee flying across a honeycomb trail: ⬡⬡⬡\\(◉ω◉)/⬢⬢⬢""" + width = 25 + if total <= 0: + pos = 0 + else: + pos = int(width * completed / total) + pos = min(pos, width) + + trail_done = "⬡" * pos + trail_left = "⬢" * (width - pos) + + bee_frames = ["\\(◉ω◉)/", "᎑(◉ω◉)᎑", "/(◉ω◉)\\", "᎑(◉ω◉)᎑"] + bee = bee_frames[completed % len(bee_frames)] + + text = Text() + text.append(" ") + text.append(trail_done, style=f"bold {BEE_YELLOW}") + text.append(bee, style=f"bold {BEE_YELLOW}") + text.append(trail_left, style="dim") + text.append(f" {completed}/{total}", style="bold white") + if rps is not None: + text.append(f" {rps:.1f} req/s", style="dim") + if eta is not None: + text.append(f" ETA {eta}", style="dim") + if failure_pct is not None and failure_pct > 0: + text.append(f" Failures: {failure_pct:.0f}%", style=f"bold {BEE_RED}") + return text + + +# -- Notification helper (cross-platform) ------------------------------------ + + +def notify_completion(title: str, body: str) -> None: + """Send a desktop notification + terminal bell. 
Cross-platform.""" + import shutil + import subprocess + + # Terminal bell + sys.stderr.write("\a") + sys.stderr.flush() + + try: + if sys.platform == "darwin": + subprocess.run( + [ + "osascript", + "-e", + f'display notification "{body}" with title "{title}"', + ], + capture_output=True, + timeout=5, + ) + elif sys.platform == "win32": + # PowerShell toast notification + ps_cmd = ( + f"[Windows.UI.Notifications.ToastNotificationManager, Windows.UI.Notifications, " + f"ContentType = WindowsRuntime] > $null; " + f"$template = [Windows.UI.Notifications.ToastNotificationManager]::" + f"GetTemplateContent([Windows.UI.Notifications.ToastTemplateType]::ToastText02); " + f"$textNodes = $template.GetElementsByTagName('text'); " + f"$textNodes.Item(0).AppendChild($template.CreateTextNode('{title}')) > $null; " + f"$textNodes.Item(1).AppendChild($template.CreateTextNode('{body}')) > $null; " + f"$toast = [Windows.UI.Notifications.ToastNotification]::new($template); " + f"[Windows.UI.Notifications.ToastNotificationManager]::" + f"CreateToastNotifier('ScrapingBee CLI').Show($toast)" + ) + subprocess.run( + ["powershell", "-Command", ps_cmd], + capture_output=True, + timeout=10, + ) + elif shutil.which("notify-send"): + subprocess.run( + ["notify-send", title, body, "-i", "dialog-information"], + capture_output=True, + timeout=5, + ) + except Exception: + pass # Notification is best-effort + + +# -- Styled version output --------------------------------------------------- + + +def print_styled_version(version: str) -> None: + """Print a branded version line to stderr.""" + import platform + + bee = _render_inline_bee(0) + text = Text() + text.append(" ") + text.append_text(bee) + text.append(" ScrapingBee CLI ", style=f"bold {BEE_YELLOW}") + text.append(f"v{version}", style="bold white") + err_console.print(text) + err_console.print(f" [dim]Python {platform.python_version()} | {sys.platform}[/dim]") + # Try to show credit balance + try: + from .config import get_api_key + + 
api_key = get_api_key(None) + if api_key: + import asyncio + + from .client import Client + from .config import BASE_URL + + async def _check(): + async with Client(api_key, BASE_URL, timeout=10) as c: + body, _, code = await c.usage() + if code == 200: + from .client import parse_usage + + return parse_usage(body) + return None + + usage = asyncio.run(_check()) + if usage: + remaining = usage.get("credits", 0) + err_console.print( + f" [dim]API credits remaining:[/dim] [bold {BEE_GREEN}]{remaining:,}[/bold {BEE_GREEN}]" + ) + except Exception: + pass + + +# -- Welcome banner with grouped commands ------------------------------------ + + +def print_welcome_banner(version: str, commands: dict[str, list[tuple[str, str]]]) -> None: + """Print a branded welcome screen with grouped commands. + + commands: dict mapping group name to list of (cmd_name, description) tuples. + """ + # Header + bee = _render_inline_bee(0) + header = Text() + header.append(" ") + header.append_text(bee) + header.append(" ScrapingBee CLI ", style=f"bold {BEE_YELLOW}") + header.append(f"v{version}", style="bold white") + err_console.print(header) + err_console.print(" [dim]Web scraping from the terminal, powered by bees.[/dim]") + err_console.print() + + # Command groups + for group_name, cmds in commands.items(): + err_console.print(f" [bold {BEE_YELLOW}]~~ {group_name} ~~[/]") + for cmd_name, description in cmds: + err_console.print(f" [bold {BEE_YELLOW}]{cmd_name:<20}[/] [dim]{description}[/dim]") + err_console.print() + + err_console.print( + " [dim]Run[/dim] [bold white]scrapingbee --help[/] [dim]for details.[/dim]" + ) + err_console.print() + + +# -- Personality error messages ---------------------------------------------- + +_ERROR_MESSAGES: dict[int, tuple[str, str]] = { + 401: ("Bzzt! Invalid API key", "Run: scrapingbee auth"), + 403: ( + "The page stung back! (403 Forbidden)", + "Try --premium-proxy or --stealth-proxy", + ), + 404: ("The page flew away! 
(404 Not Found)", "Double-check your URL"), + 429: ( + "Whoa, too fast! The hive needs a breather (429)", + "Use --concurrency to slow down, or wait a moment", + ), + 500: ( + "Something went wrong on their end (500)", + "Use --retries to try again automatically", + ), + 502: ("The upstream hive is down (502)", "Try again in a moment"), + 503: ( + "Service temporarily unavailable (503)", + "The target is overloaded — retry shortly", + ), +} + + +def echo_bee_error(status_code: int, fallback_msg: str = "") -> None: + """Print a bee-personality error with actionable tip.""" + if status_code in _ERROR_MESSAGES: + msg, tip = _ERROR_MESSAGES[status_code] + bee = _render_inline_bee(2) # wings-down frame for errors + line = Text() + line.append(" ") + line.append_text(bee) + line.append(f" {msg}", style=f"bold {BEE_RED}") + err_console.print(line) + err_console.print(f" [dim]Tip: {tip}[/dim]") + else: + echo_error(fallback_msg or f"Error: HTTP {status_code}") + + +# -- Time-aware messages ----------------------------------------------------- + + +def _time_flavor() -> list[str]: + """Return extra messages based on time of day.""" + from datetime import datetime + + hour = datetime.now().hour + day = datetime.now().weekday() + + extras: list[str] = [] + if 0 <= hour < 6: + extras = ["The web never sleeps", "Late night data hunt", "Nocturnal bee mode"] + elif 6 <= hour < 12: + extras = [ + "Rise and scrape!", + "Fresh morning data", + "Early bird gets the data", + ] + elif 12 <= hour < 18: + extras = ["Afternoon buzz", "Peak pollination hours"] + else: + extras = ["Evening crawl session", "Burning the midnight nectar"] + + if day == 0: + extras.append("Monday motivation: fresh data!") + elif day == 4: + extras.append("TGIF — last scrape of the week?") + return extras From 68d8cabe4907062872e84b873a8bbf72071e53c2 Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Thu, 7 May 2026 14:17:34 +0530 Subject: [PATCH 02/15] feat(repl): theme integration in commands (Phase 2) Inside 
the REPL, commands now show a MiniBeeSpinner during the API call, batches show a live honeycomb credit meter via LiveCreditTracker, and verbose / completion output is rendered with rich-styled helpers from theme.py. Changes: - batch.py: new _batch_done helper; honeycomb-trail progress; styled batch-start banner; LiveCreditTracker wrap around the batch run; usage_info kwarg on run_api_batch - cli_utils.py: REPL-mode branches in _validate_range, check_api_response, scrape_with_escalation, and write_output verbose section - client.py: parse_usage now exposes max_api_credit (needed by LiveCreditTracker) - commands/{amazon,chatgpt,fast_search,google,walmart,youtube}.py: MiniBeeSpinner around single API calls; usage_info pass-through to run_api_batch - commands/scrape.py: spinner around single scrape; LiveCreditTracker around batch; REPL-styled error on HTTP 4xx/5xx - commands/usage.py: full styled dashboard (honeycomb meter, credits used/remaining/total, concurrency, renewal date) when invoked from the REPL; plain JSON kept for non-REPL - commands/crawl.py: LiveCreditTracker wrap around run_urls_spider so credit drain during long crawls is visible Plain (non-REPL) output is unchanged for every code path. All 653 existing unit tests still pass. 
--- src/scrapingbee_cli/batch.py | 154 ++++++++++++++------ src/scrapingbee_cli/cli_utils.py | 103 +++++++++---- src/scrapingbee_cli/client.py | 1 + src/scrapingbee_cli/commands/amazon.py | 80 +++++----- src/scrapingbee_cli/commands/chatgpt.py | 23 +-- src/scrapingbee_cli/commands/crawl.py | 42 +++--- src/scrapingbee_cli/commands/fast_search.py | 23 +-- src/scrapingbee_cli/commands/google.py | 35 +++-- src/scrapingbee_cli/commands/scrape.py | 36 +++-- src/scrapingbee_cli/commands/usage.py | 94 ++++++++++-- src/scrapingbee_cli/commands/walmart.py | 72 +++++---- src/scrapingbee_cli/commands/youtube.py | 62 ++++---- 12 files changed, 483 insertions(+), 242 deletions(-) diff --git a/src/scrapingbee_cli/batch.py b/src/scrapingbee_cli/batch.py index 2b7a94b..d59c204 100644 --- a/src/scrapingbee_cli/batch.py +++ b/src/scrapingbee_cli/batch.py @@ -17,6 +17,16 @@ from .client import Client, parse_usage from .config import BASE_URL, get_api_key +from .theme import ( + LiveCreditTracker, + echo_warning, + err_console, + format_honeycomb_trail, + is_repl_mode, + notify_completion, + print_completion_summary, + styled_echo, +) # Map Content-Type (main part, lowercased) to file extension for batch output. CONTENT_TYPE_EXTENSION: dict[str, str] = { @@ -442,11 +452,17 @@ def resolve_batch_concurrency( if user_concurrency > 0: cap = min(from_usage, CONCURRENCY_CAP) if user_concurrency > cap and warn: - click.echo( - f"Warning: concurrency capped at {cap} (plan limit or max {CONCURRENCY_CAP}). " - "Very high concurrency can overload your network.", - err=True, - ) + if is_repl_mode(): + echo_warning( + f"Concurrency capped at {cap} (plan limit or max {CONCURRENCY_CAP}). " + "Very high concurrency can overload your network." + ) + else: + click.echo( + f"Warning: concurrency capped at {cap} (plan limit or max {CONCURRENCY_CAP}). 
" + "Very high concurrency can overload your network.", + err=True, + ) return min(user_concurrency, cap) return max(1, from_usage) @@ -524,7 +540,10 @@ async def run_batch_async( concurrency = min(max(1, concurrency), len(inputs)) source = "from --concurrency" if from_user else "from usage API" total = len(inputs) - click.echo(f"Batch: {total} items, concurrency {concurrency} ({source})", err=True) + if is_repl_mode(): + styled_echo(f"Batch: {total} items, concurrency {concurrency} ({source})", style="info") + else: + click.echo(f"Batch: {total} items, concurrency {concurrency} ({source})", err=True) sem = asyncio.Semaphore(concurrency) completed = 0 failure_count = 0 @@ -567,17 +586,30 @@ async def run_one(i: int, inp: str) -> tuple[int, BatchResult]: failure_count += 1 if show_progress: elapsed = time.monotonic() - start_time - parts = [f"[{completed}/{total}]"] + rps_val = None + eta_val = None + fail_pct = None if elapsed > 0: - rps = completed / elapsed - parts.append(f"{rps:.0f} req/s") + rps_val = completed / elapsed remaining = total - completed - if rps > 0 and remaining > 0: - parts.append(f"ETA {_format_eta(remaining / rps)}") + if rps_val > 0 and remaining > 0: + eta_val = _format_eta(remaining / rps_val) if failure_count > 0: - pct = failure_count / completed * 100 - parts.append(f"Failures: {pct:.0f}%") - click.echo(f" {' | '.join(parts)}", err=True) + fail_pct = failure_count / completed * 100 + if is_repl_mode(): + progress = format_honeycomb_trail( + completed, total, rps=rps_val, eta=eta_val, failure_pct=fail_pct + ) + err_console.print(progress) + else: + parts = [f"[{completed}/{total}]"] + if rps_val is not None: + parts.append(f"{rps_val:.0f} req/s") + if eta_val is not None: + parts.append(f"ETA {eta_val}") + if fail_pct is not None and fail_pct > 0: + parts.append(f"Failures: {fail_pct:.0f}%") + click.echo(f" {' | '.join(parts)}", err=True) if on_result is not None: on_result(result) return i, result @@ -1072,6 +1104,25 @@ def 
write_batch_output_csv( ApiCallFn = Callable[[Client, str], Awaitable[tuple[bytes, dict, int]]] +def _batch_done( + plain_msg: str, + *, + succeeded: int = 0, + failed: int = 0, + duration_s: float | None = None, + output_path: str | None = None, + err: bool = True, +) -> None: + """Print batch completion — fancy panel in REPL, plain line otherwise.""" + if is_repl_mode(): + print_completion_summary( + succeeded=succeeded, failed=failed, duration_s=duration_s, output_path=output_path + ) + notify_completion("ScrapingBee", plain_msg) + else: + click.echo(plain_msg, err=err) + + async def _run_api_batch_async( key: str, inputs: list[str], @@ -1091,6 +1142,7 @@ async def _run_api_batch_async( extract_field: str | None = None, fields: str | None = None, ) -> None: + _batch_start = time.monotonic() ndjson_pp = post_process if output_format == "ndjson" else None ndjson_fh = None if output_format == "ndjson" and output_file: @@ -1162,6 +1214,7 @@ async def do_one(item: str): out_dir_resolved = "" out_file_resolved = "" + _duration = time.monotonic() - _batch_start if update_csv_path: out_file_resolved, succeeded, failed = update_csv_with_results( update_csv_path, @@ -1169,9 +1222,12 @@ async def do_one(item: str): results, output_file, ) - click.echo( + _batch_done( f"CSV updated: {succeeded} succeeded, {failed} failed. Output: {out_file_resolved}", - err=True, + succeeded=succeeded, + failed=failed, + duration_s=_duration, + output_path=out_file_resolved, ) elif output_format == "ndjson": if ndjson_fh: @@ -1180,9 +1236,12 @@ async def do_one(item: str): failed = sum(1 for r in results if r.error and not r.skipped) out_file_resolved = output_file or "" out_label = out_file_resolved or "" - click.echo( + _batch_done( f"Batch complete: {succeeded} succeeded, {failed} failed. 
Output: {out_label}", - err=True, + succeeded=succeeded, + failed=failed, + duration_s=_duration, + output_path=out_file_resolved or None, ) elif output_format == "csv": if post_process: @@ -1192,9 +1251,12 @@ async def do_one(item: str): out_file_resolved, succeeded, failed = write_batch_output_csv( results, output_file, fields=fields ) - click.echo( + _batch_done( f"Batch complete: {succeeded} succeeded, {failed} failed. Output: {out_file_resolved}", - err=True, + succeeded=succeeded, + failed=failed, + duration_s=_duration, + output_path=out_file_resolved, ) else: out_dir_resolved, succeeded, failed = write_batch_output_to_dir( @@ -1203,9 +1265,12 @@ async def do_one(item: str): verbose, post_process=post_process, ) - click.echo( + _batch_done( f"Batch complete: {succeeded} succeeded, {failed} failed. Output: {out_dir_resolved}", - err=True, + succeeded=succeeded, + failed=failed, + duration_s=_duration, + output_path=out_dir_resolved, ) if on_complete: from .cli_utils import run_on_complete @@ -1239,26 +1304,31 @@ def run_api_batch( output_file: str | None = None, extract_field: str | None = None, fields: str | None = None, + usage_info: dict | None = None, ) -> None: """Run a batch of single-item API calls and write results.""" - asyncio.run( - _run_api_batch_async( - key=key, - inputs=inputs, - concurrency=concurrency, - from_user=from_user, - skip_n=skip_n, - output_dir=output_dir, - verbose=verbose, - show_progress=show_progress, - api_call=api_call, - on_complete=on_complete, - output_format=output_format, - post_process=post_process, - update_csv_path=update_csv_path, - input_column=input_column, - output_file=output_file, - extract_field=extract_field, - fields=fields, + # In REPL mode show live credit updates every 20s during the batch. 
+ initial_remaining = usage_info.get("credits") if usage_info else None + initial_total = usage_info.get("max_api_credit") if usage_info else None + with LiveCreditTracker(key, initial_remaining=initial_remaining, total=initial_total): + asyncio.run( + _run_api_batch_async( + key=key, + inputs=inputs, + concurrency=concurrency, + from_user=from_user, + skip_n=skip_n, + output_dir=output_dir, + verbose=verbose, + show_progress=show_progress, + api_call=api_call, + on_complete=on_complete, + output_format=output_format, + post_process=post_process, + update_csv_path=update_csv_path, + input_column=input_column, + output_file=output_file, + extract_field=extract_field, + fields=fields, + ) ) - ) diff --git a/src/scrapingbee_cli/cli_utils.py b/src/scrapingbee_cli/cli_utils.py index 42e0d60..efd0526 100644 --- a/src/scrapingbee_cli/cli_utils.py +++ b/src/scrapingbee_cli/cli_utils.py @@ -10,6 +10,16 @@ import click +from .theme import ( + echo_bee_error, + echo_error, + echo_key_value, + echo_separator, + echo_warning, + is_repl_mode, + styled_echo, +) + class NormalizedChoice(click.Choice): """Choice type that accepts both hyphens and underscores. 
@@ -1220,7 +1230,10 @@ def _validate_range( return if value < min_val or value > max_val: u = f" {unit}" if unit else "" - click.echo(f"{name} must be between {min_val} and {max_val}{u}", err=True) + if is_repl_mode(): + echo_error(f"{name} must be between {min_val} and {max_val}{u}") + else: + click.echo(f"{name} must be between {min_val} and {max_val}{u}", err=True) raise SystemExit(1) @@ -1372,7 +1385,10 @@ def check_api_response(data: bytes, status_code: int, err_prefix: str = "Error") from .client import pretty_json if status_code >= 400: - click.echo(f"{err_prefix}: HTTP {status_code}", err=True) + if is_repl_mode(): + echo_bee_error(status_code, f"{err_prefix}: HTTP {status_code}") + else: + click.echo(f"{err_prefix}: HTTP {status_code}", err=True) try: click.echo(pretty_json(data), err=True) except Exception: @@ -1459,7 +1475,12 @@ async def scrape_with_escalation( already = any(scrape_kwargs.get(k) for k in tier_overrides) if already: continue - click.echo(f"[escalate-proxy] {url}: blocked, retrying with {tier_name} proxy", err=True) + if is_repl_mode(): + echo_warning(f"[escalate-proxy] {url}: blocked, retrying with {tier_name} proxy") + else: + click.echo( + f"[escalate-proxy] {url}: blocked, retrying with {tier_name} proxy", err=True + ) escalated = {**scrape_kwargs, **tier_overrides} data, headers, status_code = await client.scrape(url, **escalated) if verbose: @@ -1557,29 +1578,59 @@ def write_output( Precedence: *smart_extract* > *extract_field* > *fields*. 
""" if verbose: - click.echo(f"HTTP Status: {status_code}", err=True) - headers_lower = {k.lower(): (k, v) for k, v in headers.items()} - spb_cost_present = False - for key, label in [ - ("spb-cost", "Credit Cost"), - ("spb-resolved-url", "Resolved URL"), - ("spb-initial-status-code", "Initial Status Code"), - ]: - if key in headers_lower: - _, val = headers_lower[key] - if val: - click.echo(f"{label}: {val}", err=True) - if key == "spb-cost": - spb_cost_present = True - if not spb_cost_present: - if credit_cost is not None: - click.echo(f"Credit Cost: {credit_cost}", err=True) - elif command: - from scrapingbee_cli.credits import ESTIMATED_CREDITS - - if command in ESTIMATED_CREDITS: - click.echo(f"Credit Cost (estimated): {ESTIMATED_CREDITS[command]}", err=True) - click.echo("---", err=True) + if is_repl_mode(): + status_style = "success" if status_code < 400 else "error" + styled_echo(f"HTTP Status: {status_code}", style=status_style) + headers_lower = {k.lower(): (k, v) for k, v in headers.items()} + spb_cost_present = False + for key, label in [ + ("spb-cost", "Credit Cost"), + ("spb-resolved-url", "Resolved URL"), + ("spb-initial-status-code", "Initial Status Code"), + ]: + if key in headers_lower: + _, val = headers_lower[key] + if val: + echo_key_value(label, str(val)) + if key == "spb-cost": + spb_cost_present = True + if not spb_cost_present: + if credit_cost is not None: + echo_key_value("Credit Cost", str(credit_cost)) + elif command: + from scrapingbee_cli.credits import ESTIMATED_CREDITS + + if command in ESTIMATED_CREDITS: + echo_key_value( + "Credit Cost (estimated)", str(ESTIMATED_CREDITS[command]) + ) + echo_separator() + else: + click.echo(f"HTTP Status: {status_code}", err=True) + headers_lower = {k.lower(): (k, v) for k, v in headers.items()} + spb_cost_present = False + for key, label in [ + ("spb-cost", "Credit Cost"), + ("spb-resolved-url", "Resolved URL"), + ("spb-initial-status-code", "Initial Status Code"), + ]: + if key in headers_lower: 
+ _, val = headers_lower[key] + if val: + click.echo(f"{label}: {val}", err=True) + if key == "spb-cost": + spb_cost_present = True + if not spb_cost_present: + if credit_cost is not None: + click.echo(f"Credit Cost: {credit_cost}", err=True) + elif command: + from scrapingbee_cli.credits import ESTIMATED_CREDITS + + if command in ESTIMATED_CREDITS: + click.echo( + f"Credit Cost (estimated): {ESTIMATED_CREDITS[command]}", err=True + ) + click.echo("---", err=True) if smart_extract: from .extract import smart_extract as _smart_extract_fn diff --git a/src/scrapingbee_cli/client.py b/src/scrapingbee_cli/client.py index 32b420a..a6a768c 100644 --- a/src/scrapingbee_cli/client.py +++ b/src/scrapingbee_cli/client.py @@ -588,6 +588,7 @@ def parse_usage(body: bytes) -> dict: avail = int(max_credit) - int(used_credit) if avail >= 0: out["credits"] = avail + out["max_api_credit"] = int(max_credit) max_concurrency_val = data.get("max_concurrency") if max_concurrency_val is not None and isinstance(max_concurrency_val, (int, float)): diff --git a/src/scrapingbee_cli/commands/amazon.py b/src/scrapingbee_cli/commands/amazon.py index 7a01a1c..c4b4cfb 100644 --- a/src/scrapingbee_cli/commands/amazon.py +++ b/src/scrapingbee_cli/commands/amazon.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +from contextlib import nullcontext import click from click_option_group import optgroup @@ -29,6 +30,7 @@ ) from ..client import Client from ..config import BASE_URL, get_api_key +from ..theme import MiniBeeSpinner, is_repl_mode AMAZON_SORT_BY = [ "most-recent", @@ -147,6 +149,7 @@ async def api_call(client, a): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), + usage_info=usage_info, ) return @@ -155,21 +158,23 @@ async def api_call(client, a): raise SystemExit(1) async def _single() -> None: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.amazon_product( - asin, - 
device=device, - domain=domain, - country=country, - zip_code=zip_code, - language=language, - currency=currency, - add_html=parse_bool(add_html), - light_request=parse_bool(light_request), - screenshot=parse_bool(screenshot), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + _spinner = MiniBeeSpinner("amazon-product") if is_repl_mode() else nullcontext() + with _spinner: + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.amazon_product( + asin, + device=device, + domain=domain, + country=country, + zip_code=zip_code, + language=language, + currency=currency, + add_html=parse_bool(add_html), + light_request=parse_bool(light_request), + screenshot=parse_bool(screenshot), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) from ..credits import amazon_credits @@ -319,6 +324,7 @@ async def api_call(client, q): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), + usage_info=usage_info, ) return @@ -327,27 +333,29 @@ async def api_call(client, q): raise SystemExit(1) async def _single() -> None: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.amazon_search( - query, - start_page=start_page, - pages=pages, - sort_by=norm_val(sort_by), - device=device, - domain=domain, - country=country, - zip_code=zip_code, - language=language, - currency=currency, - category_id=category_id, - merchant_id=merchant_id, - autoselect_variant=parse_bool(autoselect_variant), - add_html=parse_bool(add_html), - light_request=parse_bool(light_request), - screenshot=parse_bool(screenshot), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + _spinner = MiniBeeSpinner("amazon-search") if is_repl_mode() else nullcontext() + with _spinner: + async with Client(key, BASE_URL) as client: + data, headers, status_code 
= await client.amazon_search( + query, + start_page=start_page, + pages=pages, + sort_by=norm_val(sort_by), + device=device, + domain=domain, + country=country, + zip_code=zip_code, + language=language, + currency=currency, + category_id=category_id, + merchant_id=merchant_id, + autoselect_variant=parse_bool(autoselect_variant), + add_html=parse_bool(add_html), + light_request=parse_bool(light_request), + screenshot=parse_bool(screenshot), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) from ..credits import amazon_credits diff --git a/src/scrapingbee_cli/commands/chatgpt.py b/src/scrapingbee_cli/commands/chatgpt.py index 7ac63cd..351ee8c 100644 --- a/src/scrapingbee_cli/commands/chatgpt.py +++ b/src/scrapingbee_cli/commands/chatgpt.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +from contextlib import nullcontext import click @@ -24,6 +25,7 @@ ) from ..client import Client from ..config import BASE_URL, get_api_key +from ..theme import MiniBeeSpinner, is_repl_mode @click.command() @@ -115,6 +117,7 @@ async def api_call(client, p): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), + usage_info=usage_info, ) return @@ -125,15 +128,17 @@ async def api_call(client, p): prompt_str = " ".join(prompt) async def _single() -> None: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.chatgpt( - prompt_str, - search=parse_bool(search), - add_html=parse_bool(add_html), - country_code=country_code, - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + _spinner = MiniBeeSpinner("chatgpt") if is_repl_mode() else nullcontext() + with _spinner: + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.chatgpt( + prompt_str, + search=parse_bool(search), + add_html=parse_bool(add_html), + country_code=country_code, + 
retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) write_output( data, diff --git a/src/scrapingbee_cli/commands/crawl.py b/src/scrapingbee_cli/commands/crawl.py index e854b22..4f234da 100644 --- a/src/scrapingbee_cli/commands/crawl.py +++ b/src/scrapingbee_cli/commands/crawl.py @@ -24,6 +24,7 @@ run_project_spider, run_urls_spider, ) +from ..theme import LiveCreditTracker def _crawl_build_params( @@ -537,25 +538,30 @@ def crawl_cmd( allowed_list: list[str] | None = None if allowed_domains: allowed_list = [d.strip() for d in allowed_domains.split(",") if d.strip()] + _initial_remaining = usage_info.get("credits") if usage_info else None + _initial_total = usage_info.get("max_api_credit") if usage_info else None try: - run_urls_spider( - urls, - key, - scrape_params=scrape_params or None, - custom_headers=custom_headers or None, - max_depth=max_depth, - max_pages=max_pages, - concurrency=concurrency, - output_dir=out_dir, - allowed_domains=allowed_list, - allow_external_domains=allow_external_domains, - download_delay=download_delay, - autothrottle_enabled=autothrottle or None, - resume=obj.get("resume", False), - include_pattern=include_pattern, - exclude_pattern=exclude_pattern, - save_pattern=save_pattern, - ) + with LiveCreditTracker( + key, initial_remaining=_initial_remaining, total=_initial_total + ): + run_urls_spider( + urls, + key, + scrape_params=scrape_params or None, + custom_headers=custom_headers or None, + max_depth=max_depth, + max_pages=max_pages, + concurrency=concurrency, + output_dir=out_dir, + allowed_domains=allowed_list, + allow_external_domains=allow_external_domains, + download_delay=download_delay, + autothrottle_enabled=autothrottle or None, + resume=obj.get("resume", False), + include_pattern=include_pattern, + exclude_pattern=exclude_pattern, + save_pattern=save_pattern, + ) except ValueError as e: click.echo(str(e), err=True) raise SystemExit(1) diff --git 
a/src/scrapingbee_cli/commands/fast_search.py b/src/scrapingbee_cli/commands/fast_search.py index 776b340..9f2b372 100644 --- a/src/scrapingbee_cli/commands/fast_search.py +++ b/src/scrapingbee_cli/commands/fast_search.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +from contextlib import nullcontext import click from click_option_group import optgroup @@ -25,6 +26,7 @@ ) from ..client import Client from ..config import BASE_URL, get_api_key +from ..theme import MiniBeeSpinner, is_repl_mode @click.command("fast-search") @@ -108,6 +110,7 @@ async def api_call(client, q): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), + usage_info=usage_info, ) return @@ -116,15 +119,17 @@ async def api_call(client, q): raise SystemExit(1) async def _single() -> None: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.fast_search( - query, - page=page, - country_code=country_code, - language=language, - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + _spinner = MiniBeeSpinner("fast-search") if is_repl_mode() else nullcontext() + with _spinner: + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.fast_search( + query, + page=page, + country_code=country_code, + language=language, + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) from ..credits import fast_search_credits diff --git a/src/scrapingbee_cli/commands/google.py b/src/scrapingbee_cli/commands/google.py index 2ce4c51..eec7ae1 100644 --- a/src/scrapingbee_cli/commands/google.py +++ b/src/scrapingbee_cli/commands/google.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +from contextlib import nullcontext import click from click_option_group import optgroup @@ -29,6 +30,7 @@ ) from ..client import Client from ..config import BASE_URL, 
get_api_key +from ..theme import MiniBeeSpinner, is_repl_mode def _warn_empty_organic(data: bytes, search_type: str | None) -> None: @@ -179,6 +181,7 @@ async def api_call(client, q): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), + usage_info=usage_info, ) return @@ -187,21 +190,23 @@ async def api_call(client, q): raise SystemExit(1) async def _single() -> None: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.google_search( - query, - search_type=norm_val(search_type), - country_code=country_code, - device=device, - page=page, - language=language, - nfpr=parse_bool(nfpr), - extra_params=extra_params, - add_html=parse_bool(add_html), - light_request=parse_bool(light_request), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + _spinner = MiniBeeSpinner("google") if is_repl_mode() else nullcontext() + with _spinner: + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.google_search( + query, + search_type=norm_val(search_type), + country_code=country_code, + device=device, + page=page, + language=language, + nfpr=parse_bool(nfpr), + extra_params=extra_params, + add_html=parse_bool(add_html), + light_request=parse_bool(light_request), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) _warn_empty_organic(data, search_type) from ..credits import google_credits diff --git a/src/scrapingbee_cli/commands/scrape.py b/src/scrapingbee_cli/commands/scrape.py index 53cba9a..38fd818 100644 --- a/src/scrapingbee_cli/commands/scrape.py +++ b/src/scrapingbee_cli/commands/scrape.py @@ -5,6 +5,7 @@ import asyncio import json import os +from contextlib import nullcontext import click from click_option_group import optgroup @@ -38,6 +39,7 @@ from ..client import Client, pretty_json from ..config import BASE_URL, get_api_key from 
..crawl import _preferred_extension_from_scrape_params +from ..theme import LiveCreditTracker, MiniBeeSpinner, echo_error, is_repl_mode def _apply_chunking(url: str, data: bytes, chunk_size: int, chunk_overlap: int) -> bytes: @@ -703,7 +705,10 @@ def _ndjson_cb(result): if failed: raise SystemExit(1) - asyncio.run(_batch()) + _rem = usage_info.get("credits") if usage_info else None + _tot = usage_info.get("max_api_credit") if usage_info else None + with LiveCreditTracker(key, initial_remaining=_rem, total=_tot): + asyncio.run(_batch()) return if not url and not scraping_config: @@ -715,18 +720,25 @@ def _ndjson_cb(result): async def _single() -> None: scrape_url = url or "" # empty when using --scraping-config (API uses config's URL) - async with Client(key, BASE_URL, timeout=client_timeout) as client: - if escalate_proxy: - data, resp_headers, status_code = await scrape_with_escalation( - client, - scrape_url, - scrape_kwargs, - verbose=obj["verbose"], - ) - else: - data, resp_headers, status_code = await client.scrape(scrape_url, **scrape_kwargs) + _spinner = MiniBeeSpinner("scrape") if is_repl_mode() else nullcontext() + with _spinner: + async with Client(key, BASE_URL, timeout=client_timeout) as client: + if escalate_proxy: + data, resp_headers, status_code = await scrape_with_escalation( + client, + scrape_url, + scrape_kwargs, + verbose=obj["verbose"], + ) + else: + data, resp_headers, status_code = await client.scrape( + scrape_url, **scrape_kwargs + ) if not scrape_kwargs.get("transparent_status_code") and status_code >= 400: - click.echo(f"Error: HTTP {status_code}", err=True) + if is_repl_mode(): + echo_error(f"Error: HTTP {status_code}") + else: + click.echo(f"Error: HTTP {status_code}", err=True) try: click.echo(pretty_json(data), err=True) except Exception: diff --git a/src/scrapingbee_cli/commands/usage.py b/src/scrapingbee_cli/commands/usage.py index 418fc15..beadfe7 100644 --- a/src/scrapingbee_cli/commands/usage.py +++ 
b/src/scrapingbee_cli/commands/usage.py @@ -3,6 +3,8 @@ from __future__ import annotations import asyncio +import json as _json +from contextlib import nullcontext import click @@ -10,6 +12,7 @@ from ..cli_utils import _output_options, store_common_options from ..client import Client, parse_usage, pretty_json from ..config import BASE_URL, get_api_key +from ..theme import MiniBeeSpinner, is_repl_mode @click.command() @@ -27,25 +30,84 @@ def usage_cmd(obj: dict, **kwargs) -> None: backoff = float(obj.get("backoff") or 2.0) async def _run() -> None: - async with Client(key, BASE_URL) as client: - data, _, status_code = await client.usage(retries=retries, backoff=backoff) - if status_code != 200: - click.echo( - f"API returned status {status_code}: {data.decode('utf-8', errors='replace')}", - err=True, - ) - raise SystemExit(1) - # Warm the shared file cache so concurrent batch subprocesses skip the API call. - write_usage_file_cache(key, parse_usage(data)) - output_file = obj.get("output_file") - if output_file: - with open(output_file, "w", encoding="utf-8") as f: - f.write(pretty_json(data) + "\n") - else: - click.echo(pretty_json(data)) + _spinner = MiniBeeSpinner("usage") if is_repl_mode() else nullcontext() + with _spinner: + async with Client(key, BASE_URL) as client: + data, _, status_code = await client.usage(retries=retries, backoff=backoff) + if status_code != 200: + click.echo( + f"API returned status {status_code}: {data.decode('utf-8', errors='replace')}", + err=True, + ) + raise SystemExit(1) + # Warm the shared file cache so concurrent batch subprocesses skip the API call. 
+ write_usage_file_cache(key, parse_usage(data)) + + if is_repl_mode(): + _show_repl_usage(data) + else: + output_file = obj.get("output_file") + if output_file: + with open(output_file, "w", encoding="utf-8") as f: + f.write(pretty_json(data) + "\n") + else: + click.echo(pretty_json(data)) asyncio.run(_run()) +def _show_repl_usage(data: bytes) -> None: + """Render a fully styled usage dashboard to stderr (REPL mode only).""" + from rich.text import Text + + from ..theme import ( + BEE_YELLOW, + _render_inline_bee, + echo_key_value, + echo_separator, + err_console, + format_honeycomb_meter, + ) + + raw = _json.loads(data) + + header = Text() + header.append(" ") + header.append_text(_render_inline_bee(0)) + header.append(" Credit Usage", style=f"bold {BEE_YELLOW}") + err_console.print(header) + err_console.print() + + used = raw.get("used_api_credit", 0) or 0 + total = raw.get("max_api_credit", 0) or 0 + remaining = total - used + + meter = format_honeycomb_meter(used, total) + err_console.print(meter) + err_console.print() + + echo_key_value("Credits used", f"{used:,}") + echo_key_value("Credits remaining", f"{remaining:,}") + echo_key_value("Total credits", f"{total:,}") + err_console.print() + + max_conc = raw.get("max_concurrency", "N/A") + cur_conc = raw.get("current_concurrency", 0) + echo_key_value("Max concurrency", str(max_conc)) + echo_key_value("Current concurrency", str(cur_conc)) + err_console.print() + + renewal = raw.get("renewal_subscription_date", "") + if renewal: + try: + date_part, time_part = renewal.split("T") + time_clean = time_part.split(".")[0][:5] + echo_key_value("Renewal date", f"{date_part} {time_clean} UTC") + except Exception: + echo_key_value("Renewal date", renewal) + + echo_separator() + + def register(cli: click.Group) -> None: cli.add_command(usage_cmd, "usage") diff --git a/src/scrapingbee_cli/commands/walmart.py b/src/scrapingbee_cli/commands/walmart.py index a9100a2..6b43918 100644 --- 
a/src/scrapingbee_cli/commands/walmart.py +++ b/src/scrapingbee_cli/commands/walmart.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +from contextlib import nullcontext import click from click_option_group import optgroup @@ -30,6 +31,7 @@ ) from ..client import Client from ..config import BASE_URL, get_api_key +from ..theme import MiniBeeSpinner, is_repl_mode WALMART_SORT_BY = ["best-match", "price-low", "price-high", "best-seller"] @@ -163,6 +165,7 @@ async def api_call(client, q): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), + usage_info=usage_info, ) return @@ -171,25 +174,27 @@ async def api_call(client, q): raise SystemExit(1) async def _single() -> None: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.walmart_search( - query, - start_page=start_page, - min_price=min_price, - max_price=max_price, - sort_by=norm_val(sort_by), - device=device, - domain=domain, - fulfillment_speed=norm_val(fulfillment_speed), - fulfillment_type=norm_val(fulfillment_type), - delivery_zip=delivery_zip, - store_id=store_id, - add_html=parse_bool(add_html), - light_request=parse_bool(light_request), - screenshot=parse_bool(screenshot), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + _spinner = MiniBeeSpinner("walmart-search") if is_repl_mode() else nullcontext() + with _spinner: + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.walmart_search( + query, + start_page=start_page, + min_price=min_price, + max_price=max_price, + sort_by=norm_val(sort_by), + device=device, + domain=domain, + fulfillment_speed=norm_val(fulfillment_speed), + fulfillment_type=norm_val(fulfillment_type), + delivery_zip=delivery_zip, + store_id=store_id, + add_html=parse_bool(add_html), + light_request=parse_bool(light_request), + screenshot=parse_bool(screenshot), + retries=int(obj.get("retries") or 3), 
+ backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) from ..credits import walmart_credits @@ -302,6 +307,7 @@ async def api_call(client, pid): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), + usage_info=usage_info, ) return @@ -310,19 +316,21 @@ async def api_call(client, pid): raise SystemExit(1) async def _single() -> None: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.walmart_product( - product_id, - device=device, - domain=domain, - delivery_zip=delivery_zip, - store_id=store_id, - add_html=parse_bool(add_html), - light_request=parse_bool(light_request), - screenshot=parse_bool(screenshot), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + _spinner = MiniBeeSpinner("walmart-product") if is_repl_mode() else nullcontext() + with _spinner: + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.walmart_product( + product_id, + device=device, + domain=domain, + delivery_zip=delivery_zip, + store_id=store_id, + add_html=parse_bool(add_html), + light_request=parse_bool(light_request), + screenshot=parse_bool(screenshot), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) from ..credits import walmart_credits diff --git a/src/scrapingbee_cli/commands/youtube.py b/src/scrapingbee_cli/commands/youtube.py index b41e436..f5ee828 100644 --- a/src/scrapingbee_cli/commands/youtube.py +++ b/src/scrapingbee_cli/commands/youtube.py @@ -4,6 +4,7 @@ import asyncio import re +from contextlib import nullcontext import click from click_option_group import optgroup @@ -28,6 +29,7 @@ ) from ..client import Client from ..config import BASE_URL, get_api_key +from ..theme import MiniBeeSpinner, is_repl_mode YOUTUBE_UPLOAD_DATE = ["today", "last-hour", "this-week", "this-month", "this-year"] @@ -250,6 
+252,7 @@ async def api_call(client, q): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), + usage_info=usage_info, ) return @@ -258,27 +261,29 @@ async def api_call(client, q): raise SystemExit(1) async def _single() -> None: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.youtube_search( - query, - upload_date=norm_val(upload_date), - type=type_, - duration=duration, - sort_by=norm_val(sort_by), - hd=parse_bool(hd), - is_4k=parse_bool(is_4k), - subtitles=parse_bool(subtitles), - creative_commons=parse_bool(creative_commons), - live=parse_bool(live), - is_360=parse_bool(is_360), - is_3d=parse_bool(is_3d), - hdr=parse_bool(hdr), - location=parse_bool(location), - vr180=parse_bool(vr180), - purchased=parse_bool(purchased), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + _spinner = MiniBeeSpinner("youtube-search") if is_repl_mode() else nullcontext() + with _spinner: + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.youtube_search( + query, + upload_date=norm_val(upload_date), + type=type_, + duration=duration, + sort_by=norm_val(sort_by), + hd=parse_bool(hd), + is_4k=parse_bool(is_4k), + subtitles=parse_bool(subtitles), + creative_commons=parse_bool(creative_commons), + live=parse_bool(live), + is_360=parse_bool(is_360), + is_3d=parse_bool(is_3d), + hdr=parse_bool(hdr), + location=parse_bool(location), + vr180=parse_bool(vr180), + purchased=parse_bool(purchased), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) data = _normalize_youtube_search(data) write_output( @@ -362,6 +367,7 @@ async def api_call(client, vid): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), + usage_info=usage_info, ) return @@ -370,12 +376,14 @@ async def api_call(client, vid): 
raise SystemExit(1) async def _single() -> None: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.youtube_metadata( - _extract_video_id(video_id), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + _spinner = MiniBeeSpinner("youtube-metadata") if is_repl_mode() else nullcontext() + with _spinner: + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.youtube_metadata( + _extract_video_id(video_id), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) write_output( data, From 9be548dcc40fa0d6feb45f1a2b34d0acf3035763 Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Thu, 7 May 2026 20:41:24 +0530 Subject: [PATCH 03/15] fix(repl): bug fixes + unified prompt rendering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug fix: remove the outer REPL spinner that wrapped every command. It blocked interactive commands (`tutorial`, `auth`) from prompting the user, masked their output, and double-stacked with the inner MiniBeeSpinner already added in Phase 2 for network commands. Add `tutorial` and `unsafe` to the REPL command list and tab completion (introduced in v1.4.0/v1.4.1, were missing). Prompt: drop the Powerline-arrow protrusion in favour of a single unified yellow tag — ` ScrapingBee ❯ ` — with the chevron inside the tag. Renders identically in every terminal/font (Mac Terminal, Warp, iTerm2, etc.) since it uses only standard BMP glyphs. Set SCRAPINGBEE_POWERLINE=1 to opt back into the Powerline arrow if you have a patched font (Nerd Font / Powerline-patched). 
--- src/scrapingbee_cli/interactive.py | 60 +++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index d8757bd..ceec930 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -124,12 +124,14 @@ def play_splash(version: str) -> None: "youtube-search", "youtube-metadata", "chatgpt", + "tutorial", "auth", "logout", "usage", "schedule", "export", "docs", + "unsafe", ] _COMMAND_HELP: dict[str, str] = { @@ -144,12 +146,14 @@ def play_splash(version: str) -> None: "youtube-search": "Search YouTube videos", "youtube-metadata": "YouTube video metadata", "chatgpt": "Query ChatGPT API", + "tutorial": "Interactive tutorial walkthrough", "auth": "Save your API key", "logout": "Remove stored API key", "usage": "Check credits and concurrency", "schedule": "Schedule recurring scrapes", "export": "Merge batch output files", "docs": "Open ScrapingBee documentation", + "unsafe": "Run an arbitrary scrapingbee URL", "help": "Show this command list", "clear": "Clear the screen", "exit": "Quit the REPL", @@ -192,7 +196,8 @@ def _print_repl_help() -> None: ], "Media": ["youtube-search", "youtube-metadata"], "AI": ["chatgpt"], - "Account": ["auth", "logout", "usage", "schedule", "export", "docs"], + "Learn": ["tutorial"], + "Account": ["auth", "logout", "usage", "schedule", "export", "docs", "unsafe"], } for group_name, cmds in groups.items(): err_console.print(f" [{BEE_DIM}]{group_name}[/]") @@ -232,18 +237,40 @@ def _print_repl_help() -> None: "prompt.hint": "#665500 italic", } -_POWERLINE_ARROW = "\ue0b0" +def _build_static_prompt() -> list[tuple[str, str]]: + """Build the prompt segments. -_STATIC_PROMPT = [ - ( + Default: a single unified yellow tag \u2014 ` ScrapingBee \u276f ` \u2014 with the + chevron rendered *inside* the tag. Identical in every terminal/font: + no protruding shape, no Private Use Area glyphs. 
+ + Set SCRAPINGBEE_POWERLINE=1 to use the classic Powerline arrow that + *protrudes* from the tag (requires a patched font like a Nerd Font). + """ + import os + + hint = ( "class:prompt.hint", " Tab complete \u2502 \u2191\u2193 history \u2502 \u2192 accept \u2502 Ctrl+C exit\n", - ), - ("", "\n"), - ("class:prompt.tag", " ScrapingBee "), - ("class:prompt.arrow", _POWERLINE_ARROW), - ("class:prompt.space", " "), -] + ) + blank = ("", "\n") + space = ("class:prompt.space", " ") + + if os.environ.get("SCRAPINGBEE_POWERLINE", "").lower() in ("1", "true", "yes"): + return [ + hint, + blank, + ("class:prompt.tag", " ScrapingBee "), + ("class:prompt.arrow", "\ue0b0"), + space, + ] + + return [ + hint, + blank, + ("class:prompt.tag", " ScrapingBee \u276f "), + space, + ] # --------------------------------------------------------------------------- @@ -406,7 +433,7 @@ def run_repl(cli_group: object, version: str) -> None: import click - from .theme import MiniBeeSpinner, set_repl_mode + from .theme import set_repl_mode set_repl_mode(True) @@ -417,10 +444,11 @@ def run_repl(cli_group: object, version: str) -> None: Path(history_path).parent.mkdir(parents=True, exist_ok=True) session = _build_session(history_path) + static_prompt = _build_static_prompt() while True: try: - line = session.prompt(_STATIC_PROMPT).strip() + line = session.prompt(static_prompt).strip() except KeyboardInterrupt: err_console.print() err_console.print(f" [bold {BEE_YELLOW}]Buzz off! See you next time.[/]") @@ -467,9 +495,9 @@ def run_repl(cli_group: object, version: str) -> None: # Gap between prompt and command output sys.stderr.write("\n") - # Run command with bee spinner - spinner = MiniBeeSpinner(args[0]) - spinner.start() + # No outer spinner: commands that benefit show their own MiniBeeSpinner + # via is_repl_mode() (network calls). An outer spinner here would also + # block interactive commands like `tutorial` / `auth` from prompting. 
try: cli_group.main(args, standalone_mode=False) # type: ignore[union-attr] except click.ClickException as e: @@ -478,5 +506,3 @@ def run_repl(cli_group: object, version: str) -> None: pass except Exception as e: err_console.print(f" [bold {BEE_RED}]Error: {e}[/]") - finally: - spinner.stop() From ceae0b89c6b5ad2f4864b7ee63df5ce9d0012f48 Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Thu, 7 May 2026 20:51:17 +0530 Subject: [PATCH 04/15] refactor(repl): full rewrite for a clean, professional UX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Treat the REPL as a tool, not a mascot. The previous version prioritised personality (splash, ASCII logo, bee emoticons, rotating fun facts, cute exit) over getting out of the user's way. This rewrite swaps that for psql/redis-cli/gh-style density and consistency. interactive.py — full rewrite: - Remove the bee splash animation, ASCII-art logos, repeated hint line on every prompt. - One-line banner on startup, then prompt. - Colon-prefixed REPL meta-commands (`:help`, `:q`, `:clear`, `:set`, `:unset`, `:show`) so they don't collide with click commands. Bare aliases (`help`, `exit`, `quit`, `q`, `clear`) still work for muscle memory. - Per-command tab completion driven by walking the click tree at startup — `youtube-search --` now shows YouTube flags, `scrape --` shows scrape flags. Bool/Choice flags auto-detected from click param types (no more flat `_COMMON_FLAGS` list that drifts from reality). - Uniform output frame around every command: `─── cmd ─── ` divider on top, `[ok]/[fail] 1.23s` line on the bottom. - Bottom toolbar with live state: credits remaining (read from the existing usage cache), last command name + status + duration, active session settings. - "Did you mean?" suggestions on unknown commands and on click "no such option" errors (Levenshtein distance, threshold 2). - Multi-line input via trailing backslash continuation.
- Session settings via `:set country-code=fr`, applied as default flags to subsequent commands when not explicitly overridden. - `:clear` uses standard `\033[2J\033[H` instead of the previous scroll-and-jump heuristic. - Silent exit (no "Buzz off!" message). theme.py: - Replace MiniBeeSpinner's emoticon flap frames + rotating "Bee facts" + time-of-day flavour messages with a single line: ten braille-dot frames + the command name. Same API (`with MiniBeeSpinner("scrape"):`) — call sites unchanged. - Drop dead module-level state: MESSAGES, _BEE_FACTS, _MSG_ROTATE_TICKS, _time_flavor. All 653 unit tests still pass. SCRAPINGBEE_POWERLINE=1 still opts into the protruding Powerline arrow for users with patched fonts. --- src/scrapingbee_cli/interactive.py | 855 +++++++++++++++++------------ src/scrapingbee_cli/theme.py | 184 +------ 2 files changed, 527 insertions(+), 512 deletions(-) diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index ceec930..cad95ab 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -1,374 +1,307 @@ -"""Interactive REPL mode for ScrapingBee CLI.""" +"""Interactive REPL mode for ScrapingBee CLI. + +Goals — explicit on purpose, since the previous version drifted from these: +- Get out of the user's way: no splash, no logos, no animation. +- One-line banner. Single unified prompt tag. +- Output frame uniform across every command. +- Slash-prefixed REPL meta-commands so they don't collide with click. +- Per-command tab completion driven by the click tree (no flag duplication). +- Bottom toolbar with live state (credits, last status, duration). +- "Did you mean?" on typos. Multi-line input via trailing backslash. +- Session settings via `:set KEY=VAL` and `:show`. 
+""" from __future__ import annotations +import os import shlex import sys import time +from typing import TYPE_CHECKING, Any, Iterable from rich.text import Text from .theme import BEE_DIM, BEE_RED, BEE_YELLOW, err_console -# Secondary brand colour for accents (footer, dimmed elements) -_BEE_ORANGE = "#FFB13D" +if TYPE_CHECKING: + import click + # --------------------------------------------------------------------------- -# Splash animation +# Banner & first-launch hint # --------------------------------------------------------------------------- -_SCRAPINGBEE_LOGO = [ - " ███████╗ ██████╗██████╗ █████╗ ██████╗ ██╗███╗ ██╗ ██████╗ ", - " ██╔════╝██╔════╝██╔══██╗██╔══██╗██╔══██╗██║████╗ ██║██╔════╝ ", - " ███████╗██║ ██████╔╝███████║██████╔╝██║██╔██╗ ██║██║ ███╗", - " ╚════██║██║ ██╔══██╗██╔══██║██╔═══╝ ██║██║╚██╗██║██║ ██║", - " ███████║╚██████╗██║ ██║██║ ██║██║ ██║██║ ╚████║╚██████╔╝", - " ╚══════╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═══╝ ╚═════╝ ", -] - -_BEE_LOGO = [ - " ██████╗ ███████╗███████╗", - " ██╔══██╗██╔════╝██╔════╝", - " ██████╔╝█████╗ █████╗ ", - " ██╔══██╗██╔══╝ ██╔══╝ ", - " ██████╔╝███████╗███████╗", - " ╚═════╝ ╚══════╝╚══════╝", -] - -_BEE_FRAMES = ["\\(o_o)/", "_(o_o)_", "/(o_o)\\", "_(o_o)_"] - - -def play_splash(version: str) -> None: - """Bee accelerates across screen with bounce, then logo reveal.""" - if not sys.stderr.isatty(): - return - - import shutil - - width = shutil.get_terminal_size((80, 24)).columns - max_pos = min(width - 12, 55) - - # Phase 1: Bee accelerates right (ease-in), then bounces back slightly - total_steps = 40 - positions: list[int] = [] - for s in range(total_steps): - # ease-in-out cubic: accelerate → decelerate - t = s / (total_steps - 1) - eased = t * t * (3 - 2 * t) # smoothstep - positions.append(int(eased * max_pos)) - # Add a small bounce at the end - bounce_back = max(0, max_pos - 4) - positions.extend([bounce_back, max_pos - 2, max_pos]) - - for i, pos in enumerate(positions): - bee = _BEE_FRAMES[i % 
len(_BEE_FRAMES)] - # Fading honeycomb trail - trail_len = min(pos, 25) - trail = Text() - trail.append(" " * (pos - trail_len)) - for t_i in range(trail_len): - # Fade trail: older chars dimmer - age = trail_len - t_i - if age > 18: - trail.append("·", style="dim") - elif age > 10: - trail.append("~", style=_BEE_ORANGE) - else: - trail.append("~", style=f"bold {BEE_YELLOW}") - trail.append(bee, style=f"bold {BEE_YELLOW}") - - with err_console.capture() as cap: - err_console.print(trail, end="") - sys.stderr.write("\r\033[K" + cap.get()) - sys.stderr.flush() - # Speed: fast start, slow near end - delay = 0.012 + 0.015 * (i / len(positions)) - time.sleep(delay) - - sys.stderr.write("\r\033[K") - sys.stderr.flush() - time.sleep(0.12) - - # Phase 2: Logo appears line by line - err_console.print() - for logo_line in _SCRAPINGBEE_LOGO: - err_console.print(f"[bold {BEE_YELLOW}]{logo_line}[/]") - time.sleep(0.03) - for logo_line in _BEE_LOGO: - err_console.print(f"[bold white]{logo_line}[/]") - time.sleep(0.03) +def _print_banner(version: str) -> None: + """One-line banner. 
No animation, no logo, no nonsense.""" + line = Text() + line.append(" ScrapingBee ", style=f"bold black on {BEE_YELLOW}") + line.append(" ") + line.append(f"v{version}", style=f"bold {BEE_YELLOW}") + line.append(" ") + line.append("Type ", style=BEE_DIM) + line.append(":help", style=f"bold {BEE_YELLOW}") + line.append(" for commands, ", style=BEE_DIM) + line.append(":q", style=f"bold {BEE_YELLOW}") + line.append(" to quit.", style=BEE_DIM) err_console.print() - ver = Text() - ver.append(f" v{version}", style=f"bold {BEE_YELLOW}") - ver.append(" \u2502 ", style="dim") - ver.append("Web scraping from the terminal", style="dim") - err_console.print(ver) + err_console.print(line) err_console.print() - time.sleep(0.15) -# --------------------------------------------------------------------------- -# Command registry & help -# --------------------------------------------------------------------------- - -_COMMANDS = [ - "scrape", - "crawl", - "google", - "fast-search", - "amazon-product", - "amazon-search", - "walmart-product", - "walmart-search", - "youtube-search", - "youtube-metadata", - "chatgpt", - "tutorial", - "auth", - "logout", - "usage", - "schedule", - "export", - "docs", - "unsafe", -] - -_COMMAND_HELP: dict[str, str] = { - "scrape": "Scrape a web page (single or batch)", - "crawl": "Crawl a site following links", - "google": "Google Search API", - "fast-search": "Fast Search API (sub-second)", - "amazon-product": "Amazon product details", - "amazon-search": "Search Amazon products", - "walmart-product": "Walmart product details", - "walmart-search": "Search Walmart products", - "youtube-search": "Search YouTube videos", - "youtube-metadata": "YouTube video metadata", - "chatgpt": "Query ChatGPT API", - "tutorial": "Interactive tutorial walkthrough", - "auth": "Save your API key", - "logout": "Remove stored API key", - "usage": "Check credits and concurrency", - "schedule": "Schedule recurring scrapes", - "export": "Merge batch output files", - "docs": "Open 
ScrapingBee documentation", - "unsafe": "Run an arbitrary scrapingbee URL", - "help": "Show this command list", - "clear": "Clear the screen", - "exit": "Quit the REPL", -} - -_COMMON_FLAGS = [ - "--verbose", - "--output-file", - "--retries", - "--backoff", - "--render-js", - "--premium-proxy", - "--stealth-proxy", - "--country-code", - "--return-page-markdown", - "--return-page-text", - "--extract-rules", - "--ai-extract-rules", - "--ai-query", - "--input-file", - "--output-dir", - "--output-format", - "--concurrency", - "--screenshot", - "--json-response", - "--help", -] - - -def _print_repl_help() -> None: +def _print_help(commands: dict[str, str]) -> None: + """Print the command list, grouped, plus the slash-command meta list.""" err_console.print() - groups = { - "Scraping": ["scrape", "crawl"], - "Search": ["google", "fast-search"], - "Marketplaces": [ - "amazon-product", - "amazon-search", - "walmart-product", - "walmart-search", - ], - "Media": ["youtube-search", "youtube-metadata"], - "AI": ["chatgpt"], - "Learn": ["tutorial"], - "Account": ["auth", "logout", "usage", "schedule", "export", "docs", "unsafe"], + groups: dict[str, list[str]] = { + "Pages": ["scrape", "crawl"], + "Search": ["google", "fast-search"], + "Marketplaces": ["amazon-product", "amazon-search", + "walmart-product", "walmart-search"], + "Media": ["youtube-search", "youtube-metadata"], + "AI": ["chatgpt"], + "Learn": ["tutorial"], + "Account": ["auth", "logout"], + "Tools": ["usage", "schedule", "export", "docs", "unsafe"], } + for group_name, cmds in groups.items(): err_console.print(f" [{BEE_DIM}]{group_name}[/]") for cmd in cmds: + help_text = commands.get(cmd, "") err_console.print( - f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{_COMMAND_HELP.get(cmd, '')}[/]" + f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{help_text}[/]" ) + err_console.print() - err_console.print(f" [{BEE_DIM}]Session[/]") - for cmd in ("help", "clear", "exit"): + err_console.print(f" [{BEE_DIM}]REPL[/]") + meta_cmds = [ + 
(":help, :?", "Show this command list"), + (":clear", "Clear the screen"), + (":set K=V", "Set a session default (e.g. :set country-code=fr)"), + (":unset K", "Remove a session default"), + (":show", "Show current session defaults"), + (":q, :quit", "Quit the REPL"), + ] + for cmd, desc in meta_cmds: err_console.print( - f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{_COMMAND_HELP.get(cmd, '')}[/]" + f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{desc}[/]" ) err_console.print() # --------------------------------------------------------------------------- -# prompt_toolkit setup (ScrapingBee brand theme) +# Click tree introspection (per-command flags / values) # --------------------------------------------------------------------------- + +def _walk_click_tree(cli_group: Any) -> tuple[ + dict[str, str], # command -> short help + dict[str, list[str]], # command -> [flag, ...] + set[str], # bool flags (any command) + dict[str, list[str]], # flag -> [choice, ...] +]: + """Inspect the click group and return discovery data for completion + help. + + Returns (command_help, command_flags, bool_flags, choice_flags). 
+ """ + import click + + command_help: dict[str, str] = {} + command_flags: dict[str, list[str]] = {} + bool_flags: set[str] = set() + choice_flags: dict[str, list[str]] = {} + + for name, cmd in cli_group.commands.items(): + command_help[name] = (cmd.short_help or cmd.help or "").strip().splitlines()[0:1] and \ + (cmd.short_help or cmd.help or "").strip().splitlines()[0] or "" + + flags: list[str] = [] + for param in cmd.params: + if not isinstance(param, click.Option): + continue + for opt in param.opts: + if opt.startswith("--"): + flags.append(opt) + if param.is_flag: + bool_flags.add(opt) + if isinstance(param.type, click.Choice): + choice_flags[opt] = list(param.type.choices) + command_flags[name] = sorted(set(flags)) + + return command_help, command_flags, bool_flags, choice_flags + + +# --------------------------------------------------------------------------- +# Prompt segment builder +# --------------------------------------------------------------------------- + + _STYLE_DICT = { - # Prompt: powerline arrow tag - "prompt.tag": "bg:#FFCD23 #000000 bold", - "prompt.arrow": "#FFCD23 bold", + # Prompt: yellow tag with chevron inside (or Powerline arrow if opted in) + "prompt.tag": f"bg:{BEE_YELLOW} #000000 bold", + "prompt.arrow": f"{BEE_YELLOW} bold", + "prompt.cont": f"{BEE_DIM}", "prompt.space": "", # Completion dropdown - "completion-menu": "bg:#1a1400", - "completion-menu.completion": "bg:#1a1400 #FFCD23", - "completion-menu.completion.current": "bg:#FFCD23 #000000 bold", - "completion-menu.meta.completion": "bg:#1a1400 #886600", - "completion-menu.meta.completion.current": "bg:#FFCD23 #000000", - "scrollbar.background": "bg:#1a1400", - "scrollbar.button": "bg:#FFCD23", + "completion-menu": "bg:#1a1400", + "completion-menu.completion": f"bg:#1a1400 {BEE_YELLOW}", + "completion-menu.completion.current": f"bg:{BEE_YELLOW} #000000 bold", + "completion-menu.meta.completion": "bg:#1a1400 #886600", + "completion-menu.meta.completion.current": 
f"bg:{BEE_YELLOW} #000000", + "scrollbar.background": "bg:#1a1400", + "scrollbar.button": f"bg:{BEE_YELLOW}", # Ghost text - "auto-suggestion": "fg:#554400 italic", - # Hint line (above prompt) - "prompt.hint": "#665500 italic", + "auto-suggestion": f"fg:#554400 italic", + # Bottom toolbar + "bottom-toolbar": f"bg:#1a1400 {BEE_DIM}", + "bottom-toolbar.label": f"bg:#1a1400 {BEE_DIM}", + "bottom-toolbar.value": f"bg:#1a1400 {BEE_YELLOW} bold", + "bottom-toolbar.ok": f"bg:#1a1400 #22C55E bold", + "bottom-toolbar.fail": f"bg:#1a1400 {BEE_RED} bold", } -def _build_static_prompt() -> list[tuple[str, str]]: - """Build the prompt segments. - - Default: a single unified yellow tag \u2014 ` ScrapingBee \u276f ` \u2014 with the - chevron rendered *inside* the tag. Identical in every terminal/font: - no protruding shape, no Private Use Area glyphs. - Set SCRAPINGBEE_POWERLINE=1 to use the classic Powerline arrow that - *protrudes* from the tag (requires a patched font like a Nerd Font). - """ - import os +def _powerline_mode() -> bool: + return os.environ.get("SCRAPINGBEE_POWERLINE", "").lower() in ("1", "true", "yes") - hint = ( - "class:prompt.hint", - " Tab complete \u2502 \u2191\u2193 history \u2502 \u2192 accept \u2502 Ctrl+C exit\n", - ) - blank = ("", "\n") - space = ("class:prompt.space", " ") - if os.environ.get("SCRAPINGBEE_POWERLINE", "").lower() in ("1", "true", "yes"): +def _build_main_prompt() -> list[tuple[str, str]]: + """Primary prompt segments. 
No hint line — that's only on startup.""" + if _powerline_mode(): return [ - hint, - blank, - ("class:prompt.tag", " ScrapingBee "), - ("class:prompt.arrow", "\ue0b0"), - space, + ("class:prompt.tag", " ScrapingBee "), + ("class:prompt.arrow", ""), + ("class:prompt.space", " "), ] - return [ - hint, - blank, - ("class:prompt.tag", " ScrapingBee \u276f "), - space, + ("class:prompt.tag", " ScrapingBee ❯ "), + ("class:prompt.space", " "), ] +def _build_continuation_prompt() -> list[tuple[str, str]]: + """Continuation prompt for multi-line input (after a trailing `\\`).""" + return [("class:prompt.cont", " … ")] + + # --------------------------------------------------------------------------- -# Flag value completions +# Session state # --------------------------------------------------------------------------- -_BOOL_FLAGS = frozenset( - { - "--render-js", - "--block-ads", - "--block-resources", - "--premium-proxy", - "--stealth-proxy", - "--forward-headers", - "--forward-headers-pure", - "--json-response", - "--screenshot", - "--screenshot-full-page", - "--return-page-source", - "--return-page-markdown", - "--return-page-text", - "--custom-google", - "--transparent-status-code", - "--add-html", - "--light-request", - "--deduplicate", - "--resume", - "--autothrottle", - } -) - -_CHOICE_FLAGS: dict[str, list[str]] = { - "--device": ["desktop", "mobile"], - "--output-format": ["files", "csv", "ndjson"], - "--method": ["GET", "POST", "PUT"], - "--wait-browser": ["domcontentloaded", "load", "networkidle0", "networkidle2"], - "--sort-by": ["best-match", "price-low", "price-high", "best-seller", "most-recent"], - "--search-type": ["web", "images", "news", "videos", "shopping"], - "--type": ["video", "channel", "playlist", "movie"], - "--duration": ["short", "medium", "long"], - "--upload-date": ["today", "last-hour", "this-week", "this-month", "this-year"], - "--preset": [ - "screenshot", - "screenshot-and-html", - "fetch", - "extract-links", - "extract-emails", - 
"extract-phones", - "scroll-page", - ], -} +class SessionState: + """REPL-wide mutable state. + + Holds the bottom-toolbar inputs and `:set` defaults. Settings keys are + stored without the `--` prefix and applied as `--key value` if the + user's command doesn't already include that flag. + """ + + def __init__(self) -> None: + self.last_status: str | None = None # "ok" | "fail" | None + self.last_duration: float | None = None + self.last_command: str | None = None + self.credits: int | None = None + self.settings: dict[str, str] = {} + + def apply_settings_to_args(self, args: list[str]) -> list[str]: + """Inject session defaults as flags, unless the user passed them already.""" + if not self.settings: + return args + present = {a for a in args if a.startswith("--")} + out = list(args) + for key, value in self.settings.items(): + flag = f"--{key}" + if flag in present: + continue + out.extend([flag, value]) + return out + + def refresh_credits_from_cache(self) -> None: + """Read cached usage from disk if available — non-blocking, best-effort.""" + try: + import json + from pathlib import Path + + cache = Path.home() / ".config" / "scrapingbee-cli" / "usage_cache.json" + if not cache.exists(): + return + data = json.loads(cache.read_text(encoding="utf-8")) + for entry in data.values() if isinstance(data, dict) else []: + creds = entry.get("credits") if isinstance(entry, dict) else None + if isinstance(creds, int): + self.credits = creds + return + except Exception: + return + + +# --------------------------------------------------------------------------- +# prompt_toolkit machinery +# --------------------------------------------------------------------------- -def _make_completer(): + +def _make_completer( + commands: list[str], + command_flags: dict[str, list[str]], + bool_flags: set[str], + choice_flags: dict[str, list[str]], + command_help: dict[str, str], +): + """Per-command tab completion driven by the click tree.""" from prompt_toolkit.completion import 
Completer, Completion + meta_cmds = [":help", ":?", ":clear", ":set", ":unset", ":show", ":q", ":quit"] + class BeeCompleter(Completer): def get_completions(self, document, complete_event): - stripped = document.text_before_cursor.lstrip() - words = stripped.split() + text = document.text_before_cursor.lstrip() + words = text.split() + on_first = (not text) or (len(words) == 1 and not text.endswith(" ")) - on_first_word = (not stripped) or (len(words) == 1 and not stripped.endswith(" ")) - if on_first_word: + # First word: command names + slash-commands + if on_first: partial = words[0].lower() if words else "" - for cmd in sorted(_COMMANDS + ["help", "clear", "exit"]): + pool: list[tuple[str, str]] = [(c, command_help.get(c, "")) for c in commands] + pool.extend((m, "REPL meta") for m in meta_cmds) + for cmd, meta in sorted(pool): if cmd.startswith(partial): yield Completion( - cmd, - start_position=-len(partial), - display_meta=_COMMAND_HELP.get(cmd, ""), + cmd, start_position=-len(partial), display_meta=meta ) return + # Inside a command: use that command's flags + cmd_name = words[0] + flags_for_cmd = command_flags.get(cmd_name, []) last = words[-1] if words else "" prev = words[-2] if len(words) >= 2 else "" - if stripped.endswith(" ") and prev in _BOOL_FLAGS: - yield Completion("true", display_meta="enable") + # After a bool flag with trailing space: suggest true/false + if text.endswith(" ") and prev in bool_flags: + yield Completion("true", display_meta="enable") yield Completion("false", display_meta="disable") return - if stripped.endswith(" ") and prev in _CHOICE_FLAGS: - for val in _CHOICE_FLAGS[prev]: - yield Completion(val) + # After a choice flag with trailing space: suggest choices + if text.endswith(" ") and prev in choice_flags: + for v in choice_flags[prev]: + yield Completion(v) return + # Mid-typing a value for a known flag (no trailing space) if len(words) >= 2 and not last.startswith("-"): - flag = words[-2] - if flag in _BOOL_FLAGS: - for 
val in ("true", "false"): - if val.startswith(last.lower()): - yield Completion(val, start_position=-len(last)) + if prev in bool_flags: + for v in ("true", "false"): + if v.startswith(last.lower()): + yield Completion(v, start_position=-len(last)) return - if flag in _CHOICE_FLAGS: - for val in _CHOICE_FLAGS[flag]: - if val.startswith(last.lower()): - yield Completion(val, start_position=-len(last)) + if prev in choice_flags: + for v in choice_flags[prev]: + if v.startswith(last.lower()): + yield Completion(v, start_position=-len(last)) return + # Typing a flag if last.startswith("-"): - for flag in _COMMON_FLAGS: + for flag in flags_for_cmd: if flag.startswith(last): yield Completion(flag, start_position=-len(last)) @@ -383,23 +316,18 @@ def _make_key_bindings(): @kb.add("enter", filter=has_completions) def _accept_completion(event): - """Enter with completion menu: keep current selection, close menu.""" - # prompt_toolkit already applies the selected completion as a preview - # in the buffer during navigation. Just dismiss the menu. 
event.current_buffer.complete_state = None @kb.add("enter", filter=~has_completions) def _submit_or_ignore(event): - """Enter without completion menu: submit if non-empty, else do nothing.""" buf = event.current_buffer if buf.text.strip(): buf.validate_and_handle() - # Empty buffer: do nothing — cursor stays, no duplicate prompt return kb -def _build_session(history_path: str): +def _build_session(history_path: str, completer: Any, toolbar_fn: Any): from prompt_toolkit import PromptSession from prompt_toolkit.auto_suggest import AutoSuggestFromHistory from prompt_toolkit.history import FileHistory @@ -412,11 +340,12 @@ def _build_session(history_path: str): return PromptSession( history=history, - completer=_make_completer(), + completer=completer, complete_while_typing=False, auto_suggest=AutoSuggestFromHistory(), style=Style.from_dict(_STYLE_DICT), key_bindings=_make_key_bindings(), + bottom_toolbar=toolbar_fn, mouse_support=False, enable_history_search=False, vi_mode=False, @@ -424,11 +353,210 @@ def _build_session(history_path: str): # --------------------------------------------------------------------------- -# REPL main loop +# Bottom toolbar # --------------------------------------------------------------------------- -def run_repl(cli_group: object, version: str) -> None: +def _build_toolbar_fn(state: SessionState) -> Any: + """Return a callable producing the bottom toolbar segments.""" + + def render() -> list[tuple[str, str]]: + segs: list[tuple[str, str]] = [("class:bottom-toolbar", " ")] + + # Credits (from cache) + if state.credits is not None: + segs.append(("class:bottom-toolbar.label", "credits ")) + segs.append(("class:bottom-toolbar.value", f"{state.credits:,}")) + else: + segs.append(("class:bottom-toolbar.label", "credits ")) + segs.append(("class:bottom-toolbar.value", "—")) + + segs.append(("class:bottom-toolbar", " ")) + + # Last command status + if state.last_command: + segs.append(("class:bottom-toolbar.label", "last ")) + 
segs.append(("class:bottom-toolbar.value", state.last_command)) + if state.last_status == "ok": + segs.append(("class:bottom-toolbar", " ")) + segs.append(("class:bottom-toolbar.ok", "OK")) + elif state.last_status == "fail": + segs.append(("class:bottom-toolbar", " ")) + segs.append(("class:bottom-toolbar.fail", "FAIL")) + if state.last_duration is not None: + segs.append( + ("class:bottom-toolbar", f" ({state.last_duration:.1f}s)") + ) + else: + segs.append(("class:bottom-toolbar", "no commands run yet")) + + # Active session settings + if state.settings: + segs.append(("class:bottom-toolbar", " ")) + segs.append(("class:bottom-toolbar.label", "set ")) + joined = " ".join(f"{k}={v}" for k, v in state.settings.items()) + segs.append(("class:bottom-toolbar.value", joined)) + + return segs + + return render + + +# --------------------------------------------------------------------------- +# Output frame: uniform divider above + status line below +# --------------------------------------------------------------------------- + + +def _print_command_header(args: list[str]) -> None: + import shutil + + width = shutil.get_terminal_size((80, 24)).columns + label = " " + " ".join(args) + " " + fill = max(3, width - len(label) - 6) + line = Text() + line.append("─── ", style=BEE_DIM) + line.append(label, style=f"bold {BEE_YELLOW}") + line.append("─" * fill, style=BEE_DIM) + err_console.print(line) + + +def _print_command_footer(status: str, duration_s: float) -> None: + line = Text() + line.append(" ") + if status == "ok": + line.append("[ok]", style="bold #22C55E") + elif status == "fail": + line.append("[fail]", style=f"bold {BEE_RED}") + else: + line.append(f"[{status}]", style=BEE_DIM) + line.append(f" {duration_s:.2f}s", style=BEE_DIM) + err_console.print(line) + err_console.print() + + +# --------------------------------------------------------------------------- +# Slash-command meta dispatcher +# 
--------------------------------------------------------------------------- + + +def _handle_meta(line: str, state: SessionState, command_help: dict[str, str]) -> str | None: + """Handle :slash commands (and their bare aliases). Returns: + + - "quit" → break out of the REPL loop + - "ok" → handled, continue to next prompt + - None → not a meta-command, fall through to click + """ + parts = line.strip().split(None, 1) + head = parts[0] + rest = parts[1] if len(parts) > 1 else "" + head_low = head.lower() + + quit_aliases = {":q", ":quit", "exit", "quit", "q"} + help_aliases = {":help", ":?", "help", "?"} + clear_aliases = {":clear", "clear"} + + if head_low in quit_aliases: + return "quit" + + if head_low in help_aliases: + _print_help(command_help) + return "ok" + + if head_low in clear_aliases: + sys.stderr.write("\033[2J\033[H") + sys.stderr.flush() + return "ok" + + if head_low == ":show": + if not state.settings: + err_console.print(f" [{BEE_DIM}]No session defaults set.[/]") + else: + err_console.print() + for k, v in state.settings.items(): + err_console.print(f" [bold {BEE_YELLOW}]{k:<20}[/] [dim]{v}[/]") + err_console.print() + return "ok" + + if head_low == ":unset": + key = rest.strip().lstrip("-") + if not key: + err_console.print(f" [bold {BEE_RED}]usage:[/] :unset KEY") + return "ok" + if key in state.settings: + del state.settings[key] + err_console.print(f" [{BEE_DIM}]unset[/] [bold {BEE_YELLOW}]{key}[/]") + else: + err_console.print(f" [{BEE_DIM}]not set:[/] {key}") + return "ok" + + if head_low == ":set": + if "=" not in rest: + err_console.print(f" [bold {BEE_RED}]usage:[/] :set KEY=VALUE") + return "ok" + key, _, value = rest.partition("=") + key = key.strip().lstrip("-") + value = value.strip() + if not key or not value: + err_console.print(f" [bold {BEE_RED}]usage:[/] :set KEY=VALUE") + return "ok" + state.settings[key] = value + err_console.print(f" [{BEE_DIM}]set[/] [bold {BEE_YELLOW}]{key}[/] = [dim]{value}[/]") + return "ok" + + return None 
+ + +# --------------------------------------------------------------------------- +# Did-you-mean +# --------------------------------------------------------------------------- + + +def _levenshtein(a: str, b: str) -> int: + if a == b: + return 0 + if not a: + return len(b) + if not b: + return len(a) + prev = list(range(len(b) + 1)) + for i, ca in enumerate(a, 1): + curr = [i] + [0] * len(b) + for j, cb in enumerate(b, 1): + cost = 0 if ca == cb else 1 + curr[j] = min(curr[j - 1] + 1, prev[j] + 1, prev[j - 1] + cost) + prev = curr + return prev[-1] + + +def _suggest(typed: str, candidates: Iterable[str], threshold: int = 2) -> str | None: + best: tuple[int, str] | None = None + for c in candidates: + d = _levenshtein(typed.lower(), c.lower()) + if d <= threshold and (best is None or d < best[0]): + best = (d, c) + return best[1] if best else None + + +# --------------------------------------------------------------------------- +# Multi-line input via trailing `\` +# --------------------------------------------------------------------------- + + +def _read_input(session: Any, main_prompt: list, cont_prompt: list) -> str: + """Read a (possibly multi-line) command. 
Trailing `\\` joins the next line.""" + line = session.prompt(main_prompt).rstrip() + while line.endswith("\\"): + more = session.prompt(cont_prompt).rstrip() + line = line[:-1].rstrip() + " " + more + return line + + +# --------------------------------------------------------------------------- +# Main loop +# --------------------------------------------------------------------------- + + +def run_repl(cli_group: Any, version: str) -> None: from pathlib import Path import click @@ -437,72 +565,117 @@ def run_repl(cli_group: object, version: str) -> None: set_repl_mode(True) - play_splash(version) - _print_repl_help() + # Click introspection + command_help, command_flags, bool_flags, choice_flags = _walk_click_tree(cli_group) + command_names = sorted(command_flags.keys()) + + # Banner — once + _print_banner(version) + + # Session state + prompt session + state = SessionState() + state.refresh_credits_from_cache() history_path = str(Path.home() / ".config" / "scrapingbee-cli" / ".history") Path(history_path).parent.mkdir(parents=True, exist_ok=True) - session = _build_session(history_path) - static_prompt = _build_static_prompt() + completer = _make_completer( + command_names, command_flags, bool_flags, choice_flags, command_help + ) + toolbar = _build_toolbar_fn(state) + session = _build_session(history_path, completer, toolbar) + + main_prompt = _build_main_prompt() + cont_prompt = _build_continuation_prompt() while True: try: - line = session.prompt(static_prompt).strip() + line = _read_input(session, main_prompt, cont_prompt).strip() except KeyboardInterrupt: err_console.print() - err_console.print(f" [bold {BEE_YELLOW}]Buzz off! See you next time.[/]") break except EOFError: - continue + err_console.print() + break if not line: continue - lower = line.lower() - if lower in ("exit", "quit", "q"): - err_console.print(f" [bold {BEE_YELLOW}]Buzz off! See you next time.[/]") + # Meta-commands (`:help`, `:set`, `clear`, `exit`, etc.) 
+ meta = _handle_meta(line, state, command_help) + if meta == "quit": break - - if lower in ("help", "?"): - _print_repl_help() - continue - - if lower == "clear": - import shutil - - rows = shutil.get_terminal_size((80, 24)).lines - # Print enough blank lines to scroll old content off screen, - # then move cursor up a few rows so prompt lands near the bottom - # (where the toolbar is) rather than stuck at the very top. - sys.stderr.write("\n" * rows) - sys.stderr.write(f"\033[{rows}A\033[J") - sys.stderr.flush() + if meta == "ok": continue - if lower.startswith("scrapingbee "): - line = line[len("scrapingbee ") :].strip() + # Tolerate users typing `scrapingbee ...` out of muscle memory + if line.lower().startswith("scrapingbee "): + line = line[len("scrapingbee "):].strip() try: args = shlex.split(line) except ValueError as e: - err_console.print(f" [bold {BEE_RED}]Parse error: {e}[/]") + err_console.print(f" [bold {BEE_RED}]parse error:[/] {e}") continue - if not args: continue - # Gap between prompt and command output - sys.stderr.write("\n") + # Unknown command + suggestion (fast path before click runs) + cmd_name = args[0] + if cmd_name not in command_flags: + suggestion = _suggest(cmd_name, command_names) + if suggestion: + err_console.print( + f" [bold {BEE_RED}]unknown:[/] {cmd_name} " + f"[{BEE_DIM}]did you mean[/] [bold {BEE_YELLOW}]{suggestion}[/][{BEE_DIM}]?[/]" + ) + else: + err_console.print(f" [bold {BEE_RED}]unknown:[/] {cmd_name}") + continue + + # Apply session defaults + args = state.apply_settings_to_args(args) + + # Output frame: divider above + _print_command_header(args) + start = time.monotonic() + status = "ok" - # No outer spinner: commands that benefit show their own MiniBeeSpinner - # via is_repl_mode() (network calls). An outer spinner here would also - # block interactive commands like `tutorial` / `auth` from prompting. 
try: - cli_group.main(args, standalone_mode=False) # type: ignore[union-attr] + cli_group.main(args, standalone_mode=False) + except click.UsageError as e: + # Click usage error: try to suggest a flag if message is "no such option" + msg = str(e) + err_console.print(f" [bold {BEE_RED}]usage:[/] {msg}") + if "no such option" in msg.lower(): + # Extract the bad flag and suggest + import re as _re + m = _re.search(r"--?[A-Za-z0-9-]+", msg) + if m: + bad = m.group(0) + suggestion = _suggest(bad, command_flags.get(cmd_name, [])) + if suggestion: + err_console.print( + f" [{BEE_DIM}]did you mean[/] " + f"[bold {BEE_YELLOW}]{suggestion}[/][{BEE_DIM}]?[/]" + ) + status = "fail" except click.ClickException as e: e.show() - except SystemExit: - pass + status = "fail" + except SystemExit as e: + code = e.code if e.code is not None else 0 + if code not in (0, None): + status = "fail" except Exception as e: - err_console.print(f" [bold {BEE_RED}]Error: {e}[/]") + err_console.print(f" [bold {BEE_RED}]error:[/] {e}") + status = "fail" + + duration = time.monotonic() - start + _print_command_footer(status, duration) + + # Update session state for the toolbar + state.last_command = cmd_name + state.last_status = status + state.last_duration = duration + state.refresh_credits_from_cache() diff --git a/src/scrapingbee_cli/theme.py b/src/scrapingbee_cli/theme.py index 40909db..0587b91 100644 --- a/src/scrapingbee_cli/theme.py +++ b/src/scrapingbee_cli/theme.py @@ -124,168 +124,39 @@ def _render_inline_bee(frame_idx: int) -> Text: return text -# -- Contextual status messages per command ---------------------------------- - -_BEE_FACTS = [ - "Did you know? Bees can fly up to 15 mph", - "Did you know? A bee visits 50-100 flowers per trip", - "Did you know? Bees have 5 eyes", - "Did you know? Honey never spoils", - "Did you know? Bees communicate by dancing", - "Did you know? A hive can have 60,000 bees", - "Did you know? Bees flap 200 times per second", - "Did you know? 
Bees can recognize human faces", - "Did you know? One bee makes 1/12 tsp of honey in its life", - "Did you know? Bees navigate using the sun", -] - -MESSAGES: dict[str, list[str]] = { - "scrape": [ - "Scraping", - "Extracting honey", - "Buzzing through HTML", - "Parsing the nectar", - "Dodging bot traps", - *_BEE_FACTS[:3], - ], - "google": [ - "Googling", - "Searching the hive", - "Pollinating results", - "Crawling the web", - "Fetching SERPs", - *_BEE_FACTS[3:6], - ], - "fast-search": [ - "Searching", - "Speed-buzzing", - "Zipping through results", - "Lightning fast", - *_BEE_FACTS[6:8], - ], - "crawl": [ - "Crawling", - "Following the trail", - "Exploring links", - "Mapping the web", - "Discovering pages", - *_BEE_FACTS[1:4], - ], - "usage": [ - "Checking the honeypot", - "Counting credits", - "Buzzing to the API", - *_BEE_FACTS[4:6], - ], - "amazon-product": [ - "Fetching product", - "Browsing the jungle", - "Hunting for deals", - "Reading reviews", - *_BEE_FACTS[7:9], - ], - "amazon-search": [ - "Searching Amazon", - "Flying through the jungle", - "Comparing prices", - "Scanning listings", - *_BEE_FACTS[0:2], - ], - "walmart-search": [ - "Searching Walmart", - "Rolling back prices", - "Scanning the shelves", - *_BEE_FACTS[5:7], - ], - "walmart-product": [ - "Fetching product", - "Checking the aisle", - "Reading the label", - *_BEE_FACTS[8:10], - ], - "youtube-search": [ - "Searching YouTube", - "Streaming honey", - "Tuning in", - "Browsing videos", - *_BEE_FACTS[2:4], - ], - "youtube-metadata": [ - "Fetching metadata", - "Reading the description", - "Counting views", - *_BEE_FACTS[9:10], - ], - "chatgpt": [ - "Querying ChatGPT", - "Consulting the hive mind", - "Thinking bee thoughts", - "Processing prompt", - *_BEE_FACTS[4:6], - ], - "sitemap": [ - "Fetching sitemap", - "Reading the map", - "Charting the course", - *_BEE_FACTS[6:8], - ], - "_default": [ - "Working", - "Buzzing", - "zZZzzzZZ", - "Bee patient", - "Almost done", - *_BEE_FACTS[:5], - ], -} - -# 
How many spinner ticks before rotating to the next message. -_MSG_ROTATE_TICKS = 18 # ~0.9s at 50ms per tick +# -- Spinner ----------------------------------------------------------------- -# -- Flapping-bee spinner (single-line) -------------------------------------- +_DOT_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"] class MiniBeeSpinner: - """Single-line flapping-bee spinner with rotating contextual messages. + """Single-line dot spinner with the command name as the label. Usage:: with MiniBeeSpinner("scrape"): await do_request() - The *message* argument is a command key into ``MESSAGES``. If the key is - not found it is used as a literal first message with ``_default`` extras. + Output is one steady line: a rotating braille-dot frame followed by the + command name. No emoticons, no rotating fun facts, no time-of-day + flavour — just a clean status indicator. """ - def __init__(self, message: str = "scrape") -> None: - # Resolve message list. - if message in MESSAGES: - self._messages = MESSAGES[message] - else: - self._messages = [message] + MESSAGES["_default"] - self._messages = self._messages + _time_flavor() + def __init__(self, message: str = "") -> None: + self._label = message self._stop = threading.Event() self._thread: threading.Thread | None = None def _animate(self) -> None: idx = 0 - msg_idx = 0 while not self._stop.is_set(): - # Rotate message every N ticks. - if idx > 0 and idx % _MSG_ROTATE_TICKS == 0: - msg_idx = (msg_idx + 1) % len(self._messages) - - bee = _render_inline_bee(idx) - msg = self._messages[msg_idx] - dots = "." 
* ((idx % 3) + 1) - + frame = _DOT_FRAMES[idx % len(_DOT_FRAMES)] line = Text() line.append(" ") - line.append_text(bee) - line.append(" ") - line.append(msg, style=f"bold {BEE_YELLOW}") - line.append(dots.ljust(4), style="dim") + line.append(frame, style=f"bold {BEE_YELLOW}") + if self._label: + line.append(f" {self._label}", style="dim") with err_console.capture() as capture: err_console.print(line, end="") @@ -293,7 +164,7 @@ def _animate(self) -> None: sys.stderr.flush() idx += 1 - self._stop.wait(0.05) + self._stop.wait(0.08) # Clear the spinner line. sys.stderr.write("\r\033[K") @@ -794,32 +665,3 @@ def echo_bee_error(status_code: int, fallback_msg: str = "") -> None: echo_error(fallback_msg or f"Error: HTTP {status_code}") -# -- Time-aware messages ----------------------------------------------------- - - -def _time_flavor() -> list[str]: - """Return extra messages based on time of day.""" - from datetime import datetime - - hour = datetime.now().hour - day = datetime.now().weekday() - - extras: list[str] = [] - if 0 <= hour < 6: - extras = ["The web never sleeps", "Late night data hunt", "Nocturnal bee mode"] - elif 6 <= hour < 12: - extras = [ - "Rise and scrape!", - "Fresh morning data", - "Early bird gets the data", - ] - elif 12 <= hour < 18: - extras = ["Afternoon buzz", "Peak pollination hours"] - else: - extras = ["Evening crawl session", "Burning the midnight nectar"] - - if day == 0: - extras.append("Monday motivation: fresh data!") - elif day == 4: - extras.append("TGIF — last scrape of the week?") - return extras From a9afab46b4622ee89082668f58eaa61397315d08 Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Thu, 7 May 2026 22:39:19 +0530 Subject: [PATCH 05/15] feat(repl): bordered input area, smarter completion + :set, output preview MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the working "non-TUI" iteration before switching to a true full-screen TUI. 
Captures every fix in this round — keep it as a checkpoint to fall back to if the TUI rewrite needs to be reverted. interactive.py: - Bordered input: dropped the Frame widget (rendering artifacts) and the horizontal rules (yellow trails on resize) — input is now just a chevron prompt + lexer-highlighted buffer + adaptive bottom toolbar. - Tab completion: re-bind Tab/Shift-Tab/Esc on the custom KeyBindings (the previous version overrode prompt_toolkit defaults). - erase_when_done=True on the Application + manual `❯ ` echo into scrollback after submit — fewer stale-render artifacts on resize. - :set overhaul: validate keys against the click flag list, accept "k=v ..." and "--k v ..." mixed forms, suggest on typo, validate choice/bool values where known. - :unset accepts space- or comma-separated keys; :unset *, :unset all, :reset all clear every setting. - :view slash command — cross-platform pager built on prompt_toolkit (no `less` dependency on Windows). Arrow keys / PgUp/PgDn / Home/End / mouse wheel to scroll, q / Esc to exit. - Toolbar adapts to width: chips truncate to "+N more" when narrow. - Per-command tab completion driven by walking the click tree (already in the previous commit, retained). theme.py: - Hex bloom spinner: 3-cell radial composition (centre + halo) so the bloom radiates symmetrically instead of growing rightward. Frames cycle dust → speck → outline → honeycomb → ✦ sparkle peak → drain, paired with a dim→bright→warm colour gradient. - White-glim shimmer sweeps across the verb ("Fetching", "Rendering") in time with the bloom. - Elapsed-time counter once an op runs > 0.5s. - Per-command verb rotation (no bee facts). cli_utils.py: - Output preview in REPL mode: large text dumps (>30 lines OR >4 KB) get truncated to a 30-line / 4 KB preview. Single-line minified HTML is detected by byte threshold so it doesn't slip through. - Full payload auto-saved to ~/.cache/scrapingbee-cli/last-output so the user can :view / cat / less it. 
- Binary output (PNG, PDF, etc.) is never truncated. - Non-REPL invocations are unchanged so pipes/redirects keep working. All 653 unit tests still pass. --- src/scrapingbee_cli/cli_utils.py | 106 ++- src/scrapingbee_cli/interactive.py | 1105 +++++++++++++++++++--------- src/scrapingbee_cli/theme.py | 109 ++- 3 files changed, 942 insertions(+), 378 deletions(-) diff --git a/src/scrapingbee_cli/cli_utils.py b/src/scrapingbee_cli/cli_utils.py index efd0526..fd7c045 100644 --- a/src/scrapingbee_cli/cli_utils.py +++ b/src/scrapingbee_cli/cli_utils.py @@ -21,6 +21,89 @@ ) +_REPL_PREVIEW_MAX_LINES = 30 +_REPL_PREVIEW_MAX_BYTES = 4000 + + +def _format_bytes(n: int) -> str: + if n >= 1_048_576: + return f"{n / 1_048_576:.1f} MB" + if n >= 1024: + return f"{n / 1024:.1f} KB" + return f"{n} B" + + +def _maybe_repl_preview(data: bytes) -> tuple[bytes, str | None, str | None]: + """If we're in REPL mode and `data` is a large text payload, shrink it + down to a preview and save the full payload to a fixed cache path. + + Triggers truncation on EITHER too many lines OR too many bytes — single- + line minified HTML often hits the byte cap without ever wrapping, so a + line-only check would let it through unchanged. + + Returns ``(bytes_to_print, summary_or_none, saved_path_or_none)``. Outside + REPL mode (or for binary data, or short outputs), returns ``(data, None, + None)`` unchanged so piped/redirected use is unaffected. + """ + if not data: + return data, None, None + if not is_repl_mode(): + return data, None, None + + # Skip binary data (screenshots, PDFs, etc.) — keep the original behaviour. 
+ is_text = data[:1] in (b"{", b"[", b"<", b"#") or b"\x00" not in data[:512] + if not is_text: + return data, None, None + + line_count = data.count(b"\n") + 1 + if ( + len(data) <= _REPL_PREVIEW_MAX_BYTES + and line_count <= _REPL_PREVIEW_MAX_LINES + ): + return data, None, None + + # Save the full payload to a fixed cache file the user can scroll through + # via :view (or `less` directly). + full_path: str | None = None + try: + from pathlib import Path + + cache_dir = Path.home() / ".cache" / "scrapingbee-cli" + cache_dir.mkdir(parents=True, exist_ok=True) + cache_path = cache_dir / "last-output" + cache_path.write_bytes(data) + full_path = str(cache_path) + except Exception: + full_path = None + + text = data.decode("utf-8", errors="replace") + lines = text.split("\n") + line_preview = "\n".join(lines[: _REPL_PREVIEW_MAX_LINES]) + + # Decide whether to truncate by lines or by chars. Single-line minified + # HTML/JSON would have line_preview == text but len > byte cap; truncate by + # chars there so the preview really does stay small on screen. + if len(line_preview.encode("utf-8")) > _REPL_PREVIEW_MAX_BYTES: + preview = text[:_REPL_PREVIEW_MAX_BYTES] + more_chars = len(text) - len(preview) + truncation_note = ( + f"showing first {_REPL_PREVIEW_MAX_BYTES:,} chars · " + f"+{more_chars:,} more chars" + ) + else: + preview = line_preview + more_lines = max(0, len(lines) - _REPL_PREVIEW_MAX_LINES) + shown = min(_REPL_PREVIEW_MAX_LINES, len(lines)) + truncation_note = ( + f"showing {shown}/{len(lines):,} lines · +{more_lines:,} more lines" + ) + + summary = ( + f"… preview truncated · {_format_bytes(len(data))} · {truncation_note}" + ) + return preview.encode("utf-8"), summary, full_path + + class NormalizedChoice(click.Choice): """Choice type that accepts both hyphens and underscores. 
@@ -1648,10 +1731,27 @@ def write_output( with fh: fh.write(data) else: - sys.stdout.buffer.write(data) + # In REPL mode, truncate large text dumps to a tidy preview and surface + # a path to the full output. Non-REPL invocations (`scrapingbee scrape ...`) + # keep the original behaviour so pipes and redirects work unchanged. + preview_data, repl_summary, repl_full_path = _maybe_repl_preview(data) + sys.stdout.buffer.write(preview_data) # Only add a trailing newline for text-like content; binary data (PNG, PDF, etc.) # must not have extra bytes appended. - if data and not data.endswith(b"\n"): - is_text = data[:1] in (b"{", b"[", b"<", b"#") or b"\x00" not in data[:512] + if preview_data and not preview_data.endswith(b"\n"): + is_text = ( + preview_data[:1] in (b"{", b"[", b"<", b"#") + or b"\x00" not in preview_data[:512] + ) if is_text: click.echo() + if repl_summary: + from .theme import BEE_DIM, BEE_YELLOW, err_console + + err_console.print(f" [{BEE_DIM}]{repl_summary}[/]") + if repl_full_path: + err_console.print( + f" [bold {BEE_YELLOW}]:view[/] " + f"[{BEE_DIM}]to scroll the full output · or pass[/] " + f"[bold {BEE_YELLOW}]--output-file FILE[/]" + ) diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index cad95ab..4dff462 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -1,19 +1,28 @@ -"""Interactive REPL mode for ScrapingBee CLI. - -Goals — explicit on purpose, since the previous version drifted from these: -- Get out of the user's way: no splash, no logos, no animation. -- One-line banner. Single unified prompt tag. -- Output frame uniform across every command. -- Slash-prefixed REPL meta-commands so they don't collide with click. -- Per-command tab completion driven by the click tree (no flag duplication). -- Bottom toolbar with live state (credits, last status, duration). +"""Interactive REPL — Claude-style bordered input box with status toolbar. 
+ +Built on prompt_toolkit's `Application` API (not `PromptSession`) so we can +custom-layout the input area as a `Frame` with a chevron prompt mark, a +bottom-anchored toolbar showing live state (credits, last cmd, settings), +and a per-input syntax-highlighting lexer. + +Output from each command flows above the input box; the box stays anchored +where the cursor was when the prompt opened. + +Goals (revised; the previous version drifted from these): +- Bordered input box, anchored bottom of the prompt area. +- Restrained palette: yellow accent, soft amber chrome, dim greys, semantic + green / red. No yellow-on-yellow, no mascot, no animation. +- Slash-prefixed REPL meta-commands (`:help`, `:q`, `:clear`, `:set`, ...). +- Per-command tab completion driven by walking the click tree. +- Toolbar with credits gauge, last status icon, `:set` chips, hint line. +- Inline syntax highlighting: command, flags, URLs, quoted strings. - "Did you mean?" on typos. Multi-line input via trailing backslash. -- Session settings via `:set KEY=VAL` and `:show`. """ from __future__ import annotations import os +import re import shlex import sys import time @@ -24,86 +33,57 @@ from .theme import BEE_DIM, BEE_RED, BEE_YELLOW, err_console if TYPE_CHECKING: - import click + pass # --------------------------------------------------------------------------- -# Banner & first-launch hint +# Refined palette # --------------------------------------------------------------------------- +_AMBER = "#E5A800" # frame border / soft accent +_GREEN = "#22C55E" # success +_DIM2 = "#555555" # darker chrome (toolbar labels, hint) +_BG_CHIP = "#1a1400" # chip background (settings) +_URL_CYAN = "#7DD3FC" # URLs in input lexer -def _print_banner(version: str) -> None: - """One-line banner. 
No animation, no logo, no nonsense.""" - line = Text() - line.append(" ScrapingBee ", style=f"bold black on {BEE_YELLOW}") - line.append(" ") - line.append(f"v{version}", style=f"bold {BEE_YELLOW}") - line.append(" ") - line.append("Type ", style=BEE_DIM) - line.append(":help", style=f"bold {BEE_YELLOW}") - line.append(" for commands, ", style=BEE_DIM) - line.append(":q", style=f"bold {BEE_YELLOW}") - line.append(" to quit.", style=BEE_DIM) - err_console.print() - err_console.print(line) - err_console.print() - - -def _print_help(commands: dict[str, str]) -> None: - """Print the command list, grouped, plus the slash-command meta list.""" - err_console.print() - groups: dict[str, list[str]] = { - "Pages": ["scrape", "crawl"], - "Search": ["google", "fast-search"], - "Marketplaces": ["amazon-product", "amazon-search", - "walmart-product", "walmart-search"], - "Media": ["youtube-search", "youtube-metadata"], - "AI": ["chatgpt"], - "Learn": ["tutorial"], - "Account": ["auth", "logout"], - "Tools": ["usage", "schedule", "export", "docs", "unsafe"], - } - - for group_name, cmds in groups.items(): - err_console.print(f" [{BEE_DIM}]{group_name}[/]") - for cmd in cmds: - help_text = commands.get(cmd, "") - err_console.print( - f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{help_text}[/]" - ) - - err_console.print() - err_console.print(f" [{BEE_DIM}]REPL[/]") - meta_cmds = [ - (":help, :?", "Show this command list"), - (":clear", "Clear the screen"), - (":set K=V", "Set a session default (e.g. 
:set country-code=fr)"), - (":unset K", "Remove a session default"), - (":show", "Show current session defaults"), - (":q, :quit", "Quit the REPL"), - ] - for cmd, desc in meta_cmds: - err_console.print( - f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{desc}[/]" - ) - err_console.print() +_STYLE_DICT = { + # Top/bottom horizontal rules around the input + "rule": _AMBER, + # Prompt mark inside the input area + "promptmark": f"{BEE_YELLOW} bold", + # Lexer (input syntax highlighting) + "lexer.cmd": f"{BEE_YELLOW} bold", + "lexer.flag": _AMBER, + "lexer.url": _URL_CYAN, + "lexer.string": _GREEN, + # Bottom toolbar + "toolbar": f"{BEE_DIM}", + "toolbar.label": _DIM2, + "toolbar.value": f"{BEE_YELLOW} bold", + "toolbar.ok": f"{_GREEN} bold", + "toolbar.fail": f"{BEE_RED} bold", + "toolbar.hint": _DIM2, + "toolbar.chip": f"bg:{_BG_CHIP} {BEE_YELLOW}", + "toolbar.gauge": f"{BEE_YELLOW}", + # Completion menu + "completion-menu": f"bg:{_BG_CHIP}", + "completion-menu.completion": f"bg:{_BG_CHIP} {BEE_YELLOW}", + "completion-menu.completion.current": f"bg:{BEE_YELLOW} #000000 bold", + "completion-menu.meta.completion": f"bg:{_BG_CHIP} #886600", + "completion-menu.meta.completion.current": f"bg:{BEE_YELLOW} #000000", + "auto-suggestion": "fg:#554400 italic", +} # --------------------------------------------------------------------------- -# Click tree introspection (per-command flags / values) +# Click tree introspection # --------------------------------------------------------------------------- def _walk_click_tree(cli_group: Any) -> tuple[ - dict[str, str], # command -> short help - dict[str, list[str]], # command -> [flag, ...] - set[str], # bool flags (any command) - dict[str, list[str]], # flag -> [choice, ...] + dict[str, str], dict[str, list[str]], set[str], dict[str, list[str]] ]: - """Inspect the click group and return discovery data for completion + help. - - Returns (command_help, command_flags, bool_flags, choice_flags). 
- """ + """Return (command_help, command_flags, bool_flags, choice_flags).""" import click command_help: dict[str, str] = {} @@ -112,8 +92,12 @@ def _walk_click_tree(cli_group: Any) -> tuple[ choice_flags: dict[str, list[str]] = {} for name, cmd in cli_group.commands.items(): - command_help[name] = (cmd.short_help or cmd.help or "").strip().splitlines()[0:1] and \ - (cmd.short_help or cmd.help or "").strip().splitlines()[0] or "" + first_line = "" + for source in (cmd.short_help, cmd.help): + if source: + first_line = source.strip().splitlines()[0] + break + command_help[name] = first_line flags: list[str] = [] for param in cmd.params: @@ -131,81 +115,23 @@ def _walk_click_tree(cli_group: Any) -> tuple[ return command_help, command_flags, bool_flags, choice_flags -# --------------------------------------------------------------------------- -# Prompt segment builder -# --------------------------------------------------------------------------- - - -_STYLE_DICT = { - # Prompt: yellow tag with chevron inside (or Powerline arrow if opted in) - "prompt.tag": f"bg:{BEE_YELLOW} #000000 bold", - "prompt.arrow": f"{BEE_YELLOW} bold", - "prompt.cont": f"{BEE_DIM}", - "prompt.space": "", - # Completion dropdown - "completion-menu": "bg:#1a1400", - "completion-menu.completion": f"bg:#1a1400 {BEE_YELLOW}", - "completion-menu.completion.current": f"bg:{BEE_YELLOW} #000000 bold", - "completion-menu.meta.completion": "bg:#1a1400 #886600", - "completion-menu.meta.completion.current": f"bg:{BEE_YELLOW} #000000", - "scrollbar.background": "bg:#1a1400", - "scrollbar.button": f"bg:{BEE_YELLOW}", - # Ghost text - "auto-suggestion": f"fg:#554400 italic", - # Bottom toolbar - "bottom-toolbar": f"bg:#1a1400 {BEE_DIM}", - "bottom-toolbar.label": f"bg:#1a1400 {BEE_DIM}", - "bottom-toolbar.value": f"bg:#1a1400 {BEE_YELLOW} bold", - "bottom-toolbar.ok": f"bg:#1a1400 #22C55E bold", - "bottom-toolbar.fail": f"bg:#1a1400 {BEE_RED} bold", -} - - -def _powerline_mode() -> bool: - return 
os.environ.get("SCRAPINGBEE_POWERLINE", "").lower() in ("1", "true", "yes") - - -def _build_main_prompt() -> list[tuple[str, str]]: - """Primary prompt segments. No hint line — that's only on startup.""" - if _powerline_mode(): - return [ - ("class:prompt.tag", " ScrapingBee "), - ("class:prompt.arrow", ""), - ("class:prompt.space", " "), - ] - return [ - ("class:prompt.tag", " ScrapingBee ❯ "), - ("class:prompt.space", " "), - ] - - -def _build_continuation_prompt() -> list[tuple[str, str]]: - """Continuation prompt for multi-line input (after a trailing `\\`).""" - return [("class:prompt.cont", " … ")] - - # --------------------------------------------------------------------------- # Session state # --------------------------------------------------------------------------- class SessionState: - """REPL-wide mutable state. - - Holds the bottom-toolbar inputs and `:set` defaults. Settings keys are - stored without the `--` prefix and applied as `--key value` if the - user's command doesn't already include that flag. 
- """ + """REPL-wide mutable state surfaced in the bottom toolbar.""" def __init__(self) -> None: - self.last_status: str | None = None # "ok" | "fail" | None - self.last_duration: float | None = None self.last_command: str | None = None + self.last_status: str | None = None # "ok" | "fail" + self.last_duration: float | None = None self.credits: int | None = None + self.credits_total: int | None = None self.settings: dict[str, str] = {} def apply_settings_to_args(self, args: list[str]) -> list[str]: - """Inject session defaults as flags, unless the user passed them already.""" if not self.settings: return args present = {a for a in args if a.startswith("--")} @@ -218,7 +144,6 @@ def apply_settings_to_args(self, args: list[str]) -> list[str]: return out def refresh_credits_from_cache(self) -> None: - """Read cached usage from disk if available — non-blocking, best-effort.""" try: import json from pathlib import Path @@ -227,110 +152,207 @@ def refresh_credits_from_cache(self) -> None: if not cache.exists(): return data = json.loads(cache.read_text(encoding="utf-8")) - for entry in data.values() if isinstance(data, dict) else []: - creds = entry.get("credits") if isinstance(entry, dict) else None - if isinstance(creds, int): - self.credits = creds + entries = data.values() if isinstance(data, dict) else [] + for entry in entries: + if not isinstance(entry, dict): + continue + if isinstance(entry.get("credits"), int): + self.credits = entry["credits"] + if isinstance(entry.get("max_api_credit"), int): + self.credits_total = entry["max_api_credit"] + if self.credits is not None: return except Exception: return # --------------------------------------------------------------------------- -# prompt_toolkit machinery +# Helpers # --------------------------------------------------------------------------- -def _make_completer( - commands: list[str], - command_flags: dict[str, list[str]], - bool_flags: set[str], - choice_flags: dict[str, list[str]], - command_help: 
dict[str, str], -): - """Per-command tab completion driven by the click tree.""" - from prompt_toolkit.completion import Completer, Completion +def _format_credits(n: int) -> str: + if n >= 1_000_000: + return f"{n / 1_000_000:.1f}M" + if n >= 1_000: + return f"{n / 1_000:.1f}K" + return str(n) - meta_cmds = [":help", ":?", ":clear", ":set", ":unset", ":show", ":q", ":quit"] - class BeeCompleter(Completer): - def get_completions(self, document, complete_event): - text = document.text_before_cursor.lstrip() - words = text.split() - on_first = (not text) or (len(words) == 1 and not text.endswith(" ")) +def _credit_gauge(used_pct: int) -> str: + """Tiny block-bar showing credit usage (0..100).""" + blocks = "▁▂▃▄▅▆▇█" + n = min(7, max(0, int(used_pct * 8 / 100))) + return blocks[n] - # First word: command names + slash-commands - if on_first: - partial = words[0].lower() if words else "" - pool: list[tuple[str, str]] = [(c, command_help.get(c, "")) for c in commands] - pool.extend((m, "REPL meta") for m in meta_cmds) - for cmd, meta in sorted(pool): - if cmd.startswith(partial): - yield Completion( - cmd, start_position=-len(partial), display_meta=meta - ) - return - # Inside a command: use that command's flags - cmd_name = words[0] - flags_for_cmd = command_flags.get(cmd_name, []) - last = words[-1] if words else "" - prev = words[-2] if len(words) >= 2 else "" +def _levenshtein(a: str, b: str) -> int: + if a == b: + return 0 + if not a: + return len(b) + if not b: + return len(a) + prev = list(range(len(b) + 1)) + for i, ca in enumerate(a, 1): + curr = [i] + [0] * len(b) + for j, cb in enumerate(b, 1): + cost = 0 if ca == cb else 1 + curr[j] = min(curr[j - 1] + 1, prev[j] + 1, prev[j - 1] + cost) + prev = curr + return prev[-1] - # After a bool flag with trailing space: suggest true/false - if text.endswith(" ") and prev in bool_flags: - yield Completion("true", display_meta="enable") - yield Completion("false", display_meta="disable") - return - # After a choice 
flag with trailing space: suggest choices - if text.endswith(" ") and prev in choice_flags: - for v in choice_flags[prev]: - yield Completion(v) - return - # Mid-typing a value for a known flag (no trailing space) - if len(words) >= 2 and not last.startswith("-"): - if prev in bool_flags: - for v in ("true", "false"): - if v.startswith(last.lower()): - yield Completion(v, start_position=-len(last)) - return - if prev in choice_flags: - for v in choice_flags[prev]: - if v.startswith(last.lower()): - yield Completion(v, start_position=-len(last)) - return - # Typing a flag - if last.startswith("-"): - for flag in flags_for_cmd: - if flag.startswith(last): - yield Completion(flag, start_position=-len(last)) - return BeeCompleter() +def _suggest(typed: str, candidates: Iterable[str], threshold: int = 2) -> str | None: + best: tuple[int, str] | None = None + for c in candidates: + d = _levenshtein(typed.lower(), c.lower()) + if d <= threshold and (best is None or d < best[0]): + best = (d, c) + return best[1] if best else None -def _make_key_bindings(): - from prompt_toolkit.filters import has_completions - from prompt_toolkit.key_binding import KeyBindings +# --------------------------------------------------------------------------- +# Lexer (syntax highlighting in the input buffer) +# --------------------------------------------------------------------------- - kb = KeyBindings() - @kb.add("enter", filter=has_completions) - def _accept_completion(event): - event.current_buffer.complete_state = None +def _make_lexer(): + from prompt_toolkit.lexers import Lexer + + class CmdLexer(Lexer): + def lex_document(self, document): + def get_line(lineno: int): + if lineno >= len(document.lines): + return [] + line = document.lines[lineno] + tokens: list[tuple[str, str]] = [] + first_word_seen = False + for piece in re.split(r"(\s+)", line): + if not piece: + continue + if piece.isspace(): + tokens.append(("", piece)) + continue + if not first_word_seen: + # First word coloured 
even if it's a slash-command + tokens.append(("class:lexer.cmd", piece)) + first_word_seen = True + elif piece.startswith("--"): + tokens.append(("class:lexer.flag", piece)) + elif piece.startswith(("http://", "https://")): + tokens.append(("class:lexer.url", piece)) + elif ( + len(piece) > 1 + and piece[0] in ("'", '"') + and piece[-1] == piece[0] + ): + tokens.append(("class:lexer.string", piece)) + else: + tokens.append(("", piece)) + return tokens + + return get_line + + return CmdLexer() - @kb.add("enter", filter=~has_completions) - def _submit_or_ignore(event): - buf = event.current_buffer - if buf.text.strip(): - buf.validate_and_handle() - return kb +# --------------------------------------------------------------------------- +# Bottom toolbar +# --------------------------------------------------------------------------- -def _build_session(history_path: str, completer: Any, toolbar_fn: Any): - from prompt_toolkit import PromptSession +def _make_toolbar(state: SessionState): + """Return a callable producing toolbar segments. 
+ + The toolbar adapts to terminal width: + - Wide: credits gauge · last cmd · all chips · hint + - Medium: credits gauge · last cmd · chip count · hint + - Narrow: credits · last cmd · chip count + """ + + def render() -> list[tuple[str, str]]: + import shutil + + width = shutil.get_terminal_size((80, 24)).columns + segs: list[tuple[str, str]] = [("class:toolbar", " ")] + + # --- Credits + gauge -------------------------------------------------- + segs.append(("class:toolbar.label", "credits ")) + if state.credits is not None: + segs.append(("class:toolbar.value", _format_credits(state.credits))) + if state.credits_total: + used_pct = max( + 0, + min(100, 100 - int(state.credits / state.credits_total * 100)), + ) + segs.append(("class:toolbar", " ")) + segs.append(("class:toolbar.gauge", _credit_gauge(used_pct))) + else: + segs.append(("class:toolbar.value", "—")) + + # --- Last command ----------------------------------------------------- + if state.last_command: + segs.append(("class:toolbar", " · ")) + segs.append(("class:toolbar.label", "last ")) + segs.append(("class:toolbar.value", state.last_command)) + segs.append(("class:toolbar", " ")) + if state.last_status == "ok": + segs.append(("class:toolbar.ok", "✓")) + elif state.last_status == "fail": + segs.append(("class:toolbar.fail", "✗")) + if state.last_duration is not None: + segs.append(("class:toolbar", f" {state.last_duration:.1f}s")) + + # --- Session setting chips (with overflow handling) ------------------- + if state.settings: + # Estimate space already used + reserved for hint + so_far = sum(len(text) for _, text in segs) + hint_len = 24 # roughly "tab · ↑↓ · :help · :q" + spacing + budget = max(0, width - so_far - hint_len - 4) + + chips = list(state.settings.items()) + shown = 0 + for k, v in chips: + chip_text = f" {k}={v} " + if budget < len(chip_text) + 2 and shown > 0: + break + segs.append(("class:toolbar", " ")) + segs.append(("class:toolbar.chip", chip_text)) + budget -= len(chip_text) + 2 
+ shown += 1 + remaining = len(chips) - shown + if remaining > 0: + segs.append(("class:toolbar", " ")) + segs.append(("class:toolbar.hint", f"+{remaining} more")) + + # --- Hint (rightmost, but only if there's room) ----------------------- + used = sum(len(text) for _, text in segs) + if width - used > 26: + segs.append(("class:toolbar", " " * max(2, width - used - 24))) + segs.append(("class:toolbar.hint", "tab · ↑↓ · :help · :q")) + + return segs + + return render + + +# --------------------------------------------------------------------------- +# Application (Frame around input + toolbar) +# --------------------------------------------------------------------------- + + +def _build_application(state: SessionState, completer: Any, history_path: str): + from prompt_toolkit.application import Application from prompt_toolkit.auto_suggest import AutoSuggestFromHistory + from prompt_toolkit.buffer import Buffer + from prompt_toolkit.filters import has_completions from prompt_toolkit.history import FileHistory + from prompt_toolkit.key_binding import KeyBindings + from prompt_toolkit.layout import Layout + from prompt_toolkit.layout.containers import HSplit, Window + from prompt_toolkit.layout.controls import BufferControl, FormattedTextControl + from prompt_toolkit.layout.dimension import D from prompt_toolkit.styles import Style try: @@ -338,73 +360,145 @@ def _build_session(history_path: str, completer: Any, toolbar_fn: Any): except Exception: history = None # type: ignore[assignment] - return PromptSession( + buffer = Buffer( history=history, completer=completer, complete_while_typing=False, auto_suggest=AutoSuggestFromHistory(), - style=Style.from_dict(_STYLE_DICT), - key_bindings=_make_key_bindings(), - bottom_toolbar=toolbar_fn, - mouse_support=False, - enable_history_search=False, - vi_mode=False, + multiline=False, ) + # The input is a single Window with a per-line prefix (the chevron). 
+ # `dont_extend_height=True` makes the Window report its preferred height as + # the content's line count — so the layout shrinks to fit, no greedy fill. + def _line_prefix(line_no, _wrap_count): + if line_no == 0: + return [("class:promptmark", "❯ ")] + return [("", " ")] + + input_window = Window( + content=BufferControl(buffer=buffer, lexer=_make_lexer()), + get_line_prefix=_line_prefix, + wrap_lines=True, + height=D(min=1), + dont_extend_height=True, + ) -# --------------------------------------------------------------------------- -# Bottom toolbar -# --------------------------------------------------------------------------- - - -def _build_toolbar_fn(state: SessionState) -> Any: - """Return a callable producing the bottom toolbar segments.""" + toolbar_window = Window( + content=FormattedTextControl(_make_toolbar(state)), + height=D.exact(1), + ) - def render() -> list[tuple[str, str]]: - segs: list[tuple[str, str]] = [("class:bottom-toolbar", " ")] + # No horizontal rules above/below the input. Earlier versions had `─` + # rules for visual structure, but every resize redraws the layout at the + # new width and leaves the old wider rule fragments behind in scrollback — + # piles of yellow horizontal lines accumulate. Visual hierarchy still + # holds via the yellow chevron prompt mark and the dim toolbar. 
+ layout = Layout(HSplit([input_window, toolbar_window])) - # Credits (from cache) - if state.credits is not None: - segs.append(("class:bottom-toolbar.label", "credits ")) - segs.append(("class:bottom-toolbar.value", f"{state.credits:,}")) - else: - segs.append(("class:bottom-toolbar.label", "credits ")) - segs.append(("class:bottom-toolbar.value", "—")) + kb = KeyBindings() - segs.append(("class:bottom-toolbar", " ")) + @kb.add("enter") + def _enter(event): + text = buffer.text + if text.strip(): + event.app.exit(result=text) + + @kb.add("c-c") + def _ctrl_c(event): + event.app.exit(result=None) + + @kb.add("c-d") + def _ctrl_d(event): + if not buffer.text: + event.app.exit(result=None) + + # Tab opens / advances the completion menu. (Custom KeyBindings override + # prompt_toolkit's default Tab handler, so we re-bind it explicitly.) + @kb.add("tab", filter=~has_completions) + def _tab_open(event): + event.current_buffer.start_completion(select_first=False) + + @kb.add("tab", filter=has_completions) + def _tab_next(event): + event.current_buffer.complete_next() + + @kb.add("s-tab", filter=has_completions) + def _shift_tab(event): + event.current_buffer.complete_previous() + + @kb.add("escape", filter=has_completions, eager=True) + def _escape_menu(event): + event.current_buffer.cancel_completion() + + app = Application( + layout=layout, + key_bindings=kb, + style=Style.from_dict(_STYLE_DICT), + full_screen=False, + mouse_support=False, + # Erase the rendered prompt area on exit so rules + input + toolbar + # don't pile up in scrollback as stale-width artifacts after every + # submit (or after a terminal resize). The submitted command is + # echoed manually by the main loop so the user can still see what + # they typed. 
+ erase_when_done=True, + ) + return app, buffer - # Last command status - if state.last_command: - segs.append(("class:bottom-toolbar.label", "last ")) - segs.append(("class:bottom-toolbar.value", state.last_command)) - if state.last_status == "ok": - segs.append(("class:bottom-toolbar", " ")) - segs.append(("class:bottom-toolbar.ok", "OK")) - elif state.last_status == "fail": - segs.append(("class:bottom-toolbar", " ")) - segs.append(("class:bottom-toolbar.fail", "FAIL")) - if state.last_duration is not None: - segs.append( - ("class:bottom-toolbar", f" ({state.last_duration:.1f}s)") - ) - else: - segs.append(("class:bottom-toolbar", "no commands run yet")) - # Active session settings - if state.settings: - segs.append(("class:bottom-toolbar", " ")) - segs.append(("class:bottom-toolbar.label", "set ")) - joined = " ".join(f"{k}={v}" for k, v in state.settings.items()) - segs.append(("class:bottom-toolbar.value", joined)) +# --------------------------------------------------------------------------- +# Banner / help / output frame +# --------------------------------------------------------------------------- - return segs - return render +def _print_banner(version: str) -> None: + line = Text() + line.append(" ScrapingBee ", style=f"bold black on {BEE_YELLOW}") + line.append(" ") + line.append(f"v{version}", style=f"bold {BEE_YELLOW}") + line.append(" ") + line.append("Type ", style=BEE_DIM) + line.append(":help", style=f"bold {BEE_YELLOW}") + line.append(" for commands", style=BEE_DIM) + err_console.print() + err_console.print(line) + err_console.print() -# --------------------------------------------------------------------------- -# Output frame: uniform divider above + status line below -# --------------------------------------------------------------------------- +def _print_help(commands: dict[str, str]) -> None: + err_console.print() + groups = { + "Pages": ["scrape", "crawl"], + "Search": ["google", "fast-search"], + "Marketplaces": ["amazon-product", 
"amazon-search", + "walmart-product", "walmart-search"], + "Media": ["youtube-search", "youtube-metadata"], + "AI": ["chatgpt"], + "Learn": ["tutorial"], + "Account": ["auth", "logout"], + "Tools": ["usage", "schedule", "export", "docs", "unsafe"], + } + for group_name, cmds in groups.items(): + err_console.print(f" [{BEE_DIM}]{group_name}[/]") + for cmd in cmds: + err_console.print( + f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{commands.get(cmd, '')}[/]" + ) + err_console.print() + err_console.print(f" [{BEE_DIM}]REPL[/]") + for cmd, desc in [ + (":help, :?", "Show this command list"), + (":clear", "Clear the screen"), + (":view", "Scroll through the last command's full output"), + (":set K=V ...", "Set one or more session defaults"), + (":unset K", "Remove a session default ('all' or '*' clears every)"), + (":reset", "Clear every session default"), + (":show", "Show current session defaults"), + (":q, :quit", "Quit the REPL"), + ]: + err_console.print(f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{desc}[/]") + err_console.print() def _print_command_header(args: list[str]) -> None: @@ -420,134 +514,414 @@ def _print_command_header(args: list[str]) -> None: err_console.print(line) -def _print_command_footer(status: str, duration_s: float) -> None: +def _print_command_footer(status: str, duration: float) -> None: line = Text() line.append(" ") if status == "ok": - line.append("[ok]", style="bold #22C55E") + line.append("✓", style=f"bold {_GREEN}") elif status == "fail": - line.append("[fail]", style=f"bold {BEE_RED}") - else: - line.append(f"[{status}]", style=BEE_DIM) - line.append(f" {duration_s:.2f}s", style=BEE_DIM) + line.append("✗", style=f"bold {BEE_RED}") + line.append(f" {duration:.2f}s", style=BEE_DIM) err_console.print(line) err_console.print() # --------------------------------------------------------------------------- -# Slash-command meta dispatcher +# Slash-command dispatcher # --------------------------------------------------------------------------- -def 
_handle_meta(line: str, state: SessionState, command_help: dict[str, str]) -> str | None: - """Handle :slash commands (and their bare aliases). Returns: +def _open_pager(path: str) -> None: + """Cross-platform scrollable pager built on prompt_toolkit. - - "quit" → break out of the REPL loop - - "ok" → handled, continue to next prompt - - None → not a meta-command, fall through to click + Replaces external tools (`less` on Unix, `more` on Windows) with an + in-process viewer so the CLI works identically everywhere with no extra + install. Arrow keys / page up-down / home / end / mouse wheel scroll; + `q` or `Esc` exits back to the REPL. """ + from pathlib import Path + + from prompt_toolkit.application import Application + from prompt_toolkit.buffer import Buffer + from prompt_toolkit.document import Document + from prompt_toolkit.filters import Condition + from prompt_toolkit.key_binding import KeyBindings + from prompt_toolkit.layout import Layout + from prompt_toolkit.layout.containers import HSplit, Window + from prompt_toolkit.layout.controls import BufferControl, FormattedTextControl + from prompt_toolkit.layout.dimension import D + from prompt_toolkit.styles import Style + + text = Path(path).read_text(encoding="utf-8", errors="replace") + line_count = text.count("\n") + 1 + + buffer = Buffer(read_only=Condition(lambda: True)) + buffer.set_document(Document(text=text, cursor_position=0), bypass_readonly=True) + + text_window = Window( + content=BufferControl(buffer=buffer), + wrap_lines=False, + ) + + def _status_line(): + cursor_line = buffer.document.cursor_position_row + 1 + pct = int(cursor_line / max(1, line_count) * 100) + return [ + ("class:pager.bar", " "), + ("class:pager.value", f"{cursor_line}/{line_count}"), + ("class:pager.bar", f" ({pct}%) · "), + ("class:pager.label", path), + ("class:pager.bar", " "), + ("class:pager.hint", "↑↓ PgUp/PgDn Home/End scroll · q / Esc to exit"), + ] + + status_window = Window( + 
content=FormattedTextControl(_status_line), + height=D.exact(1), + ) + + layout = Layout(HSplit([text_window, status_window])) + + kb = KeyBindings() + + @kb.add("q") + @kb.add("escape") + @kb.add("c-c") + def _exit(event): + event.app.exit() + + @kb.add("up") + def _up(_e): + buffer.cursor_up() + + @kb.add("down") + def _down(_e): + buffer.cursor_down() + + @kb.add("pageup") + def _pgup(_e): + buffer.cursor_up(count=20) + + @kb.add("pagedown") + def _pgdn(_e): + buffer.cursor_down(count=20) + + @kb.add("home") + def _home(event): + buffer.cursor_position = 0 + + @kb.add("end") + def _end(event): + buffer.cursor_position = len(buffer.text) + + @kb.add("left") + def _left(_e): + buffer.cursor_left() + + @kb.add("right") + def _right(_e): + buffer.cursor_right() + + style = Style.from_dict( + { + "pager.bar": f"bg:{_BG_CHIP} {BEE_DIM}", + "pager.value": f"bg:{_BG_CHIP} {BEE_YELLOW} bold", + "pager.label": f"bg:{_BG_CHIP} {BEE_DIM}", + "pager.hint": f"bg:{_BG_CHIP} {_DIM2}", + } + ) + + app = Application( + layout=layout, + key_bindings=kb, + style=style, + full_screen=True, + mouse_support=True, + ) + app.run() + + +def _normalize_setting_key(key: str) -> str: + """Strip leading dashes; settings keys are stored without `--` prefix. + + Hyphen vs underscore is left to the user — we don't normalise either way + because click options exist in both forms across the codebase. The + validation check (against the click flag list) settles which is correct. + """ + return key.strip().lstrip("-") + + +def _parse_set_args(rest: str) -> list[tuple[str, str]] | str: + """Parse the argument string for `:set`. Returns either a list of + (key, value) pairs, or an error string explaining what's wrong. 
+ + Accepted forms (mix and match in one line): + :set country-code=fr + :set --country-code fr + :set country-code=fr premium-proxy=true device=mobile + :set --country-code fr --premium-proxy true + """ + try: + tokens = shlex.split(rest) + except ValueError as e: + return f"parse error: {e}" + + pairs: list[tuple[str, str]] = [] + i = 0 + while i < len(tokens): + tok = tokens[i] + if "=" in tok and not tok.startswith("="): + key, _, value = tok.partition("=") + key = _normalize_setting_key(key) + value = value.strip() + if not key or value == "": + return f"empty key or value in '{tok}'" + pairs.append((key, value)) + i += 1 + elif tok.startswith("--"): + key = _normalize_setting_key(tok) + if i + 1 >= len(tokens): + return f"missing value for --{key}" + pairs.append((key, tokens[i + 1])) + i += 2 + else: + return ( + f"unexpected '{tok}'. Use key=value or --key value " + f"(e.g. :set country-code=fr or :set --country-code fr)" + ) + return pairs + + +def _handle_meta( + line: str, + state: SessionState, + command_help: dict[str, str], + all_known_flags: set[str], + bool_flags: set[str], + choice_flags: dict[str, list[str]], +) -> str | None: parts = line.strip().split(None, 1) head = parts[0] rest = parts[1] if len(parts) > 1 else "" head_low = head.lower() - quit_aliases = {":q", ":quit", "exit", "quit", "q"} - help_aliases = {":help", ":?", "help", "?"} - clear_aliases = {":clear", "clear"} - - if head_low in quit_aliases: + if head_low in {":q", ":quit", "exit", "quit", "q"}: return "quit" - - if head_low in help_aliases: + if head_low in {":help", ":?", "help", "?"}: _print_help(command_help) return "ok" - - if head_low in clear_aliases: + if head_low in {":clear", "clear"}: sys.stderr.write("\033[2J\033[H") sys.stderr.flush() return "ok" - if head_low == ":show": if not state.settings: err_console.print(f" [{BEE_DIM}]No session defaults set.[/]") else: err_console.print() for k, v in state.settings.items(): - err_console.print(f" [bold 
{BEE_YELLOW}]{k:<20}[/] [dim]{v}[/]") + err_console.print( + f" [bold {BEE_YELLOW}]{k:<20}[/] [dim]{v}[/]" + ) err_console.print() return "ok" + if head_low == ":view": + from pathlib import Path - if head_low == ":unset": - key = rest.strip().lstrip("-") - if not key: - err_console.print(f" [bold {BEE_RED}]usage:[/] :unset KEY") + cache_path = Path.home() / ".cache" / "scrapingbee-cli" / "last-output" + if not cache_path.exists(): + err_console.print(f" [{BEE_DIM}]no recent output to view[/]") return "ok" - if key in state.settings: - del state.settings[key] - err_console.print(f" [{BEE_DIM}]unset[/] [bold {BEE_YELLOW}]{key}[/]") - else: - err_console.print(f" [{BEE_DIM}]not set:[/] {key}") + try: + _open_pager(str(cache_path)) + except Exception as e: + err_console.print(f" [bold {BEE_RED}]pager error:[/] {e}") + err_console.print( + f" [{BEE_DIM}]full output saved at[/] " + f"[bold {BEE_YELLOW}]{cache_path}[/]" + ) return "ok" - if head_low == ":set": - if "=" not in rest: - err_console.print(f" [bold {BEE_RED}]usage:[/] :set KEY=VALUE") + if head_low in {":reset", ":unset-all"}: + n = len(state.settings) + state.settings.clear() + err_console.print(f" [{BEE_DIM}]cleared {n} setting(s)[/]") + return "ok" + if head_low == ":unset": + target = rest.strip() + if not target: + err_console.print( + f" [bold {BEE_RED}]usage:[/] :unset KEY | :unset * | :reset" + ) return "ok" - key, _, value = rest.partition("=") - key = key.strip().lstrip("-") - value = value.strip() - if not key or not value: - err_console.print(f" [bold {BEE_RED}]usage:[/] :set KEY=VALUE") + if target in {"*", "all"}: + n = len(state.settings) + state.settings.clear() + err_console.print(f" [{BEE_DIM}]cleared {n} setting(s)[/]") return "ok" - state.settings[key] = value - err_console.print(f" [{BEE_DIM}]set[/] [bold {BEE_YELLOW}]{key}[/] = [dim]{value}[/]") + # Allow space- or comma-separated multiple keys. 
+ keys = [_normalize_setting_key(k) for k in re.split(r"[,\s]+", target) if k] + for key in keys: + if key in state.settings: + del state.settings[key] + err_console.print( + f" [{BEE_DIM}]unset[/] [bold {BEE_YELLOW}]{key}[/]" + ) + else: + err_console.print(f" [{BEE_DIM}]not set:[/] {key}") return "ok" + if head_low == ":set": + if not rest.strip(): + err_console.print( + f" [bold {BEE_RED}]usage:[/] :set KEY=VALUE [KEY=VALUE ...]" + ) + err_console.print( + f" [{BEE_DIM}] or:[/] :set --KEY VALUE [--KEY VALUE ...]" + ) + return "ok" + parsed = _parse_set_args(rest) + if isinstance(parsed, str): + err_console.print(f" [bold {BEE_RED}]:set[/] {parsed}") + return "ok" + valid_keys = {f.lstrip("-") for f in all_known_flags} + applied: list[tuple[str, str]] = [] + rejected: list[str] = [] + for key, value in parsed: + if key not in valid_keys: + err_console.print( + f" [bold {BEE_RED}]unknown option:[/] " + f"[bold {BEE_YELLOW}]--{key}[/]" + ) + suggestion = _suggest(key, valid_keys, threshold=2) + if suggestion: + err_console.print( + f" [{BEE_DIM}] did you mean[/] " + f"[bold {BEE_YELLOW}]--{suggestion}[/][{BEE_DIM}]?[/]" + ) + rejected.append(key) + continue + flag = f"--{key}" + # Validate choices + if flag in choice_flags and value not in choice_flags[flag]: + err_console.print( + f" [bold {BEE_RED}]invalid value for[/] " + f"[bold {BEE_YELLOW}]--{key}[/][bold {BEE_RED}]:[/] {value}" + ) + err_console.print( + f" [{BEE_DIM}] choices:[/] " + + ", ".join(choice_flags[flag]) + ) + rejected.append(key) + continue + # Validate bool values + if flag in bool_flags and value.lower() not in ( + "true", "false", "yes", "no", "1", "0", "on", "off" + ): + err_console.print( + f" [bold {BEE_RED}]--{key} expects a bool, got:[/] {value}" + ) + rejected.append(key) + continue + state.settings[key] = value + applied.append((key, value)) + + for key, value in applied: + err_console.print( + f" [{BEE_DIM}]set[/] [bold {BEE_YELLOW}]{key}[/] = " + f"[dim]{value}[/]" + ) + return "ok" 
return None # --------------------------------------------------------------------------- -# Did-you-mean +# Completer # --------------------------------------------------------------------------- -def _levenshtein(a: str, b: str) -> int: - if a == b: - return 0 - if not a: - return len(b) - if not b: - return len(a) - prev = list(range(len(b) + 1)) - for i, ca in enumerate(a, 1): - curr = [i] + [0] * len(b) - for j, cb in enumerate(b, 1): - cost = 0 if ca == cb else 1 - curr[j] = min(curr[j - 1] + 1, prev[j] + 1, prev[j - 1] + cost) - prev = curr - return prev[-1] +def _make_completer( + commands: list[str], + command_flags: dict[str, list[str]], + bool_flags: set[str], + choice_flags: dict[str, list[str]], + command_help: dict[str, str], +): + from prompt_toolkit.completion import Completer, Completion + meta_cmds = [ + ":help", ":?", ":clear", ":view", ":set", ":unset", ":reset", ":show", + ":q", ":quit", + ] -def _suggest(typed: str, candidates: Iterable[str], threshold: int = 2) -> str | None: - best: tuple[int, str] | None = None - for c in candidates: - d = _levenshtein(typed.lower(), c.lower()) - if d <= threshold and (best is None or d < best[0]): - best = (d, c) - return best[1] if best else None + class BeeCompleter(Completer): + def get_completions(self, document, complete_event): + text = document.text_before_cursor.lstrip() + words = text.split() + on_first = (not text) or (len(words) == 1 and not text.endswith(" ")) + + if on_first: + partial = words[0].lower() if words else "" + pool: list[tuple[str, str]] = [(c, command_help.get(c, "")) for c in commands] + pool.extend((m, "REPL meta") for m in meta_cmds) + for cmd, meta in sorted(pool): + if cmd.startswith(partial): + yield Completion( + cmd, start_position=-len(partial), display_meta=meta + ) + return + + cmd_name = words[0] + flags_for_cmd = command_flags.get(cmd_name, []) + last = words[-1] if words else "" + prev = words[-2] if len(words) >= 2 else "" + + if text.endswith(" ") and prev in 
bool_flags: + yield Completion("true", display_meta="enable") + yield Completion("false", display_meta="disable") + return + if text.endswith(" ") and prev in choice_flags: + for v in choice_flags[prev]: + yield Completion(v) + return + if len(words) >= 2 and not last.startswith("-"): + if prev in bool_flags: + for v in ("true", "false"): + if v.startswith(last.lower()): + yield Completion(v, start_position=-len(last)) + return + if prev in choice_flags: + for v in choice_flags[prev]: + if v.startswith(last.lower()): + yield Completion(v, start_position=-len(last)) + return + if last.startswith("-"): + for flag in flags_for_cmd: + if flag.startswith(last): + yield Completion(flag, start_position=-len(last)) + + return BeeCompleter() # --------------------------------------------------------------------------- -# Multi-line input via trailing `\` +# Multi-line: trailing backslash continues the next line # --------------------------------------------------------------------------- -def _read_input(session: Any, main_prompt: list, cont_prompt: list) -> str: - """Read a (possibly multi-line) command. 
Trailing `\\` joins the next line.""" - line = session.prompt(main_prompt).rstrip() - while line.endswith("\\"): - more = session.prompt(cont_prompt).rstrip() - line = line[:-1].rstrip() + " " + more +def _prompt_once(state: SessionState, completer: Any, history_path: str) -> str | None: + app, _buffer = _build_application(state, completer, history_path) + return app.run() + + +def _read_input(state: SessionState, completer: Any, history_path: str) -> str | None: + line = _prompt_once(state, completer, history_path) + if line is None: + return None + while line.rstrip().endswith("\\"): + more = _prompt_once(state, completer, history_path) + if more is None: + return line.rstrip().rstrip("\\").rstrip() + line = line.rstrip().rstrip("\\").rstrip() + " " + more return line @@ -565,14 +939,11 @@ def run_repl(cli_group: Any, version: str) -> None: set_repl_mode(True) - # Click introspection command_help, command_flags, bool_flags, choice_flags = _walk_click_tree(cli_group) command_names = sorted(command_flags.keys()) - # Banner — once _print_banner(version) - # Session state + prompt session state = SessionState() state.refresh_credits_from_cache() @@ -582,27 +953,40 @@ def run_repl(cli_group: Any, version: str) -> None: completer = _make_completer( command_names, command_flags, bool_flags, choice_flags, command_help ) - toolbar = _build_toolbar_fn(state) - session = _build_session(history_path, completer, toolbar) - main_prompt = _build_main_prompt() - cont_prompt = _build_continuation_prompt() + # Flat set of every known flag across all commands — used by `:set` to + # validate keys and surface "did you mean?" suggestions for typos. 
+ all_known_flags: set[str] = set() + for flags_list in command_flags.values(): + all_known_flags.update(flags_list) while True: try: - line = _read_input(session, main_prompt, cont_prompt).strip() - except KeyboardInterrupt: + line = _read_input(state, completer, history_path) + except (KeyboardInterrupt, EOFError): err_console.print() break - except EOFError: + + if line is None: err_console.print() break + line = line.strip() if not line: continue - # Meta-commands (`:help`, `:set`, `clear`, `exit`, etc.) - meta = _handle_meta(line, state, command_help) + # The prompt area is erased on submit (erase_when_done=True), so echo + # what the user typed into scrollback. Single line, ❯ + dim text — + # cleaner and more resize-safe than the old `─── cmd ──` divider. + echo = Text() + echo.append("❯ ", style=f"bold {BEE_YELLOW}") + echo.append(line, style="dim") + err_console.print(echo) + + # Slash / bare meta-commands + meta = _handle_meta( + line, state, command_help, all_known_flags, bool_flags, choice_flags + ) if meta == "quit": break if meta == "ok": @@ -620,7 +1004,6 @@ def run_repl(cli_group: Any, version: str) -> None: if not args: continue - # Unknown command + suggestion (fast path before click runs) cmd_name = args[0] if cmd_name not in command_flags: suggestion = _suggest(cmd_name, command_names) @@ -633,24 +1016,17 @@ def run_repl(cli_group: Any, version: str) -> None: err_console.print(f" [bold {BEE_RED}]unknown:[/] {cmd_name}") continue - # Apply session defaults args = state.apply_settings_to_args(args) - - # Output frame: divider above - _print_command_header(args) start = time.monotonic() status = "ok" try: cli_group.main(args, standalone_mode=False) except click.UsageError as e: - # Click usage error: try to suggest a flag if message is "no such option" msg = str(e) err_console.print(f" [bold {BEE_RED}]usage:[/] {msg}") if "no such option" in msg.lower(): - # Extract the bad flag and suggest - import re as _re - m = _re.search(r"--?[A-Za-z0-9-]+", 
msg) + m = re.search(r"--?[A-Za-z0-9-]+", msg) if m: bad = m.group(0) suggestion = _suggest(bad, command_flags.get(cmd_name, [])) @@ -674,7 +1050,6 @@ def run_repl(cli_group: Any, version: str) -> None: duration = time.monotonic() - start _print_command_footer(status, duration) - # Update session state for the toolbar state.last_command = cmd_name state.last_status = status state.last_duration = duration diff --git a/src/scrapingbee_cli/theme.py b/src/scrapingbee_cli/theme.py index 0587b91..2f839dc 100644 --- a/src/scrapingbee_cli/theme.py +++ b/src/scrapingbee_cli/theme.py @@ -127,36 +127,125 @@ def _render_inline_bee(frame_idx: int) -> Text: # -- Spinner ----------------------------------------------------------------- -_DOT_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"] +# Hex bloom — a "honey crystallising" cycle expressed as a 3-cell-wide +# animation that radiates from the centre outward. Pure geometry, no +# mascot: dot grows into a honeycomb cell, peaks at a four-pointed +# sparkle (the moment crystals form), then drains back. +# +# The middle cell is the focal point and stays anchored; "halo" cells +# appear and disappear symmetrically so the bloom feels like it's growing +# in all directions, not rightward. +# +# Each frame pairs a 3-character composition with a colour from a +# dim→bright→warm gradient so the eye reads a glowing, breathing shape. +# +# Frames (centre + halo, always 3 cells wide): +# " · " dust (dim grey) +# " • " speck (dim amber) +# "·⬡·" outline + halo (amber) +# "·⬢·" honeycomb + halo (bright yellow) +# "⬡✦⬡" sparkle + halo (warm yellow-orange — PEAK / crystallised) +# "·⬢·" descending +# "·⬡·" +# " • " +_HEX_BLOOM_FRAMES: list[tuple[str, str]] = [ + (" · ", "#555555"), + (" • ", "#886600"), + ("·⬡·", "#BAA000"), + ("·⬢·", "#FFCD23"), + ("⬡✦⬡", "#FFB13D"), + ("·⬢·", "#FFCD23"), + ("·⬡·", "#BAA000"), + (" • ", "#886600"), +] + +# Per-command verbs that rotate during the pulse — keep them short and active. 
+_PHRASES: dict[str, list[str]] = { + "scrape": ["Fetching", "Rendering", "Extracting"], + "crawl": ["Crawling", "Following links", "Discovering"], + "google": ["Searching", "Querying"], + "fast-search": ["Searching"], + "amazon-product": ["Fetching product"], + "amazon-search": ["Searching Amazon"], + "walmart-product": ["Fetching product"], + "walmart-search": ["Searching Walmart"], + "youtube-search": ["Searching"], + "youtube-metadata": ["Fetching metadata"], + "chatgpt": ["Querying", "Thinking"], + "usage": ["Checking credits"], + "sitemap": ["Fetching sitemap"], +} + +_FRAME_INTERVAL = 0.08 # seconds per frame ⇒ ~12 fps, smooth bloom +_PHRASE_DURATION_FRAMES = 30 # rotate verb every ~2.4s +_SHIMMER_DIVISOR = 2 # shimmer advances every N bloom frames + +# Shimmer palette — one bright "peak" cell sweeps across the verb, with two +# flank cells receiving softer highlights so the glim feels like a wave +# instead of a hard cursor. +_SHIMMER_PEAK = "#FFFFFF" +_SHIMMER_FLANK = "#FFE780" + + +def _shimmer_text(text: str, position: int, base_color: str) -> Text: + """Render `text` with a glimmer of light at `position`. + + The character at `position` is bright white; characters at ±1 are warm + light yellow; everything else uses `base_color`. Combined with a position + that advances each frame, this reads as a glow sweeping across the word. + """ + out = Text() + for i, ch in enumerate(text): + distance = abs(i - position) + if distance == 0: + style = f"bold {_SHIMMER_PEAK}" + elif distance == 1: + style = f"bold {_SHIMMER_FLANK}" + else: + style = f"bold {base_color}" + out.append(ch, style=style) + return out class MiniBeeSpinner: - """Single-line dot spinner with the command name as the label. + """Single-line hex-bloom spinner with a rotating command verb. Usage:: with MiniBeeSpinner("scrape"): await do_request() - Output is one steady line: a rotating braille-dot frame followed by the - command name.
No emoticons, no rotating fun facts, no time-of-day - flavour — just a clean status indicator. + Renders one line: a hex-bloom glyph that cycles (· → • → ⬡ → ⬢ → ✦ and back), a + short verb that rotates every ~2.4s ("Fetching" / "Rendering" / ...), + and an elapsed-time counter once the operation passes 0.5s. """ def __init__(self, message: str = "") -> None: self._label = message + # Resolve the verb cycle: per-command phrases if known, else just the + # label as a single static verb. + self._phrases = _PHRASES.get(message, [message] if message else ["Working"]) self._stop = threading.Event() self._thread: threading.Thread | None = None def _animate(self) -> None: + import time + + start = time.monotonic() idx = 0 while not self._stop.is_set(): - frame = _DOT_FRAMES[idx % len(_DOT_FRAMES)] + glyph, color = _HEX_BLOOM_FRAMES[idx % len(_HEX_BLOOM_FRAMES)] + phrase = self._phrases[(idx // _PHRASE_DURATION_FRAMES) % len(self._phrases)] + shimmer_pos = (idx // _SHIMMER_DIVISOR) % max(1, len(phrase)) + elapsed = time.monotonic() - start + line = Text() line.append(" ") - line.append(frame, style=f"bold {BEE_YELLOW}") - if self._label: - line.append(f" {self._label}", style="dim") + line.append(glyph, style=f"bold {color}") + line.append(" ") + line.append_text(_shimmer_text(phrase, shimmer_pos, BEE_YELLOW)) + if elapsed >= 0.5: + line.append(f" · {elapsed:.1f}s", style="dim") with err_console.capture() as capture: err_console.print(line, end="") @@ -164,7 +253,7 @@ def _animate(self) -> None: sys.stderr.flush() idx += 1 - self._stop.wait(0.08) + self._stop.wait(_FRAME_INTERVAL) # Clear the spinner line.
sys.stderr.write("\r\033[K") From c4d4a688f2b1b872c00ee7142c4cb972f50aaa3e Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Wed, 13 May 2026 16:52:56 +0530 Subject: [PATCH 06/15] feat(repl): full_screen alt-buffer + virtual scrollback + many fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switches the REPL from prompt_toolkit's full_screen=False (inline) mode to full_screen=True with an in-memory ScrollbackBuffer. Eliminates the wrap-fragment / orphan-toolbar artifacts that bled into terminal scrollback on resize, and gives us full control over rendering for shimmer animations, mouse handling, and pagination. Layout - Pinned banner Window at the top: compact smblock "ScrapingBee" + version + tagline + ":help / :q" hint. Stays visible during long scrapes. - Scrollback Window below the banner; spacer rows + horizontal separator between scrollback and the input area (Claude-CLI style). - Toolbar at the bottom with paginated fields that rotate every 5s (Available Credits / Used Session / Concurrency / Next Update); the mode hint is pinned on every page so it's always visible. - Running-state toolbar pins "running · Xs" on the left, rotates a stat in the middle (so credits consumed are visible during long crawls), and pins "Ctrl+C to stop" on the right. Output handling - ScrollbackBuffer + ScrollbackWriter pipe stdout / stderr / err_console through ANSI-parsing into an in-memory line list rendered by a scrollable Window. 10K-line ring buffer. - Visual-row scroll (not logical-line): scroll_offset measured in terminal rows with width-aware line splitting, so long single-line output (huge JSON, etc.) scrolls one terminal row per wheel tick. - Command echo splices into scrollback at the position where output started, on completion — no echo during execution (only the shimmer is the live indicator), echo appears right above output when done. 
Input / interaction - Mouse mode 1000 captures wheel/trackpad scroll; native drag-select still works because the terminal owns motion events. Tab toggles Scroll vs Select mode at runtime; toolbar hint shows current mode. - Up/Down arrow keys navigate command history; explicit history.store_string() per submit since the custom Enter binding bypasses Buffer.validate_and_handle(). - Tab completion opens a popup via FloatContainer + CompletionsMenu (was silently entering completion state with no UI). Up/Down navigate, Enter picks, Esc dismisses. - Pager (:view) wraps long lines, defaults to pretty-printed JSON with "r" to toggle raw, runs in a worker thread to avoid asyncio.run() conflict with the outer loop, re-enters alt buffer on exit so the outer REPL doesn't bleed into the main screen. - Resize detection in the ticker triggers app.invalidate() so the layout adapts cleanly. State / usage - SessionState gains api_key_hash + per-session "used_credits_at_start" so re-auth with the same key preserves the session counter; a different key resets it. - Background usage refresher polls /usage every 30s; "usage" command completion + auth completion trigger an immediate refresh via a thread-safe event. - Banner shows "API key not set — type auth" when no key is configured. - :help wrapping with a proper hanging indent (Text objects, not Rich markup, so leading whitespace is preserved); blank row between categories. Crawl - Skip Twisted signal-handler installation in REPL mode (signal.signal requires the main thread, but commands run in worker threads). - Wire LOG_FILE to ~/.cache/scrapingbee-cli/crawl.log in REPL mode so the full crawl log is preserved beyond scrollback's MAX_LINES. - Initialise usage_info to None before the batch-usage try block to prevent UnboundLocalError when the initial fetch raises. 
Misc - cli_utils: always overwrite the last-output cache for text responses in REPL mode (not just truncated ones) so :view never shows stale output from a previous command. - Ctrl+C while running injects KeyboardInterrupt into the worker via PyThreadState_SetAsyncExc; surfaces as "stopped" in the footer. - Reverted earlier experiments with Braille / PIL-rendered logos. --- src/scrapingbee_cli/cli.py | 11 +- src/scrapingbee_cli/cli_utils.py | 21 +- src/scrapingbee_cli/commands/crawl.py | 2 + src/scrapingbee_cli/commands/usage.py | 3 - src/scrapingbee_cli/crawl.py | 55 +- src/scrapingbee_cli/interactive.py | 2251 ++++++++++++++++++++++--- src/scrapingbee_cli/theme.py | 14 +- 7 files changed, 2142 insertions(+), 215 deletions(-) diff --git a/src/scrapingbee_cli/cli.py b/src/scrapingbee_cli/cli.py index ff913bc..f1bb8c0 100644 --- a/src/scrapingbee_cli/cli.py +++ b/src/scrapingbee_cli/cli.py @@ -71,8 +71,15 @@ def _show_active_schedules_hint() -> None: @click.group(invoke_without_command=True) @click.version_option(version=__version__) +@click.option( + "--keep-bg", + is_flag=True, + default=False, + help="Keep the terminal's current background and theme colours instead " + "of forcing the REPL to black/light-grey.", +) @click.pass_context -def cli(ctx: click.Context) -> None: +def cli(ctx: click.Context, keep_bg: bool) -> None: """ScrapingBee CLI - Web scraping API client. 
Commands: scrape (single or batch), crawl (Scrapy/quick-crawl), usage, @@ -89,7 +96,7 @@ def cli(ctx: click.Context) -> None: _in_repl = True try: - run_repl(cli, __version__) + run_repl(cli, __version__, keep_bg=keep_bg) finally: _in_repl = False diff --git a/src/scrapingbee_cli/cli_utils.py b/src/scrapingbee_cli/cli_utils.py index fd7c045..bf19dba 100644 --- a/src/scrapingbee_cli/cli_utils.py +++ b/src/scrapingbee_cli/cli_utils.py @@ -55,15 +55,10 @@ def _maybe_repl_preview(data: bytes) -> tuple[bytes, str | None, str | None]: if not is_text: return data, None, None - line_count = data.count(b"\n") + 1 - if ( - len(data) <= _REPL_PREVIEW_MAX_BYTES - and line_count <= _REPL_PREVIEW_MAX_LINES - ): - return data, None, None - - # Save the full payload to a fixed cache file the user can scroll through - # via :view (or `less` directly). + # Always overwrite the ``last-output`` cache for every response, even + # short ones. Otherwise ``:view`` would happily display a stale large + # response from a previous command — the cache file would only get + # refreshed by responses big enough to trigger the truncation branch. full_path: str | None = None try: from pathlib import Path @@ -76,6 +71,14 @@ def _maybe_repl_preview(data: bytes) -> tuple[bytes, str | None, str | None]: except Exception: full_path = None + line_count = data.count(b"\n") + 1 + if ( + len(data) <= _REPL_PREVIEW_MAX_BYTES + and line_count <= _REPL_PREVIEW_MAX_LINES + ): + # Small enough to print inline — but the cache is still fresh. 
+ return data, None, None + text = data.decode("utf-8", errors="replace") lines = text.split("\n") line_preview = "\n".join(lines[: _REPL_PREVIEW_MAX_LINES]) diff --git a/src/scrapingbee_cli/commands/crawl.py b/src/scrapingbee_cli/commands/crawl.py index 4f234da..70681ed 100644 --- a/src/scrapingbee_cli/commands/crawl.py +++ b/src/scrapingbee_cli/commands/crawl.py @@ -432,6 +432,7 @@ def crawl_cmd( if not target: click.echo("Provide a spider name, one or more URLs, or --from-sitemap URL.", err=True) raise SystemExit(1) + usage_info: dict | None = None try: usage_info = get_batch_usage(None) concurrency = resolve_batch_concurrency(obj["concurrency"], usage_info, 1) @@ -443,6 +444,7 @@ def crawl_cmd( "Use --concurrency to set explicitly.", err=True, ) + usage_info = None concurrency = 1 from_concurrency = False plan_concurrency = 0 diff --git a/src/scrapingbee_cli/commands/usage.py b/src/scrapingbee_cli/commands/usage.py index beadfe7..a358f53 100644 --- a/src/scrapingbee_cli/commands/usage.py +++ b/src/scrapingbee_cli/commands/usage.py @@ -62,7 +62,6 @@ def _show_repl_usage(data: bytes) -> None: from ..theme import ( BEE_YELLOW, - _render_inline_bee, echo_key_value, echo_separator, err_console, @@ -72,8 +71,6 @@ def _show_repl_usage(data: bytes) -> None: raw = _json.loads(data) header = Text() - header.append(" ") - header.append_text(_render_inline_bee(0)) header.append(" Credit Usage", style=f"bold {BEE_YELLOW}") err_console.print(header) err_console.print() diff --git a/src/scrapingbee_cli/crawl.py b/src/scrapingbee_cli/crawl.py index 363900c..1122c14 100644 --- a/src/scrapingbee_cli/crawl.py +++ b/src/scrapingbee_cli/crawl.py @@ -29,6 +29,51 @@ SCRAPINGBEE_MIDDLEWARE = "scrapy_scrapingbee.ScrapingBeeMiddleware" MIDDLEWARE_PRIORITY = 725 + +def _install_signal_handlers() -> bool: + """Whether Scrapy / Twisted should install Unix signal handlers. 
+ + Returns False when running inside the REPL — there we run crawl in a + worker thread (to avoid asyncio.run conflicting with prompt_toolkit's + main-thread loop), and ``signal.signal()`` is restricted to the main + thread, so any attempt to install handlers raises ``ValueError: + signal only works in main thread of the main interpreter``. The REPL + provides its own Ctrl+C handling that injects ``KeyboardInterrupt`` + into the worker thread, so we don't need Scrapy's handlers there. + + Returns True for direct ``scrapingbee crawl ...`` invocations — those + run on the main thread and benefit from Twisted's graceful shutdown. + """ + try: + from .theme import is_repl_mode + return not is_repl_mode() + except Exception: + return True + + +def _maybe_set_repl_log_file(settings) -> str | None: + """In REPL mode, also pipe Scrapy logs to a file on disk. + + The REPL's virtual scrollback caps at ~10K lines and drops the oldest + 10% when full, so long crawls lose history. Setting ``LOG_FILE`` makes + Scrapy write its full log stream to the given path. NOTE(review): Scrapy + documents LOG_FILE as replacing stderr logging, not adding a sink — verify. + Returns the log path so the caller can surface it in the UI, or None + if logging-to-file wasn't enabled (non-REPL or on filesystem failure).
+ """ + try: + from .theme import is_repl_mode + if not is_repl_mode(): + return None + log_dir = Path.home() / ".cache" / "scrapingbee-cli" + log_dir.mkdir(parents=True, exist_ok=True) + log_path = log_dir / "crawl.log" + settings.set("LOG_FILE", str(log_path)) + settings.set("LOG_FILE_APPEND", False) # fresh log per run + return str(log_path) + except Exception: + return None + # 0 means unlimited DEFAULT_MAX_DEPTH = 0 DEFAULT_MAX_PAGES = 0 @@ -617,9 +662,12 @@ def run_project_spider( download_delay=download_delay, autothrottle_enabled=autothrottle_enabled, ) + log_path = _maybe_set_repl_log_file(settings) + if log_path: + click.echo(f"REPL mode: full crawl log → {log_path}", err=True) process = CrawlerProcess(settings) process.crawl(spider_name) - process.start() + process.start(install_signal_handlers=_install_signal_handlers()) finally: os.chdir(orig_cwd) @@ -682,6 +730,9 @@ def run_urls_spider( settings.set("LOG_LEVEL", "WARNING") if max_pages > 0: settings.set("CLOSESPIDER_PAGECOUNT", max_pages) + log_path = _maybe_set_repl_log_file(settings) + if log_path: + click.echo(f"REPL mode: full crawl log → {log_path}", err=True) process = CrawlerProcess(settings) process.crawl( GenericScrapingBeeSpider, @@ -699,4 +750,4 @@ def run_urls_spider( exclude_pattern=exclude_pattern, save_pattern=save_pattern, ) - process.start() + process.start(install_signal_handlers=_install_signal_handlers()) diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index 4dff462..d9de2ff 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -1,22 +1,27 @@ -"""Interactive REPL — Claude-style bordered input box with status toolbar. - -Built on prompt_toolkit's `Application` API (not `PromptSession`) so we can -custom-layout the input area as a `Frame` with a chevron prompt mark, a -bottom-anchored toolbar showing live state (credits, last cmd, settings), -and a per-input syntax-highlighting lexer. 
- -Output from each command flows above the input box; the box stays anchored -where the cursor was when the prompt opened. - -Goals (revised; the previous version drifted from these): -- Bordered input box, anchored bottom of the prompt area. -- Restrained palette: yellow accent, soft amber chrome, dim greys, semantic - green / red. No yellow-on-yellow, no mascot, no animation. -- Slash-prefixed REPL meta-commands (`:help`, `:q`, `:clear`, `:set`, ...). -- Per-command tab completion driven by walking the click tree. -- Toolbar with credits gauge, last status icon, `:set` chips, hint line. -- Inline syntax highlighting: command, flags, URLs, quoted strings. -- "Did you mean?" on typos. Multi-line input via trailing backslash. +"""Interactive REPL — Ink-style hybrid (real scrollback + persistent bottom prompt). + +The pattern is the same one Claude CLI uses (see Ink's `<Static>` component): +- Past command output is printed to real terminal stdout → goes into terminal + scrollback. Mouse-wheel scrolling and selection work normally, resize is + handled by the terminal, and quitting leaves a clean record behind. +- The input area + status toolbar live at the very bottom of the terminal as + a small persistent `Application(full_screen=False)`. prompt_toolkit's + `patch_stdout` redraws this strip whenever something prints, so the prompt + is always visible no matter how many lines of output flow above. + +That means: typing a command, hitting enter, watching output stream in +*above* the prompt — exactly the Claude experience — without losing real +terminal scrollback or selection. + +Implementation notes: +- ONE persistent Application for the whole REPL session (not one-per-prompt). +- Enter key binding runs the click command synchronously inside the handler. + Output from the command goes through patched stdout/stderr and lands above + the prompt. +- Interactive commands (tutorial, auth) take over the terminal via + `run_in_terminal` so click.prompt() works.
+- On launch we pad with newlines so the prompt anchors at the bottom from + the first frame. """ from __future__ import annotations @@ -25,6 +30,7 @@ import re import shlex import sys +import threading import time from typing import TYPE_CHECKING, Any, Iterable @@ -51,7 +57,9 @@ "rule": _AMBER, # Prompt mark inside the input area "promptmark": f"{BEE_YELLOW} bold", - # Lexer (input syntax highlighting) + # Lexer (input syntax highlighting). Specific categories have explicit + # colours; unstyled tokens fall through to the application's default + # style (key `""`), which is set per-session in `_style_dict_for`. "lexer.cmd": f"{BEE_YELLOW} bold", "lexer.flag": _AMBER, "lexer.url": _URL_CYAN, @@ -75,6 +83,350 @@ } +def _style_dict_for(keep_bg: bool) -> dict[str, str]: + """Return the prompt_toolkit Style dict for the REPL session. + + When `keep_bg` is False (default), set the empty class `""` (the default + style) to a dark-theme foreground. Combined with the OSC 11/10 escapes + that switch the *terminal* fg/bg to dark, this gives a single coherent + "dark theme" applied at both layers — explicit class colours stay as-is, + and any unstyled text falls back to a readable light-grey. + + With `keep_bg=True`, the default class is empty and the terminal's own + fg/bg are untouched — the user's system theme drives all defaults. + """ + style = dict(_STYLE_DICT) + if not keep_bg: + style[""] = "fg:#EAEAEA" + return style + + +# --------------------------------------------------------------------------- +# Binary-write adapter +# --------------------------------------------------------------------------- + + +class _BinaryAdapter: + """Adapter that exposes a ``.write(bytes)`` interface on top of a text + stream. Bolted onto prompt_toolkit's StdoutProxy at runtime so callers + that write bytes (``sys.stdout.buffer.write(b"...")``) work transparently + while we're inside a ``patch_stdout`` context. 
+ """ + + def __init__(self, text_stream) -> None: + self._stream = text_stream + + def write(self, data) -> int: + if data is None or len(data) == 0: + return 0 + if isinstance(data, (bytes, bytearray, memoryview)): + text = bytes(data).decode("utf-8", errors="replace") + else: + text = str(data) + self._stream.write(text) + return len(data) + + def flush(self) -> None: + try: + self._stream.flush() + except Exception: + pass + + @property + def closed(self) -> bool: + return False + + +# --------------------------------------------------------------------------- +# Virtual scrollback (for full_screen=True mode) +# --------------------------------------------------------------------------- + + +def _split_fragments_to_width( + line: list[tuple[str, str]], width: int +) -> list[list[tuple[str, str]]]: + """Split a logical line's (style, text) fragments into a list of + visual rows, each at most ``width`` characters wide. + + Empty input → one empty row (so blank lines still occupy one row). + Preserves styles across the split — if a styled fragment crosses a + row boundary, the boundary lands inside the fragment with the same + style on both sides. + """ + if width <= 0: + return [list(line)] + if not line: + return [[]] + out: list[list[tuple[str, str]]] = [] + current: list[tuple[str, str]] = [] + current_len = 0 + for sty, text in line: + if not text: + continue + i = 0 + n = len(text) + while i < n: + room = width - current_len + if room <= 0: + out.append(current) + current = [] + current_len = 0 + room = width + chunk = text[i : i + room] + current.append((sty, chunk)) + current_len += len(chunk) + i += len(chunk) + if current or not out: + out.append(current) + return out + + +class ScrollbackBuffer: + """In-memory line buffer that backs the scrollable output Window. + + When the REPL runs in full_screen mode we own the alt buffer, so command + output can't flow into real terminal scrollback. 
Instead, every line of + output gets parsed for ANSI escapes and stored as a list of + ``(style, text)`` fragments. The render callback for the output Window + asks the buffer for a slice based on the current scroll offset. + + Thread-safe append: command output is written from worker threads and + the renderer reads from the loop thread; a lock keeps the list + consistent without trying to be clever. + """ + + MAX_LINES = 10_000 # ring-buffer cap so a runaway scrape can't OOM us + + def __init__(self) -> None: + self.lines: list[list[tuple[str, str]]] = [] + # How many lines we're scrolled up from the bottom. 0 = at bottom + # (auto-follow); positive = locked at some scrolled-up position. + self.scroll_offset = 0 + self._lock = threading.Lock() + + def append_fragments(self, fragments: list[tuple[str, str]]) -> None: + """Append one rendered line (already styled) as the final entry.""" + with self._lock: + self.lines.append(list(fragments)) + if len(self.lines) > self.MAX_LINES: + # Drop the oldest 10% — cheaper than dropping one at a time + # if a scrape produces tens of thousands of lines. + drop = self.MAX_LINES // 10 + del self.lines[:drop] + + def append_ansi_text(self, text: str) -> None: + """Parse ANSI codes in ``text`` and append the resulting line(s). + + Handles partial-line writes: callers may write text without a + trailing newline (e.g. an in-progress progress bar). We split on + ``\\n``; the final post-split chunk goes into a pending buffer + that gets prepended to the next write. + """ + from prompt_toolkit.formatted_text import ANSI, to_formatted_text + + # Combine with anything pending from a previous partial write. 
+ with self._lock: + pending = self._pending if hasattr(self, "_pending") else "" + combined = pending + text + chunks = combined.split("\n") + self._pending = chunks[-1] # may be empty if text ended with \n + complete = chunks[:-1] + + for raw in complete: + try: + fragments = list(to_formatted_text(ANSI(raw))) + except Exception: + fragments = [("", raw)] + self.append_fragments(fragments) + + def flush_pending(self) -> None: + """Commit any pending partial line as its own row.""" + with self._lock: + pending = getattr(self, "_pending", "") + self._pending = "" + if pending: + from prompt_toolkit.formatted_text import ANSI, to_formatted_text + + try: + fragments = list(to_formatted_text(ANSI(pending))) + except Exception: + fragments = [("", pending)] + self.append_fragments(fragments) + + def get_visible_window( + self, height: int + ) -> list[list[tuple[str, str]]]: + """Backwards-compatible: visible slice in *logical* lines.""" + with self._lock: + total = len(self.lines) + if total == 0: + return [] + max_offset = max(0, total - height) + if self.scroll_offset > max_offset: + self.scroll_offset = max_offset + end = total - self.scroll_offset + start = max(0, end - height) + return [list(line) for line in self.lines[start:end]] + + def get_visible_visual( + self, height: int, width: int + ) -> list[list[tuple[str, str]]]: + """Return visible content in *visual rows* (post-wrap). + + Long single lines that wrap to multiple terminal rows are + pre-split here at ``width`` characters so each entry in the + returned list is exactly one terminal row. ``scroll_offset`` + is in visual rows too, so one ``scroll_up(1)`` step moves the + view by exactly one visible row — even through a 5000-char + JSON blob that wraps to dozens of rows. This is what makes + wheel/trackpad scrolling feel consistent regardless of line + length. 
+ """ + if width <= 1: + return self.get_visible_window(height) + with self._lock: + # Walk from the bottom up, accumulating visual rows until we + # have enough to fill the window at the requested scroll offset. + # Stops early on large buffers — we don't need to wrap content + # the user can't see this frame. + need = max(0, self.scroll_offset) + max(1, height) + collected: list[list[tuple[str, str]]] = [] # newest-first + for line in reversed(self.lines): + for visual_row in reversed(_split_fragments_to_width(line, width)): + collected.append(visual_row) + if len(collected) >= need: + break + collected.reverse() # back to oldest-first + total = len(collected) + max_offset = max(0, total - height) + if self.scroll_offset > max_offset: + self.scroll_offset = max_offset + end = total - self.scroll_offset + start = max(0, end - height) + return collected[start:end] + + def scroll_up(self, n: int = 1) -> None: + with self._lock: + # Soft cap — get_visible_window will further clamp based on + # the actual rendered height, but capping here at total-1 + # avoids letting offset grow unboundedly between renders. + self.scroll_offset = min( + max(0, len(self.lines) - 1), self.scroll_offset + n + ) + + def scroll_down(self, n: int = 1) -> None: + with self._lock: + self.scroll_offset = max(0, self.scroll_offset - n) + + def scroll_to_top(self) -> None: + with self._lock: + self.scroll_offset = max(0, len(self.lines) - 1) + + def scroll_to_bottom(self) -> None: + with self._lock: + self.scroll_offset = 0 + + @property + def at_bottom(self) -> bool: + with self._lock: + return self.scroll_offset == 0 + + def insert_line(self, index: int, fragments: list[tuple[str, str]]) -> None: + """Insert a single line at ``index`` (clamped to current length). 
+ + Used to retroactively splice the command-echo line in front of a + finished command's output, so the user sees ``❯ `` above the + output rows the command produced — without the echo being visible + during execution itself (where the shimmer is the live indicator). + """ + with self._lock: + i = max(0, min(index, len(self.lines))) + self.lines.insert(i, list(fragments)) + + def current_length(self) -> int: + with self._lock: + return len(self.lines) + + +class ScrollbackWriter: + """File-like writer that pipes everything into a ScrollbackBuffer. + + Installed as ``sys.stdout`` / ``sys.stderr`` while the REPL runs. + Click commands, rich consoles, plain ``print`` calls — all flow + through here, get parsed for ANSI, and end up as rows in the + scrollback. The renderer then displays them. + + Thread-safe: command output comes from worker threads while the + prompt_toolkit loop renders on the main thread. + """ + + encoding = "utf-8" + + def __init__(self, scrollback: ScrollbackBuffer, on_write: Any = None) -> None: + self._sb = scrollback + self._on_write = on_write # callable to nudge the app to re-render + + def write(self, s: Any) -> int: + if not s: + return 0 + if isinstance(s, (bytes, bytearray, memoryview)): + s = bytes(s).decode("utf-8", errors="replace") + elif not isinstance(s, str): + s = str(s) + self._sb.append_ansi_text(s) + if self._on_write is not None: + try: + self._on_write() + except Exception: + pass + return len(s) + + def flush(self) -> None: + # No-op — we don't buffer beyond ScrollbackBuffer's pending partial. 
+ pass + + def isatty(self) -> bool: + return True # let click / rich treat us as a tty so colors stay on + + @property + def closed(self) -> bool: + return False + + def writable(self) -> bool: + return True + + +# --------------------------------------------------------------------------- +# Shimmer (prompt_toolkit-formatted) +# --------------------------------------------------------------------------- + +# Used for the live "running command" line above the input. A bright white +# "peak" cell sweeps across the line, flanked by warm-yellow cells, with the +# rest in brand yellow — reads as a glow running along the command text. +_SHIMMER_PEAK_PT = "#FFFFFF" +_SHIMMER_FLANK_PT = "#FFE780" + + +def _shimmer_pt(text: str, position: int, base_color: str) -> list[tuple[str, str]]: + """Return prompt_toolkit formatted-text tuples with a shimmer at `position`. + + Character at `position` is peak white, neighbours at ±1 are warm yellow, + everything else uses ``base_color``. Combined with a position that + advances each tick this reads as a wave of light along the text. 
+ """ + out: list[tuple[str, str]] = [] + for i, ch in enumerate(text): + distance = abs(i - position) + if distance == 0: + style = f"bold fg:{_SHIMMER_PEAK_PT}" + elif distance == 1: + style = f"bold fg:{_SHIMMER_FLANK_PT}" + else: + style = f"bold fg:{base_color}" + out.append((style, ch)) + return out + + # --------------------------------------------------------------------------- # Click tree introspection # --------------------------------------------------------------------------- @@ -123,13 +475,40 @@ def _walk_click_tree(cli_group: Any) -> tuple[ class SessionState: """REPL-wide mutable state surfaced in the bottom toolbar.""" + USAGE_REFRESH_INTERVAL = 30.0 # seconds between background usage API calls + def __init__(self) -> None: self.last_command: str | None = None self.last_status: str | None = None # "ok" | "fail" self.last_duration: float | None = None - self.credits: int | None = None - self.credits_total: int | None = None + # Live account state — surfaced in the toolbar. None ⇒ unknown / N/A. + self.credits: int | None = None # available = max - used + self.credits_total: int | None = None # max_api_credit + self.used_credits: int | None = None # used_api_credit (latest) + self.used_credits_at_start: int | None = None # snapshotted after first ok refresh + self.max_concurrency: int | None = None + self.current_concurrency: int | None = None + # Whether the API key was present when the REPL started (or after auth). + # Drives "N/A" rendering in the toolbar while False. + self.api_key_set: bool = False + # Short hash of the live API key. Used to detect logout/relogin with + # the same key — when the key is unchanged we keep the session + # counter going instead of resetting it to 0. 
+ self.api_key_hash: str | None = None + self.last_usage_refresh_mono: float | None = None # time.monotonic() of last ok refresh self.settings: dict[str, str] = {} + # In-flight execution state — drives the live "running" line above + # the input (with shimmer sweep) and the toolbar's running indicator. + self.is_running: bool = False + self.running_command: str | None = None + self.running_command_text: str | None = None # full line as typed + self.run_start: float | None = None + self.tick: int = 0 # frame counter for the shimmer position + # Mouse mode toggle: "scroll" = mouse_support on (wheel scrolls the + # virtual buffer, drag-select needs a per-terminal modifier); + # "select" = mouse_support off (native drag-select works everywhere + # without a modifier, but wheel scroll stops). Alt+S toggles. + self.mouse_mode: str = "scroll" def apply_settings_to_args(self, args: list[str]) -> list[str]: if not self.settings: @@ -144,6 +523,19 @@ def apply_settings_to_args(self, args: list[str]) -> list[str]: return out def refresh_credits_from_cache(self) -> None: + """Populate live fields from the on-disk usage cache. + + Cache file shape (written by ``batch.write_usage_file_cache``): + ``{"ts": , "key_hash": , "data": }`` + where ``data`` is the output of ``client.parse_usage``: + ``{"credits": int, "max_api_credit": int, "max_concurrency": int}`` + + Only the ``data`` sub-dict has the values we care about; reading any + other key would just see metadata. Earlier versions iterated + ``data.values()`` and relied on the fact that the inner dict happened + to have matching keys — works by accident, brittle if the cache + format ever grows. 
+ """ try: import json from pathlib import Path @@ -151,20 +543,71 @@ def refresh_credits_from_cache(self) -> None: cache = Path.home() / ".config" / "scrapingbee-cli" / "usage_cache.json" if not cache.exists(): return - data = json.loads(cache.read_text(encoding="utf-8")) - entries = data.values() if isinstance(data, dict) else [] - for entry in entries: - if not isinstance(entry, dict): - continue - if isinstance(entry.get("credits"), int): - self.credits = entry["credits"] - if isinstance(entry.get("max_api_credit"), int): - self.credits_total = entry["max_api_credit"] - if self.credits is not None: - return + entry = json.loads(cache.read_text(encoding="utf-8")) + if not isinstance(entry, dict): + return + data = entry.get("data") + if not isinstance(data, dict): + return + if isinstance(data.get("credits"), int): + self.credits = data["credits"] + if isinstance(data.get("max_api_credit"), int): + self.credits_total = data["max_api_credit"] + if isinstance(data.get("max_concurrency"), int): + self.max_concurrency = data["max_concurrency"] except Exception: return + def update_from_usage_response(self, raw: dict, key_hash: str | None = None) -> None: + """Apply a parsed JSON usage-API response to the live state. + + Snapshots ``used_credits_at_start`` on first successful update so the + toolbar's "used this session" remains accurate even if the REPL was + launched before the first refresh succeeded. If ``key_hash`` is + provided and differs from the previous one, the session start + snapshot is reset — so logging out and back in with a *different* + key starts the counter at 0, but re-auth with the *same* key keeps + counting from where it left off. + """ + if key_hash is not None and key_hash != self.api_key_hash: + # Key changed (initial set OR switched to a different key) — + # forget the previous session's baseline so the next snapshot + # below establishes a fresh one. 
+ self.used_credits_at_start = None + self.api_key_hash = key_hash + max_credit = raw.get("max_api_credit") + used_credit = raw.get("used_api_credit") + if isinstance(max_credit, (int, float)): + self.credits_total = int(max_credit) + if isinstance(used_credit, (int, float)): + self.used_credits = int(used_credit) + if self.used_credits_at_start is None: + self.used_credits_at_start = int(used_credit) + if self.credits_total is not None and self.used_credits is not None: + self.credits = max(0, self.credits_total - self.used_credits) + mc = raw.get("max_concurrency") + if isinstance(mc, (int, float)): + self.max_concurrency = int(mc) + cc = raw.get("current_concurrency") + if isinstance(cc, (int, float)): + self.current_concurrency = int(cc) + self.last_usage_refresh_mono = time.monotonic() + + @property + def session_credits_used(self) -> int | None: + if self.used_credits is None or self.used_credits_at_start is None: + return None + return max(0, self.used_credits - self.used_credits_at_start) + + @property + def seconds_until_next_refresh(self) -> int | None: + if self.last_usage_refresh_mono is None: + return None + remaining = ( + self.last_usage_refresh_mono + self.USAGE_REFRESH_INTERVAL - time.monotonic() + ) + return max(0, int(remaining + 0.999)) # ceil so the countdown never shows -1 + # --------------------------------------------------------------------------- # Helpers @@ -249,6 +692,9 @@ def get_line(lineno: int): ): tokens.append(("class:lexer.string", piece)) else: + # Inherit the app default style (`""`), which is set + # to light-grey foreground when --keep-bg is off and + # left empty (terminal default) when --keep-bg is on. 
tokens.append(("", piece)) return tokens @@ -269,69 +715,279 @@ def _make_toolbar(state: SessionState): - Wide: credits gauge · last cmd · all chips · hint - Medium: credits gauge · last cmd · chip count · hint - Narrow: credits · last cmd · chip count + + While a command is in flight (``state.is_running``) the toolbar shows a + plain "running · s" label; the visual animation lives on the + shimmering command line just above the input. """ def render() -> list[tuple[str, str]]: - import shutil + # Width: prefer prompt_toolkit's live SIGWINCH-tracked size when an + # app is actually running (so the toolbar stays in lockstep with + # what prompt_toolkit's own renderer is using). Outside a run loop, + # ``get_app()`` returns a dummy whose output reports a constant 80 + # — useless — so we fall through to shutil in that case. + width = 0 + try: + from prompt_toolkit.application import get_app as _get_app - width = shutil.get_terminal_size((80, 24)).columns + _app = _get_app() + # get_app() returns a dummy outside a real run loop; its output + # reports a constant 80 — useless. Only trust the live app. + if getattr(_app, "is_running", False): + width = _app.output.get_size().columns + except Exception: + pass + if not width: + import shutil + width = shutil.get_terminal_size((80, 24)).columns segs: list[tuple[str, str]] = [("class:toolbar", " ")] - # --- Credits + gauge -------------------------------------------------- - segs.append(("class:toolbar.label", "credits ")) - if state.credits is not None: - segs.append(("class:toolbar.value", _format_credits(state.credits))) + # --- In-flight: running label + elapsed + rotating usage stats ─── + # Layout: ``running · 12.3s`` pinned on the left, ``Ctrl+C to stop`` + # pinned on the right, and a rotating stat (Used Session / Concurrency + # / Next Update) in the middle. The rotation cycles every 5s so the + # user can monitor credits being consumed during a long scrape + # without leaving the command. 
+ if state.is_running: + segs.append(("class:toolbar.label", "running")) + if state.run_start is not None: + elapsed = time.monotonic() - state.run_start + segs.append(("class:toolbar", f" · {elapsed:.1f}s")) + + # Build rotating stat chunks (subset of the idle toolbar's info). + stat_chunks: list[list[tuple[str, str]]] = [] + if state.api_key_set and state.credits is not None: + stat_chunks.append([ + ("class:toolbar.label", "Available "), + ("class:toolbar.value", _format_credits(state.credits)), + ]) + scu = state.session_credits_used if state.api_key_set else None + stat_chunks.append([ + ("class:toolbar.label", "Used (Session) "), + ("class:toolbar.value", _format_credits(scu) if scu is not None else "N/A"), + ]) + if state.api_key_set and state.max_concurrency is not None: + cur = state.current_concurrency if state.current_concurrency is not None else 0 + stat_chunks.append([ + ("class:toolbar.label", "Concurrency "), + ("class:toolbar.value", f"{cur}/{state.max_concurrency}"), + ]) + if state.api_key_set: + nxt = state.seconds_until_next_refresh + if nxt is not None: + stat_chunks.append([ + ("class:toolbar.label", "Next Update "), + ("class:toolbar.value", f"{nxt}s"), + ]) + + stop_hint = "Ctrl+C to stop" + stop_hint_len = len(stop_hint) + so_far = sum(len(t) for _, t in segs) + # Reserve room for: " · ..." + right-aligned stop hint + available = max(0, width - so_far - stop_hint_len - 6) + + # Pick the stat chunk for this rotation tick — only if it fits. 
+ if stat_chunks and available > 8: + idx = int(time.monotonic() / 5) % len(stat_chunks) + chunk = stat_chunks[idx] + chunk_len = sum(len(t) for _, t in chunk) + if chunk_len + 5 <= available: + segs.append(("class:toolbar", " · ")) + segs.extend(chunk) + + # Setting chips still show below if any room remains + if state.settings: + so_far = sum(len(t) for _, t in segs) + budget = max(0, width - so_far - stop_hint_len - 4) + shown = 0 + for k, v in state.settings.items(): + chip = f" {k}={v} " + if budget < len(chip) + 2 and shown > 0: + break + segs.append(("class:toolbar", " ")) + segs.append(("class:toolbar.chip", chip)) + budget -= len(chip) + 2 + shown += 1 + remaining = len(state.settings) - shown + if remaining > 0: + segs.append(("class:toolbar", " ")) + segs.append(("class:toolbar.hint", f"+{remaining} more")) + + # Right-align "Ctrl+C to stop" hint + used = sum(len(t) for _, t in segs) + if width - used > stop_hint_len + 4: + segs.append(("class:toolbar", " " * max(2, width - used - stop_hint_len - 2))) + segs.append(("class:toolbar.hint", stop_hint)) + return segs + + # --- Idle: build all fields, then either render statically or paginate + # When the joined toolbar text exceeds the terminal width we'd + # otherwise emit a line longer than the screen — the terminal soft- + # wraps it into a phantom 2nd row that prompt_toolkit doesn't know + # about, leaving a ghost-toolbar in scrollback on resize. To keep + # everything visible without scrolling jitter, we greedy-pack fields + # into "pages" that each fit, then cycle pages every PAGE_SECONDS. + # Each page is rendered statically — no per-frame motion — so it + # reads cleanly and doesn't waste redraws. 
+ fields: list[list[tuple[str, str]]] = [] + + # Available Credits + avail: list[tuple[str, str]] = [("class:toolbar.label", "Available Credits ")] + if state.api_key_set and state.credits is not None: + avail.append(("class:toolbar.value", _format_credits(state.credits))) if state.credits_total: used_pct = max( 0, min(100, 100 - int(state.credits / state.credits_total * 100)), ) - segs.append(("class:toolbar", " ")) - segs.append(("class:toolbar.gauge", _credit_gauge(used_pct))) + avail.append(("class:toolbar.hint", f" ({used_pct}% used)")) + else: + avail.append(("class:toolbar.value", "N/A")) + fields.append(avail) + + # Used (Current Session) + used_chunk: list[tuple[str, str]] = [ + ("class:toolbar.label", "Used (Current Session) ") + ] + scu = state.session_credits_used if state.api_key_set else None + used_chunk.append( + ("class:toolbar.value", _format_credits(scu) if scu is not None else "N/A") + ) + fields.append(used_chunk) + + # Concurrency + conc_chunk: list[tuple[str, str]] = [("class:toolbar.label", "Concurrency ")] + if state.api_key_set and state.max_concurrency is not None: + cur = state.current_concurrency if state.current_concurrency is not None else 0 + conc_chunk.append(("class:toolbar.value", f"{cur}/{state.max_concurrency}")) else: - segs.append(("class:toolbar.value", "—")) - - # --- Last command ----------------------------------------------------- - if state.last_command: - segs.append(("class:toolbar", " · ")) - segs.append(("class:toolbar.label", "last ")) - segs.append(("class:toolbar.value", state.last_command)) - segs.append(("class:toolbar", " ")) - if state.last_status == "ok": - segs.append(("class:toolbar.ok", "✓")) - elif state.last_status == "fail": - segs.append(("class:toolbar.fail", "✗")) - if state.last_duration is not None: - segs.append(("class:toolbar", f" {state.last_duration:.1f}s")) - - # --- Session setting chips (with overflow handling) ------------------- + conc_chunk.append(("class:toolbar.value", "N/A")) + 
fields.append(conc_chunk) + + # Next Update countdown (only after first successful refresh) + if state.api_key_set: + nxt = state.seconds_until_next_refresh + if nxt is not None: + fields.append([ + ("class:toolbar.label", "Next Update "), + ("class:toolbar.value", f"{nxt}s"), + ]) + + # (Removed "last cmd" indicator — the typed command and its + # ✓/✗ footer are already visible in the scrollback echo, so a + # toolbar copy doesn't add information and just consumes width.) + + # Session setting chips if state.settings: - # Estimate space already used + reserved for hint - so_far = sum(len(text) for _, text in segs) - hint_len = 24 # roughly "tab · ↑↓ · :help · :q" + spacing - budget = max(0, width - so_far - hint_len - 4) - - chips = list(state.settings.items()) - shown = 0 - for k, v in chips: - chip_text = f" {k}={v} " - if budget < len(chip_text) + 2 and shown > 0: + chip_segs: list[tuple[str, str]] = [] + for k, v in state.settings.items(): + if chip_segs: + chip_segs.append(("class:toolbar", " ")) + chip_segs.append(("class:toolbar.chip", f" {k}={v} ")) + fields.append(chip_segs) + + # Hint chunk — surfaces the active mouse mode and how to switch. + # Replaces the older "tab · ↑↓ · :help · :q" cheat-sheet, since the + # mode toggle is the one keybinding the user might actually need + # to *change* during a session. The other shortcuts are in :help. 
+ if not state.api_key_set: + hint_text = "type `auth` to set API key" + hint_chunk: list[tuple[str, str]] = [("class:toolbar.hint", hint_text)] + else: + mode_label = ( + "Scroll mode" if state.mouse_mode == "scroll" else "Select mode" + ) + hint_chunk = [ + ("class:toolbar.value", mode_label), + ("class:toolbar.hint", " · Tab to switch"), + ] + + LEADING = " " + SEP = " · " + PAGE_SECONDS = 5 # how long each page is displayed before rotating + + def _seg_len(chunk: list[tuple[str, str]]) -> int: + return sum(len(t) for _, t in chunk) + + # The mode hint ("Scroll mode · Tab to switch" / auth nudge) is the + # one piece of toolbar content the user needs to see at all times — + # it advertises the only globally-mutable runtime mode. Pin it on + # every page by reserving its width up-front and pagination only + # packs the *other* fields into the remaining space. + hint_len = _seg_len(hint_chunk) + budget = max(10, width - 2) + # Reserve room for hint + separator on every page. If the hint alone + # is wider than the budget, we'll still try to render it (final + # hard-truncate at the bottom of this function will clip). + field_budget = max(0, budget - hint_len - len(SEP)) + + # Greedy-pack the non-hint fields into pages, each ≤ field_budget. + pages: list[list[list[tuple[str, str]]]] = [] + cur: list[list[tuple[str, str]]] = [] + cur_len = len(LEADING) + for chunk in fields: + chunk_len = _seg_len(chunk) + added = chunk_len + (len(SEP) if cur else 0) + if cur and cur_len + added > field_budget: + pages.append(cur) + cur = [chunk] + cur_len = len(LEADING) + chunk_len + else: + cur.append(chunk) + cur_len += added + if cur: + pages.append(cur) + # Even if there are no non-hint fields (extreme narrow), produce + # one empty page so the hint still renders. + if not pages: + pages = [[]] + + # Rotate pages by wall-clock time. Single-page case is static. 
+ if len(pages) == 1: + page_idx = 0 + else: + page_idx = int(time.monotonic() / PAGE_SECONDS) % len(pages) + page = pages[page_idx] + + # Compose the chosen page. + segs: list[tuple[str, str]] = [("class:toolbar", LEADING)] + for i, chunk in enumerate(page): + if i > 0: + segs.append(("class:toolbar", SEP)) + segs.extend(chunk) + + # Page indicator (e.g. "1/3") trailing — only when rotating. + if len(pages) > 1: + indicator = f" ({page_idx + 1}/{len(pages)})" + cur_total = sum(len(t) for _, t in segs) + if cur_total + len(indicator) <= field_budget: + segs.append(("class:toolbar.hint", indicator)) + + # Hint always rendered on the right edge of every page. + cur_total = sum(len(t) for _, t in segs) + pad = max(2, width - cur_total - hint_len - 2) + segs.append(("class:toolbar", " " * pad)) + segs.extend(hint_chunk) + + # Final safety: hard-truncate so we never emit a line wider than + # the terminal (prevents the soft-wrap ghost-toolbar artifact). + total = sum(len(t) for _, t in segs) + if total > width - 1: + cap = max(0, width - 1) + kept: list[tuple[str, str]] = [] + used_len = 0 + for sty, text in segs: + room = cap - used_len + if room <= 0: break - segs.append(("class:toolbar", " ")) - segs.append(("class:toolbar.chip", chip_text)) - budget -= len(chip_text) + 2 - shown += 1 - remaining = len(chips) - shown - if remaining > 0: - segs.append(("class:toolbar", " ")) - segs.append(("class:toolbar.hint", f"+{remaining} more")) - - # --- Hint (rightmost, but only if there's room) ----------------------- - used = sum(len(text) for _, text in segs) - if width - used > 26: - segs.append(("class:toolbar", " " * max(2, width - used - 24))) - segs.append(("class:toolbar.hint", "tab · ↑↓ · :help · :q")) - + if len(text) <= room: + kept.append((sty, text)) + used_len += len(text) + else: + kept.append((sty, text[: max(0, room - 1)] + "…")) + break + segs = kept return segs return render @@ -452,21 +1108,125 @@ def _escape_menu(event): # 
--------------------------------------------------------------------------- -def _print_banner(version: str) -> None: - line = Text() - line.append(" ScrapingBee ", style=f"bold black on {BEE_YELLOW}") - line.append(" ") - line.append(f"v{version}", style=f"bold {BEE_YELLOW}") - line.append(" ") - line.append("Type ", style=BEE_DIM) - line.append(":help", style=f"bold {BEE_YELLOW}") - line.append(" for commands", style=BEE_DIM) - err_console.print() - err_console.print(line) - err_console.print() +# ScrapingBee wordmark — approximation of the actual brand logo +# (https://www.scrapingbee.com/images/favico.svg): three honeycomb cells +# arranged in an L-shape (top, bottom-left, bottom-right) next to the +# "ScrapingBee" text rendered in the figlet ``smblock`` font. +# All rendered in brand yellow (terminal limits us to single-colour per +# Window; the real SVG has the bottom-left cell highlighted vs the other +# two). ~42 cols × 4 rows. +# "ScrapingBee" rendered in the figlet ``smblock`` font — 4 rows × 32 cols, +# roughly the same width as the "Web scraping from the terminal" tagline. +# Same block-letter style as the old 6-row logo, just compact. +_SCRAPINGBEE_LOGO = [ + " ▞▀▖ ▗ ▛▀▖ ", + " ▚▄ ▞▀▖▙▀▖▝▀▖▛▀▖▄ ▛▀▖▞▀▌▙▄▘▞▀▖▞▀▖", + " ▖ ▌▌ ▖▌ ▞▀▌▙▄▘▐ ▌ ▌▚▄▌▌ ▌▛▀ ▛▀ ", + " ▝▀ ▝▀ ▘ ▝▀▘▌ ▀▘▘ ▘▗▄▘▀▀ ▝▀▘▝▀▘", +] +# Legacy 6-row logos kept around in case we want to swap back later or +# use them elsewhere (e.g. a one-shot welcome screen). The pinned REPL +# banner uses the compact form above. +_BEE_LOGO = [ + " ██████╗ ███████╗███████╗", + " ██╔══██╗██╔════╝██╔════╝", + " ██████╔╝█████╗ █████╗ ", + " ██╔══██╗██╔══╝ ██╔══╝ ", + " ██████╔╝███████╗███████╗", + " ╚═════╝ ╚══════╝╚══════╝", +] + + +def _render_banner(version: str) -> str: + """Render the startup banner to an ANSI-formatted string. 
+ + Rendered into an in-memory StringIO via rich so the whole banner is + assembled before any write to the terminal — avoids interleaving with + other stdout writes (clear-screen, padding newlines) and avoids any + timing-related re-ordering between rich's internal flushing and our + direct sys.stdout.write calls. + """ + from io import StringIO + + from rich.console import Console + + from .theme import SCRAPINGBEE_THEME + + buf = StringIO() + c = Console( + file=buf, + theme=SCRAPINGBEE_THEME, + highlight=False, + force_terminal=True, + width=200, # don't wrap the wide ASCII logo + ) + c.print() + for line in _SCRAPINGBEE_LOGO: + c.print(f"[bold {BEE_YELLOW}]{line}[/]") + for line in _BEE_LOGO: + c.print(f"[bold white]{line}[/]") + c.print() + # Version + c.print(f" [bold {BEE_YELLOW}]v{version}[/]") + # Tagline + c.print(f" [{BEE_DIM}]Web scraping from the terminal[/]") + c.print() + # Hint + hint = Text() + hint.append(" Type ", style=BEE_DIM) + hint.append(":help", style=f"bold {BEE_YELLOW}") + hint.append(" for commands, ", style=BEE_DIM) + hint.append(":q", style=f"bold {BEE_YELLOW}") + hint.append(" to quit", style=BEE_DIM) + c.print(hint) + c.print() + return buf.getvalue() def _print_help(commands: dict[str, str]) -> None: + """Print the REPL command list with a two-column layout. + + Long descriptions wrap with a hanging indent so continuation lines line + up under the description column instead of flowing back to column 0. + Column widths: + 4 (leading) + 20 (cmd col) + 2 (gap) = 26-col indent for + continuation lines. The description column gets the rest of the + terminal width. 
+ """ + import shutil + import textwrap + + cmd_col = 20 + leading = 4 + gap = 2 + indent_width = leading + cmd_col + gap # 26 + indent_str = " " * indent_width + + def _print_row(cmd: str, desc: str) -> None: + try: + term_w = shutil.get_terminal_size((80, 24)).columns + except Exception: + term_w = 80 + desc_w = max(20, term_w - indent_width) + lines = textwrap.wrap(desc, width=desc_w) or [""] + # Build Text objects directly instead of using Rich's markup + # parser — markup strings like ``[dim]...[/]`` go through Rich's + # console renderer which strips leading whitespace and re-wraps + # at its own console width (re-wrapping our pre-wrapped lines + # mid-word, and dropping the hanging indent). Plain Text objects + # plus ``soft_wrap=True`` keep the spans and indent intact. + first = Text() + first.append(" " * leading) + first.append(cmd.ljust(cmd_col), style=f"bold {BEE_YELLOW}") + first.append(" " * gap) + first.append(lines[0], style=BEE_DIM) + err_console.print(first, soft_wrap=True) + for line in lines[1:]: + cont = Text() + cont.append(indent_str) + cont.append(line, style=BEE_DIM) + err_console.print(cont, soft_wrap=True) + err_console.print() groups = { "Pages": ["scrape", "crawl"], @@ -479,12 +1239,12 @@ def _print_help(commands: dict[str, str]) -> None: "Account": ["auth", "logout"], "Tools": ["usage", "schedule", "export", "docs", "unsafe"], } - for group_name, cmds in groups.items(): + for i, (group_name, cmds) in enumerate(groups.items()): + if i > 0: + err_console.print() # blank row between categories for breathing room err_console.print(f" [{BEE_DIM}]{group_name}[/]") for cmd in cmds: - err_console.print( - f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{commands.get(cmd, '')}[/]" - ) + _print_row(cmd, commands.get(cmd, "")) err_console.print() err_console.print(f" [{BEE_DIM}]REPL[/]") for cmd, desc in [ @@ -497,7 +1257,7 @@ def _print_help(commands: dict[str, str]) -> None: (":show", "Show current session defaults"), (":q, :quit", "Quit the REPL"), ]: 
- err_console.print(f" [bold {BEE_YELLOW}]{cmd:<20}[/] [dim]{desc}[/]") + _print_row(cmd, desc) err_console.print() @@ -521,6 +1281,8 @@ def _print_command_footer(status: str, duration: float) -> None: line.append("✓", style=f"bold {_GREEN}") elif status == "fail": line.append("✗", style=f"bold {BEE_RED}") + elif status == "stopped": + line.append("■", style=f"bold {BEE_YELLOW}") line.append(f" {duration:.2f}s", style=BEE_DIM) err_console.print(line) err_console.print() @@ -537,8 +1299,11 @@ def _open_pager(path: str) -> None: Replaces external tools (`less` on Unix, `more` on Windows) with an in-process viewer so the CLI works identically everywhere with no extra install. Arrow keys / page up-down / home / end / mouse wheel scroll; - `q` or `Esc` exits back to the REPL. + `q` or `Esc` exits back to the REPL. Long lines wrap to the terminal + width so you can see all of a wide JSON or HTML response without + horizontal scrolling. Press `p` to toggle pretty-printed JSON. """ + import json from pathlib import Path from prompt_toolkit.application import Application @@ -552,27 +1317,64 @@ def _open_pager(path: str) -> None: from prompt_toolkit.layout.dimension import D from prompt_toolkit.styles import Style - text = Path(path).read_text(encoding="utf-8", errors="replace") - line_count = text.count("\n") + 1 + raw_text = Path(path).read_text(encoding="utf-8", errors="replace") + + # If the cached output is valid JSON, prepare a pretty-printed + # version up-front. We default to pretty mode so the user sees the + # human-readable form first; `r` toggles raw if they need to grep + # the original bytes. When the content isn't JSON, pretty is + # unavailable and we stick with raw. 
+ pretty_text: str | None + try: + pretty_text = json.dumps( + json.loads(raw_text), indent=2, ensure_ascii=False + ) + except Exception: + pretty_text = None + + mode = ["pretty" if pretty_text is not None else "raw"] buffer = Buffer(read_only=Condition(lambda: True)) - buffer.set_document(Document(text=text, cursor_position=0), bypass_readonly=True) + + def _set_text(s: str) -> None: + buffer.set_document(Document(text=s, cursor_position=0), bypass_readonly=True) + + _set_text(pretty_text if mode[0] == "pretty" else raw_text) + + def _current_line_count() -> int: + return buffer.document.line_count text_window = Window( content=BufferControl(buffer=buffer), - wrap_lines=False, + # Wrap long lines so a multi-KB JSON / HTML response is fully + # visible without horizontal scrolling. The previous default + # (wrap_lines=False) clipped at column-N and the rest was just + # gone unless the user used Left/Right scrolling. + wrap_lines=True, ) def _status_line(): cursor_line = buffer.document.cursor_position_row + 1 - pct = int(cursor_line / max(1, line_count) * 100) + total = _current_line_count() + pct = int(cursor_line / max(1, total) * 100) + mode_label = "pretty" if mode[0] == "pretty" else "raw" + # `r` toggles raw on/off. Hidden when there's no pretty version + # available (non-JSON content) — there'd be nothing to toggle to. 
+ toggle_hint = ( + ("r: pretty" if mode[0] == "raw" else "r: raw") + if pretty_text is not None else "" + ) + right_hint = ( + "↑↓ PgUp/PgDn scroll" + (f" · {toggle_hint}" if toggle_hint else "") + + " · q to exit" + ) return [ ("class:pager.bar", " "), - ("class:pager.value", f"{cursor_line}/{line_count}"), - ("class:pager.bar", f" ({pct}%) · "), + ("class:pager.value", f"{cursor_line}/{total}"), + ("class:pager.bar", f" ({pct}%) · {mode_label} · "), ("class:pager.label", path), ("class:pager.bar", " "), - ("class:pager.hint", "↑↓ PgUp/PgDn Home/End scroll · q / Esc to exit"), + ("class:pager.hint", right_hint), ] status_window = Window( @@ -590,6 +1392,19 @@ def _status_line(): def _exit(event): event.app.exit() + @kb.add("r") + def _toggle_raw(_e): + # No-op if the content isn't JSON — pretty isn't available, so + # we're already showing raw and there's nothing to toggle to. + if pretty_text is None: + return + if mode[0] == "pretty": + mode[0] = "raw" + _set_text(raw_text) + else: + mode[0] = "pretty" + _set_text(pretty_text) + @kb.add("up") def _up(_e): buffer.cursor_up() @@ -631,14 +1446,54 @@ def _right(_e): } ) - app = Application( + pager_app = Application( layout=layout, key_bindings=kb, style=style, full_screen=True, mouse_support=True, ) - app.run() + + # We're (almost certainly) called from inside the REPL's prompt_toolkit + # event loop — a sync key-binding handler invoked `:view`. Calling + # ``pager_app.run()`` here would hit ``asyncio.run()`` from inside a + # running loop and raise. Detect that and farm the pager out to a + # worker thread which has no loop of its own, so ``app.run()`` can + # safely create a fresh one. Blocking the main thread on ``join()`` + # freezes the outer app's rendering while the pager has the terminal, + # which is exactly what we want — the pager uses the alternate screen + # buffer (full_screen=True), then yields it back on exit. 
+ try: + import asyncio as _asyncio_check + + _asyncio_check.get_running_loop() + in_loop = True + except RuntimeError: + in_loop = False + + if not in_loop: + pager_app.run() + return + + err_holder: list[BaseException | None] = [None] + + def _run_in_worker() -> None: + try: + pager_app.run() + except BaseException as e: + err_holder[0] = e + + t = threading.Thread(target=_run_in_worker, daemon=False) + t.start() + t.join() + if err_holder[0] is not None: + raise err_holder[0] + # NOTE: the caller (run_repl, after :view) is responsible for + # re-entering the alt buffer and resetting the outer app's renderer + # cache. prompt_toolkit's Application.run cleanup emits + # ``\x1b[?1049l`` on exit, which kicks the outer REPL out of the + # alt buffer too — only the caller has access to ``app`` to invalidate + # it properly, so the cleanup lives there. def _normalize_setting_key(key: str) -> str: @@ -699,6 +1554,7 @@ def _handle_meta( all_known_flags: set[str], bool_flags: set[str], choice_flags: dict[str, list[str]], + scrollback: ScrollbackBuffer | None = None, ) -> str | None: parts = line.strip().split(None, 1) head = parts[0] @@ -711,8 +1567,15 @@ def _handle_meta( _print_help(command_help) return "ok" if head_low in {":clear", "clear"}: - sys.stderr.write("\033[2J\033[H") - sys.stderr.flush() + if scrollback is not None: + # full_screen mode — clear our virtual buffer + with scrollback._lock: + scrollback.lines.clear() + scrollback.scroll_offset = 0 + else: + # Legacy fallback (shouldn't trigger in current REPL) + sys.stderr.write("\033[2J\033[H") + sys.stderr.flush() return "ok" if head_low == ":show": if not state.settings: @@ -734,6 +1597,9 @@ def _handle_meta( return "ok" try: _open_pager(str(cache_path)) + except FileNotFoundError: + # File got deleted between exists() and read() — race with cleanup + err_console.print(f" [{BEE_DIM}]cached output no longer available[/]") except Exception as e: err_console.print(f" [bold {BEE_RED}]pager error:[/] {e}") 
err_console.print( @@ -854,6 +1720,15 @@ def _make_completer( ":q", ":quit", ] + # Precompute the union of every flag known to any command. Used as a + # fallback completion pool when the user's typed command isn't + # recognised (typo, in-progress rename, etc.) — without this the + # completer would silently stop suggesting anything as soon as the + # first word is unknown, which is confusing UX. + _all_known_flags: list[str] = sorted({ + f for flags in command_flags.values() for f in flags + }) + class BeeCompleter(Completer): def get_completions(self, document, complete_event): text = document.text_before_cursor.lstrip() @@ -872,7 +1747,14 @@ def get_completions(self, document, complete_event): return cmd_name = words[0] - flags_for_cmd = command_flags.get(cmd_name, []) + # If cmd_name is unknown, fall back to the union of all flags + # so the user still gets *some* suggestions instead of silence. + # Display "(unknown command)" so they know completions may + # not actually apply to what they typed. 
+ cmd_known = cmd_name in command_flags + flags_for_cmd = ( + command_flags[cmd_name] if cmd_known else _all_known_flags + ) last = words[-1] if words else "" prev = words[-2] if len(words) >= 2 else "" @@ -896,9 +1778,12 @@ def get_completions(self, document, complete_event): yield Completion(v, start_position=-len(last)) return if last.startswith("-"): + meta_label = "" if cmd_known else "(unknown command)" for flag in flags_for_cmd: if flag.startswith(last): - yield Completion(flag, start_position=-len(last)) + yield Completion( + flag, start_position=-len(last), display_meta=meta_label + ) return BeeCompleter() @@ -908,149 +1793,1119 @@ def get_completions(self, document, complete_event): # --------------------------------------------------------------------------- -def _prompt_once(state: SessionState, completer: Any, history_path: str) -> str | None: - app, _buffer = _build_application(state, completer, history_path) - return app.run() - +# --------------------------------------------------------------------------- +# Main loop — persistent Application + patch_stdout +# --------------------------------------------------------------------------- -def _read_input(state: SessionState, completer: Any, history_path: str) -> str | None: - line = _prompt_once(state, completer, history_path) - if line is None: - return None - while line.rstrip().endswith("\\"): - more = _prompt_once(state, completer, history_path) - if more is None: - return line.rstrip().rstrip("\\").rstrip() - line = line.rstrip().rstrip("\\").rstrip() + " " + more - return line +_INTERACTIVE_COMMANDS = {"tutorial", "auth"} -# --------------------------------------------------------------------------- -# Main loop -# --------------------------------------------------------------------------- +def run_repl(cli_group: Any, version: str, *, keep_bg: bool = False) -> None: + """Run the REPL with the Ink-style hybrid pattern. 
-def run_repl(cli_group: Any, version: str) -> None: + Banner is printed to real stdout, lands in scrollback. The input + toolbar + live in a persistent Application(full_screen=False) at the bottom of the + terminal. The whole loop runs inside ``patch_stdout()`` so any print or + click.echo from a command flows through real terminal stdout (real + scrollback, real selection) while the bottom strip is redrawn afterwards. + """ + import shutil from pathlib import Path import click + from prompt_toolkit.application import Application + from prompt_toolkit.application.run_in_terminal import run_in_terminal + from prompt_toolkit.auto_suggest import AutoSuggestFromHistory + from prompt_toolkit.buffer import Buffer + from prompt_toolkit.document import Document + from prompt_toolkit.filters import Condition, has_completions + from prompt_toolkit.history import FileHistory + from prompt_toolkit.key_binding import KeyBindings + from prompt_toolkit.layout import Layout + from prompt_toolkit.layout.containers import ConditionalContainer, HSplit, Window + from prompt_toolkit.layout.controls import BufferControl, FormattedTextControl + from prompt_toolkit.layout.dimension import D + from prompt_toolkit.styles import Style from .theme import set_repl_mode set_repl_mode(True) + # ── Click tree introspection ──────────────────────────────────────────── command_help, command_flags, bool_flags, choice_flags = _walk_click_tree(cli_group) command_names = sorted(command_flags.keys()) - - _print_banner(version) + all_known_flags: set[str] = set() + for flags_list in command_flags.values(): + all_known_flags.update(flags_list) state = SessionState() state.refresh_credits_from_cache() + from .config import get_api_key_if_set + + state.api_key_set = bool(get_api_key_if_set(None)) + history_path = str(Path.home() / ".config" / "scrapingbee-cli" / ".history") Path(history_path).parent.mkdir(parents=True, exist_ok=True) + try: + history = FileHistory(history_path) + except Exception: + 
history = None # type: ignore[assignment] completer = _make_completer( command_names, command_flags, bool_flags, choice_flags, command_help ) - # Flat set of every known flag across all commands — used by `:set` to - # validate keys and surface "did you mean?" suggestions for typos. - all_known_flags: set[str] = set() - for flags_list in command_flags.values(): - all_known_flags.update(flags_list) + # Set the terminal background to pure black AND the default foreground to + # light grey for the REPL session. We need both — otherwise, any text the + # terminal renders with its theme-default foreground (e.g. a number or an + # unstyled token in the lexer) keeps the user's theme's fg colour, which + # may be near-black on a light theme → invisible on our forced-black bg. + # OSC 11 sets bg, OSC 10 sets fg. BEL terminator (`\x07`) is the most + # compatible across Mac Terminal, Warp, iTerm2, kitty, alacritty, + # gnome-terminal, Windows Terminal. Opt out with `scrapingbee --keep-bg`. + _set_black_bg = not keep_bg + if _set_black_bg: + sys.stdout.write("\033]11;#000000\007") + sys.stdout.write("\033]10;#EAEAEA\007") + sys.stdout.flush() + + # Create the virtual scrollback buffer and seed it with the banner. + # In full_screen mode we own the alt buffer entirely. The banner is + # rendered as a FIXED Window at the top of the layout (not pushed into + # scrollback), so it stays anchored while command output flows in the + # scrollback area below it. Trade-off: banner consumes its natural + # height of terminal rows every frame, but the user keeps the brand + # surface visible (their explicit ask: "when scraping banner should + # not disappear"). + scrollback = ScrollbackBuffer() + rows = shutil.get_terminal_size((80, 24)).lines # kept for API-key prompt sizing + + # ── First-run API key prompt ──────────────────────────────────────────── + # Inline masked input — banner already announced us, no need to repeat. 
+ # Validates against the live `/usage` endpoint, saves to ~/.config/ + # scrapingbee-cli/.env, and updates os.environ so the rest of this + # process sees the new key (the .env file alone wouldn't help — most + # call sites read os.environ directly). + # + # Required, not skippable: almost every command in the CLI hits the + # ScrapingBee API, so launching the REPL without a key would just give + # the user an unusable shell. We loop until a valid key is entered or + # they Ctrl+C / Ctrl+D out (which exits the whole program, since with + # no key the REPL is dead weight). + if not state.api_key_set: + from .commands.auth import _masked_getpass, _validate_api_key + from .config import ENV_API_KEY, save_api_key_to_dotenv + + err_console.print( + f" [{BEE_DIM}]Enter your API key to get started — find it at " + f"[bold {BEE_YELLOW}]dashboard.scrapingbee.com/dashboard[/][{BEE_DIM}].[/]" + ) + err_console.print() + while not state.api_key_set: + try: + raw = _masked_getpass(" API key: ") + except (EOFError, KeyboardInterrupt): + err_console.print() + err_console.print( + f" [{BEE_DIM}]Exiting — an API key is required to use the CLI.[/]" + ) + if _set_black_bg: + try: + sys.stdout.write("\033]111\007") + sys.stdout.write("\033]110\007") + sys.stdout.flush() + except Exception: + pass + set_repl_mode(False) + return + raw = raw or "" + key = raw.strip() + # Pasted keys from password managers / clipboards often pick up + # a leading or trailing space. Strip silently but warn so the + # user knows we did — otherwise a key that "looks right" but + # fails to authenticate is bewildering. 
+ if key and key != raw: + err_console.print( + f" [{BEE_DIM}]Note: stripped surrounding whitespace from your key.[/]" + ) + if not key: + err_console.print( + f" [bold {BEE_RED}]Empty key.[/] [{BEE_DIM}]Please paste your API key.[/]" + ) + continue + err_console.print(f" [{BEE_DIM}]Validating…[/]") + valid, err_msg = _validate_api_key(key) + if valid: + try: + save_api_key_to_dotenv(key) + except Exception as e: + err_console.print( + f" [bold {BEE_RED}]Could not save:[/] [{BEE_DIM}]{e}[/]" + ) + os.environ[ENV_API_KEY] = key + state.api_key_set = True + err_console.print(f" [bold {BEE_YELLOW}]✓[/] API key saved.") + else: + err_console.print( + f" [bold {BEE_RED}]Invalid:[/] [{BEE_DIM}]{err_msg or 'unknown error'}. Try again.[/]" + ) + # No clear / re-write needed — the API-key prompt happened in + # the real terminal, then app.run() will switch to the alt + # buffer and we get a clean screen automatically. The banner is + # already loaded in our scrollback buffer from earlier. + + # ── Input buffer ──────────────────────────────────────────────────────── + # Locked while a worker thread is running a command so the user can't + # submit another command on top of the first one (their outputs would + # interleave through patched stdout). + is_input_locked = [False] + # Reference to the currently-running worker thread (or None). Used by the + # Ctrl+C handler to inject KeyboardInterrupt into the worker so the user + # can stop a long scrape without exiting the REPL. 
+ current_worker: list[threading.Thread | None] = [None] + + input_buffer = Buffer( + history=history, + completer=completer, + complete_while_typing=False, + auto_suggest=AutoSuggestFromHistory(), + multiline=False, + read_only=Condition(lambda: is_input_locked[0]), + ) + + def _line_prefix(line_no, _wrap_count): + if line_no == 0: + return [("class:promptmark", "❯ ")] + return [("", " ")] + + # While a command is in flight we collapse the input window's height to + # 0 — instead of hiding it via ConditionalContainer. Hiding via Conditional + # makes the focused window invisible, but prompt_toolkit still places the + # terminal cursor *somewhere*, and Mac Terminal renders that cursor as a + # visible `[` block on the first visible row. With the input still in the + # layout but 0-rows tall, the cursor is "on" the input but in an invisible + # row → no stray indicator anywhere. + def _input_height(): + if state.is_running: + return D.exact(0) + return D(min=1, max=8) + + input_window = Window( + content=BufferControl(buffer=input_buffer, lexer=_make_lexer()), + get_line_prefix=_line_prefix, + wrap_lines=True, + height=_input_height, + dont_extend_height=True, + always_hide_cursor=Condition(lambda: state.is_running), + ) + + toolbar_window = Window( + content=FormattedTextControl(_make_toolbar(state)), + height=D.exact(1), + wrap_lines=False, # pin explicitly so toolbar can never grow to 2 rows + ) + + # Live "running command" line that appears above the input only while a + # command is in flight. Renders the typed line with a sweeping white-glim + # shimmer so the user has clear visual feedback that something is happening. 
+ def _running_text() -> list[tuple[str, str]]: + if not state.is_running or not state.running_command_text: + return [] + text = f"❯ {state.running_command_text}" + pos = state.tick % max(1, len(text)) + return _shimmer_pt(text, pos, BEE_YELLOW) + + running_window = ConditionalContainer( + content=Window( + content=FormattedTextControl(_running_text), + height=D.exact(1), + ), + filter=Condition(lambda: state.is_running), + ) - while True: + # ── Scrollback Window — virtual buffer rendered as the top section ───── + # This Window fills the vertical space above the running line / input / + # toolbar. It renders whatever ScrollbackBuffer says is visible based + # on the current scroll offset. The user scrolls it with PgUp/PgDn etc. + def _scrollback_render() -> list[tuple[str, str]]: + height = 20 + width = 80 try: - line = _read_input(state, completer, history_path) - except (KeyboardInterrupt, EOFError): - err_console.print() - break + from prompt_toolkit.application import get_app as _get_app + + _app = _get_app() + if getattr(_app, "is_running", False): + size = _app.output.get_size() + # Reserve rows for the full banner + everything below the + # scrollback in the layout: banner_visual + spacer_top(1) + # + separator(1) + running_or_input(1) + spacer_bottom(1) + # + toolbar(1) = banner_visual + 5. + reserved = _banner_visual_height + 5 + height = max(1, size.rows - reserved) + width = max(1, size.columns) + except Exception: + pass + # Use visual-row pagination so scrolling moves exactly one terminal + # row per step, even through long single-line content that would + # otherwise wrap into many visual rows. We split at width-1 so a + # full-width row never accidentally pushes the cursor onto the + # next terminal row (which some terminals do at col == width). 
+ visual_rows = scrollback.get_visible_visual(height, max(1, width - 1)) + out: list[tuple[str, str]] = [] + for i, row in enumerate(visual_rows): + if i > 0: + out.append(("", "\n")) + out.extend(row) + return out - if line is None: - err_console.print() - break + # FormattedTextControl subclass that routes mouse wheel / trackpad + # scroll events to our virtual buffer. prompt_toolkit's default mouse + # mode (1000) captures button events but NOT motion, so the terminal + # still handles drag-select natively (or with a modifier — Option on + # Mac, Shift on most Linux terminals — depending on the terminal). + from prompt_toolkit.mouse_events import MouseEventType + from prompt_toolkit.layout.controls import FormattedTextControl as _PTFTC + + class _ScrollbackControl(_PTFTC): + def mouse_handler(self, mouse_event): + et = mouse_event.event_type + # 1 line per wheel/trackpad event keeps motion smooth — trackpads + # send a flurry of small events per gesture, so a tight step + # tracks the user's finger movement closely. Larger steps (3+) + # feel jumpy / snap-y. + if et == MouseEventType.SCROLL_UP: + scrollback.scroll_up(1) + try: + app.invalidate() + except Exception: + pass + return None + if et == MouseEventType.SCROLL_DOWN: + scrollback.scroll_down(1) + try: + app.invalidate() + except Exception: + pass + return None + return NotImplemented + + scrollback_window = Window( + content=_ScrollbackControl(_scrollback_render), + # We pre-wrap content ourselves (see _split_fragments_to_width) so + # each line passed to prompt_toolkit is already ≤ terminal width. + # Disable prompt_toolkit's own line-wrapping so it doesn't try to + # second-guess us — we want exact control of which visual rows + # appear for accurate scroll-by-row behaviour. 
+ wrap_lines=False, + always_hide_cursor=True, + ) - line = line.strip() - if not line: - continue + # ── Pinned banner Window (smaller logo, original stacked structure) ─── + # Restores the original banner layout — ASCII logo, then version, + # tagline, blank, hint — but uses only the SCRAPING logo (6 rows) + # instead of stacking SCRAPING + BEE (which was 12 rows). Half the + # vertical footprint, same look. + _banner_visual_height = len(_SCRAPINGBEE_LOGO) + 5 # logo + 5 text rows + + def _banner_render() -> list[tuple[str, str]]: + out: list[tuple[str, str]] = [] + # SCRAPING logo in brand yellow. + for i, logo_line in enumerate(_SCRAPINGBEE_LOGO): + if i > 0: + out.append(("", "\n")) + out.append((f"bold {BEE_YELLOW}", logo_line)) + # Spacer row + out.append(("", "\n")) + # v1.4.1 + out.append(("", "\n")) + out.append((f"bold {BEE_YELLOW}", f" v{version}")) + # Tagline + out.append(("", "\n")) + out.append((f"{BEE_DIM}", " Web scraping from the terminal")) + out.append(("", "\n")) + # Hint + out.append((f"{BEE_DIM}", " Type ")) + out.append((f"bold {BEE_YELLOW}", ":help")) + out.append((f"{BEE_DIM}", " for commands, ")) + out.append((f"bold {BEE_YELLOW}", ":q")) + out.append((f"{BEE_DIM}", " to quit")) + return out + + def _banner_height() -> "D": + return D.exact(_banner_visual_height) + + banner_window = Window( + content=FormattedTextControl(_banner_render), + height=_banner_height, + wrap_lines=False, + always_hide_cursor=True, + ) + + # Breathing room around the prompt area (Claude-CLI-style). 
+ # - blank row above the separator → visual gap from output + # - dim horizontal rule → clear boundary between "history" and "input" + # - blank row below the toolbar → keeps the toolbar from sitting right + # on the bottom edge of the terminal + def _hr_render() -> list[tuple[str, str]]: + try: + from prompt_toolkit.application import get_app as _get_app + + _app = _get_app() + if getattr(_app, "is_running", False): + cols = _app.output.get_size().columns + else: + cols = 80 + except Exception: + cols = 80 + return [("class:toolbar.hint", "─" * max(1, cols))] + + spacer_top = Window(height=D.exact(1), char=" ") + separator = Window( + content=FormattedTextControl(_hr_render), + height=D.exact(1), + always_hide_cursor=True, + ) + spacer_bottom = Window(height=D.exact(1), char=" ") + + # FloatContainer wraps the main layout so we can hover a completion + # popup near the cursor. Without the Float + CompletionsMenu prompt- + # toolkit's `start_completion()` enters completion *state* but nothing + # visible changes — the user thought Tab did nothing and pressed + # again, hitting `complete_next` which cycled invisibly. With the + # menu in place, the first Tab opens the popup; Up/Down navigate + # entries; Enter / Tab inserts; Esc dismisses. + from prompt_toolkit.layout.containers import Float, FloatContainer + from prompt_toolkit.layout.menus import CompletionsMenu + + main_split = HSplit( + [ + banner_window, + scrollback_window, + spacer_top, + separator, + running_window, + input_window, + spacer_bottom, + toolbar_window, + ] + ) + layout = Layout( + FloatContainer( + content=main_split, + floats=[ + Float( + xcursor=True, + ycursor=True, + content=CompletionsMenu(max_height=10, scroll_offset=1), + ), + ], + ) + ) + + # ── Command echo ──────────────────────────────────────────────────────── + def _echo_to_scrollback(line: str) -> None: + """Echo the submitted command into scrollback (dim grey). 
+ + Both chevron and line use the explicit ``#888888`` colour rather + than mixing in Rich's ``dim`` attribute on top — on our dark + background the compound was rendering nearly black, making the + echo invisible. A single mid-grey shade is subdued enough to feel + like "history" without disappearing. - # The prompt area is erased on submit (erase_when_done=True), so echo - # what the user typed into scrollback. Single line, ❯ + dim text — - # cleaner and more resize-safe than the old `─── cmd ──` divider. + Only the *live* input prompt at the bottom uses the bright yellow + chevron, so the eye can find "where I'm typing now" without it + competing with past commands above. + """ echo = Text() - echo.append("❯ ", style=f"bold {BEE_YELLOW}") - echo.append(line, style="dim") + echo.append("❯ ", style=BEE_DIM) + echo.append(line, style=BEE_DIM) err_console.print(echo) - # Slash / bare meta-commands + # ── Command execution (synchronous, output flows via patched stdout) ──── + def _execute(line: str) -> bool: + """Run a single REPL submission: meta-command or click command. + + Returns ``True`` if the submission was consumed (whether it + succeeded, failed at runtime, or was an unknown command) — in + every such case the user has gotten feedback and the input buffer + should be cleared. Returns ``False`` only when the submission + couldn't even be parsed (shlex error); the caller leaves the + buffer untouched so the user can correct and retry without + re-typing. + """ + line = line.strip() + if not line: + return True + + # Meta-commands (`:set`, `:help`, `:show`, ...) and unknown / parse + # errors echo the command immediately — there's no shimmer pass for + # those. Click commands defer the echo until after completion, so + # the live shimmering line above the input is the only on-screen + # representation while the command runs. + # `:q` is handled at the key-binding layer so we don't get here for it. 
meta = _handle_meta( - line, state, command_help, all_known_flags, bool_flags, choice_flags + line, state, command_help, all_known_flags, bool_flags, choice_flags, + scrollback=scrollback, ) - if meta == "quit": - break if meta == "ok": - continue - - # Tolerate users typing `scrapingbee ...` out of muscle memory + # If we just ran :view, the nested pager Application emitted + # ``\x1b[?1049l`` on its exit, kicking us out of the alt screen + # buffer. Re-enter it and reset the outer renderer so the next + # paint goes into the fresh alt buffer instead of leaking into + # main-screen scrollback. + if line.strip().lower().startswith(":view"): + try: + sys.__stdout__.write("\x1b[?1049h") + sys.__stdout__.flush() + except Exception: + pass + try: + app.renderer.reset() + except Exception: + pass + try: + app.invalidate() + except Exception: + pass + # Meta commands echo themselves with rich-styled output already; + # only echo the user's typed line for the record. + _echo_to_scrollback(line) + return True + if meta == "quit": # belt-and-braces; key binding usually catches it + return True + + # Tolerate users typing `scrapingbee ...` out of muscle memory. if line.lower().startswith("scrapingbee "): line = line[len("scrapingbee "):].strip() + original_line = line # what to echo after completion + try: args = shlex.split(line) except ValueError as e: + # Parse error — DO NOT consume the buffer. The user almost + # certainly has an unclosed quote; let them fix it in-place. 
err_console.print(f" [bold {BEE_RED}]parse error:[/] {e}") - continue + return False if not args: - continue + return True cmd_name = args[0] if cmd_name not in command_flags: + _echo_to_scrollback(original_line) suggestion = _suggest(cmd_name, command_names) if suggestion: err_console.print( f" [bold {BEE_RED}]unknown:[/] {cmd_name} " - f"[{BEE_DIM}]did you mean[/] [bold {BEE_YELLOW}]{suggestion}[/][{BEE_DIM}]?[/]" + f"[{BEE_DIM}]did you mean[/] " + f"[bold {BEE_YELLOW}]{suggestion}[/][{BEE_DIM}]?[/]" ) else: err_console.print(f" [bold {BEE_RED}]unknown:[/] {cmd_name}") - continue + return True args = state.apply_settings_to_args(args) + + # Mark the scrollback position where this command's output will + # start. We DO NOT echo here — while the command runs, only the + # shimmering running line is the live indicator. After the + # command finishes, _finish inserts the dim echo at this index + # so the rendered order becomes: + # ❯ scrape https://… (echo, inserted post-completion) + # (was streamed in during execution) + # ✓ 0.45s (footer, appended in _finish) + # i.e. echo + output + footer atomically appear together at the + # moment of completion, without doubling up the live shimmer. 
+ output_start_index = scrollback.current_length() + start = time.monotonic() - status = "ok" + status_ref = ["ok"] + state.is_running = True + state.running_command = cmd_name + state.running_command_text = original_line # used by shimmer above input + state.run_start = start + + def _run() -> None: + try: + cli_group.main(args, standalone_mode=False) + except click.UsageError as e: + msg = str(e) + err_console.print(f" [bold {BEE_RED}]usage:[/] {msg}") + if "no such option" in msg.lower(): + m = re.search(r"--?[A-Za-z0-9-]+", msg) + if m: + bad = m.group(0) + suggestion = _suggest(bad, command_flags.get(cmd_name, [])) + if suggestion: + err_console.print( + f" [{BEE_DIM}]did you mean[/] " + f"[bold {BEE_YELLOW}]{suggestion}[/][{BEE_DIM}]?[/]" + ) + status_ref[0] = "fail" + except click.ClickException as e: + e.show() + status_ref[0] = "fail" + except KeyboardInterrupt: + # Ctrl+C while running — the keybinding injected this into us + # via PyThreadState_SetAsyncExc. Surface it as a deliberate + # stop in the footer rather than a generic failure. + err_console.print(f" [{BEE_DIM}]stopped[/]") + status_ref[0] = "stopped" + except SystemExit as e: + code = e.code if e.code is not None else 0 + if code not in (0, None): + status_ref[0] = "fail" + except Exception as e: + err_console.print(f" [bold {BEE_RED}]error:[/] {e}") + status_ref[0] = "fail" + + def _finish() -> None: + duration = time.monotonic() - start + # Stop the shimmer first so the echo + footer commit cleanly to + # scrollback without competing with the live above-input line. + state.is_running = False + state.running_command = None + state.running_command_text = None + state.run_start = None + # Splice the dim echo line in *front of* the output rows that + # streamed into scrollback during execution. We marked the + # position at the start of _execute (output_start_index); any + # rows past that index belong to this command. Inserting at + # that index puts the echo right above its output. 
+ try: + from prompt_toolkit.formatted_text import ( + ANSI as _ANSI, + to_formatted_text as _tft, + ) + from io import StringIO as _SIO + from rich.console import Console as _RC + + _buf = _SIO() + _c = _RC( + file=_buf, force_terminal=True, color_system="truecolor", + highlight=False, width=200, + ) + _echo_t = Text() + _echo_t.append("❯ ", style=BEE_DIM) + _echo_t.append(original_line, style=BEE_DIM) + _c.print(_echo_t, end="") + _echo_fragments = list(_tft(_ANSI(_buf.getvalue()))) + scrollback.insert_line(output_start_index, _echo_fragments) + except Exception: + # Defensive fallback: if anything goes wrong with the rich + # render, drop the echo rather than crash the REPL. + pass + _print_command_footer(status_ref[0], duration) + state.last_command = cmd_name + state.last_status = status_ref[0] + state.last_duration = duration + state.refresh_credits_from_cache() + is_input_locked[0] = False + # State mutations triggered by auth/logout need to be visible to + # the asyncio loop's _usage_refresher and the toolbar render — + # both run on the main loop thread while we're in the worker + # thread. Bouncing the writes through call_soon_threadsafe + # guarantees a happens-before edge with the loop's next tick. + # + # We deliberately keep ``used_credits_at_start`` across logout — + # if the user re-authenticates with the *same* key, the next + # refresh detects an unchanged ``api_key_hash`` and continues the + # session counter. A *different* key triggers a reset there. 
+ def _apply_post_cmd_state() -> None: + if cmd_name == "auth": + if get_api_key_if_set(None): + state.api_key_set = True + elif cmd_name == "logout": + state.api_key_set = False + state.credits = None + state.credits_total = None + state.used_credits = None + state.max_concurrency = None + state.current_concurrency = None + state.last_usage_refresh_mono = None + try: + app.invalidate() + except Exception: + pass + + try: + loop = getattr(app, "loop", None) + if loop is not None: + loop.call_soon_threadsafe(_apply_post_cmd_state) + else: + _apply_post_cmd_state() + except Exception: + _apply_post_cmd_state() + + # `usage` and `auth` are the two commands whose completion implies + # the live toolbar values are stale — trigger an immediate refresh + # rather than waiting for the next 30s tick. + if cmd_name in ("usage", "auth"): + _signal_refresh_from_thread() + try: + app.invalidate() + except Exception: + pass + + if cmd_name in _INTERACTIVE_COMMANDS: + # tutorial / auth use click.prompt() and need raw terminal access. + # Suspend the persistent prompt-toolkit app, run the command in + # the bare terminal, then resume. Synchronous — we wait for it. + is_input_locked[0] = True + try: + run_in_terminal(_run, in_executor=False) + finally: + _finish() + return True + + # Network commands run in a worker thread so they don't fight + # prompt_toolkit's asyncio loop. (scrape, google, etc. each call + # `asyncio.run(...)` internally — and asyncio.run refuses to start + # when a loop is already running, which is the case while + # prompt_toolkit's Application is alive.) Locking the input + # prevents the user from submitting a second command on top. + is_input_locked[0] = True + try: + app.invalidate() + except Exception: + pass + + def _worker() -> None: + try: + _run() + finally: + # Always clear the worker reference first — the Ctrl+C handler + # uses it to decide between "cancel command" and "exit REPL". 
+ # Stale references would make a quick second Ctrl+C target + # a thread that's already finished. + current_worker[0] = None + # Cleanup MUST always run, even if _finish itself raises — a + # broken finish would leave is_running=True and is_input_locked=True + # forever, making the REPL unusable until restart. + try: + _finish() + except Exception: + state.is_running = False + state.running_command = None + state.running_command_text = None + state.run_start = None + is_input_locked[0] = False + try: + app.invalidate() + except Exception: + pass + + worker_thread = threading.Thread(target=_worker, daemon=True) + current_worker[0] = worker_thread + worker_thread.start() + return True + + # ── Key bindings ──────────────────────────────────────────────────────── + _QUIT_TOKENS = {":q", ":quit", "exit", "quit", "q"} + + kb = KeyBindings() + + @kb.add("enter", filter=has_completions) + def _accept(event): + event.current_buffer.complete_state = None + + @kb.add("enter", filter=~has_completions) + def _submit(event): + text = input_buffer.text + stripped = text.strip() + if not stripped: + input_buffer.set_document(Document(""), bypass_readonly=True) + return + if stripped.lower() in _QUIT_TOKENS: + input_buffer.set_document(Document(""), bypass_readonly=True) + event.app.exit() + return + # Persist the submitted line into the FileHistory before we kick off + # execution. We do this manually (rather than letting prompt_toolkit + # do it via Buffer.validate_and_handle) because our custom Enter + # binding bypasses that path. The default up/down arrow bindings on + # Buffer pull from this same history, so commands the user runs + # become navigable on the next prompt. + if history is not None: + try: + history.store_string(stripped) + except Exception: + pass + # Clear the buffer only after a successful parse — _execute returns + # False for shlex errors so the user can fix their unclosed quote + # in-place instead of having to retype the whole line. 
+ # ``bypass_readonly=True`` is mandatory: _execute synchronously sets + # ``is_input_locked[0] = True`` before spawning the worker, which + # makes the Buffer read-only — a plain bypass_readonly=False + # set_document would be silently rejected, leaving the typed line + # stranded in the prompt after the command finishes. + if _execute(stripped): + input_buffer.set_document(Document(""), bypass_readonly=True) + + @kb.add("c-c") + def _ctrl_c(event): + # If a worker thread is running, Ctrl+C stops that command rather + # than exiting the REPL. Uses PyThreadState_SetAsyncExc to inject + # KeyboardInterrupt into the worker — the inner _run catches it and + # surfaces a "stopped" footer. This is the documented mechanism for + # interrupting a misbehaving thread; for in-flight HTTP the + # exception fires when the request returns, which is acceptable. + worker = current_worker[0] + if state.is_running and worker is not None and worker.is_alive(): + import ctypes + + tid = worker.ident + if tid is None: + event.app.exit() + return + try: + res = ctypes.pythonapi.PyThreadState_SetAsyncExc( + ctypes.c_ulong(tid), ctypes.py_object(KeyboardInterrupt) + ) + # If we managed to flip exception state in more than one + # thread, the docs say to undo it — otherwise we leave a + # dangling pending exception on an unrelated thread. + if res > 1: + ctypes.pythonapi.PyThreadState_SetAsyncExc( + ctypes.c_ulong(tid), None + ) + except Exception: + # ctypes path failed (PyPy? embedded?) — fall back to + # exiting; daemon worker dies with the process. + event.app.exit() + return + event.app.exit() + + @kb.add("c-d") + def _ctrl_d(event): + # Ctrl+D on empty input is "logout from shell" → exit. While a + # command is running, ignore it to avoid yanking the REPL out from + # under the user mid-scrape; they have :q or a second Ctrl+C. 
+ if state.is_running: + return + if not input_buffer.text: + event.app.exit() + + @kb.add("tab", filter=~has_completions) + def _tab_open(event): + # Tab on an EMPTY input → toggle Scroll/Select mode (no need for + # completions when there's nothing to complete). Tab while typing + # opens completions as before. + if not input_buffer.text: + _toggle_mouse_mode(event) + return + event.current_buffer.start_completion(select_first=False) + + @kb.add("tab", filter=has_completions) + def _tab_next(event): + event.current_buffer.complete_next() + @kb.add("s-tab", filter=has_completions) + def _shift_tab(event): + event.current_buffer.complete_previous() + + @kb.add("escape", filter=has_completions, eager=True) + def _esc(event): + event.current_buffer.cancel_completion() + + # ── History navigation ───────────────────────────────────────────────── + # Plain Up/Down navigate the FileHistory at ~/.config/scrapingbee-cli/ + # .history. When the completion menu is open these keys instead + # navigate the menu (prompt_toolkit's default behaviour); the + # ``~has_completions`` filter ensures we don't compete. + @kb.add("up", filter=~has_completions) + def _history_back(event): + event.current_buffer.history_backward() + + @kb.add("down", filter=~has_completions) + def _history_forward(event): + event.current_buffer.history_forward() + + # ── Scrollback navigation ────────────────────────────────────────────── + # Keyboard-only scrolling of the virtual buffer. We don't enable mouse + # capture (so native drag-select stays usable), so these keys are the + # primary way to scroll history. Familiar to vim/less/htop users. + # + # ``eager=True`` is critical here: prompt_toolkit's Buffer has its own + # default bindings for PgUp/PgDn (history navigation in some modes) and + # the completion menu also consumes PgUp/PgDn when open. Eager bindings + # fire BEFORE buffer-level handlers, so our scrollback scroll wins + # whenever no completion popup is showing. 
+ @kb.add("pageup", eager=True, filter=~has_completions) + def _sb_pageup(_e): + scrollback.scroll_up(10) try: - cli_group.main(args, standalone_mode=False) - except click.UsageError as e: - msg = str(e) - err_console.print(f" [bold {BEE_RED}]usage:[/] {msg}") - if "no such option" in msg.lower(): - m = re.search(r"--?[A-Za-z0-9-]+", msg) - if m: - bad = m.group(0) - suggestion = _suggest(bad, command_flags.get(cmd_name, [])) - if suggestion: - err_console.print( - f" [{BEE_DIM}]did you mean[/] " - f"[bold {BEE_YELLOW}]{suggestion}[/][{BEE_DIM}]?[/]" - ) - status = "fail" - except click.ClickException as e: - e.show() - status = "fail" - except SystemExit as e: - code = e.code if e.code is not None else 0 - if code not in (0, None): - status = "fail" - except Exception as e: - err_console.print(f" [bold {BEE_RED}]error:[/] {e}") - status = "fail" + app.invalidate() + except Exception: + pass - duration = time.monotonic() - start - _print_command_footer(status, duration) + @kb.add("pagedown", eager=True, filter=~has_completions) + def _sb_pagedown(_e): + scrollback.scroll_down(10) + try: + app.invalidate() + except Exception: + pass + + @kb.add("c-up", eager=True) + def _sb_lineup(_e): + scrollback.scroll_up(1) + try: + app.invalidate() + except Exception: + pass + + @kb.add("c-down", eager=True) + def _sb_linedown(_e): + scrollback.scroll_down(1) + try: + app.invalidate() + except Exception: + pass + + @kb.add("c-home", eager=True) + def _sb_top(_e): + scrollback.scroll_to_top() + try: + app.invalidate() + except Exception: + pass + + @kb.add("c-end", eager=True) + def _sb_bottom(_e): + scrollback.scroll_to_bottom() + try: + app.invalidate() + except Exception: + pass + + # ── Mouse mode toggle (Alt+S = Esc S in terminal protocol) ───────────── + # Flips between "scroll mode" (mouse_support on — wheel scrolls our + # virtual buffer, drag-select needs per-terminal modifier like + # Option/Shift) and "select mode" (mouse_support off — drag-select + # works without 
any modifier on every terminal, wheel scrolling falls + # back to PgUp/PgDn/Ctrl-arrows). Toolbar shows the active mode. + @kb.add("escape", "s", eager=True) + def _toggle_mouse_mode(_event): + if state.mouse_mode == "scroll": + state.mouse_mode = "select" + try: + app.output.disable_mouse_support() + app.output.flush() + except Exception: + pass + else: + state.mouse_mode = "scroll" + try: + app.output.enable_mouse_support() + app.output.flush() + except Exception: + pass + try: + app.invalidate() + except Exception: + pass + + # ── Application (full_screen=True: own the alt buffer cleanly) ───────── + # Owning the alternate screen buffer eliminates the wrap-fragment / + # orphan-toolbar artifacts we got with full_screen=False (where the + # terminal could reflow content under us on resize). + # + # Mouse support is enabled so trackpad / wheel scroll events reach our + # scrollback handler. prompt_toolkit uses mode 1000 — button events + # only, NO motion tracking — so the terminal still owns drag-selection + # (Mac Terminal / iTerm / kitty all keep native select with mode 1000; + # on a few terminals users may need to hold Option/Shift while + # dragging to bypass mouse capture). + app = Application( + layout=layout, + key_bindings=kb, + style=Style.from_dict(_style_dict_for(keep_bg)), + full_screen=True, + mouse_support=True, + ) + + # ── Periodic invalidate while a command is in flight ─────────────────── + # The shimmer on the running command line + the elapsed-time counter + # need a tick ~10× per second to feel live. Without this, the live area + # would only redraw on stdout writes (sparse for long-running scrapes). + # When idle, 1Hz is enough — the "Next Update Xs" countdown only changes + # once per second, and the paged toolbar carousel rotates on 5-second + # boundaries. + async def _ticker(): + import asyncio + + idle_counter = 0 + # Track terminal width and trigger a fresh invalidate on resize. 
+ # No manual resize-detection needed any more — in full_screen + # mode prompt_toolkit owns the entire screen, so SIGWINCH is + # handled cleanly by the framework: the next render uses the + # new size and the alt buffer has no scrollback-vs-logical-row + # mismatch to worry about. + + while True: + await asyncio.sleep(0.1) + if state.is_running: + state.tick += 1 + try: + app.invalidate() + except Exception: + pass + idle_counter = 0 + else: + idle_counter += 1 + if idle_counter >= 10: # 1Hz idle redraw + idle_counter = 0 + try: + app.invalidate() + except Exception: + pass + + # ── Background usage refresher ────────────────────────────────────────── + # Polls the usage API on a 30s interval so the toolbar's "available", + # "used (session)" and "conc" values stay roughly current. The user can + # force an immediate refresh by signalling _refresh_event (used after the + # `usage` and `auth` commands complete — see _execute). The first call + # is fire-and-forget right after the task starts, so the toolbar + # populates within a beat of REPL startup rather than after a 30s wait. + import asyncio as _asyncio # local alias avoids shadowing module-level usage + + _refresh_event = _asyncio.Event() + + async def _do_usage_refresh() -> None: + import hashlib as _hashlib + import json as _json + + from .batch import write_usage_file_cache + from .client import Client, parse_usage + from .config import BASE_URL, get_api_key + + try: + key = get_api_key(None) + except ValueError: + return # No key set yet — quietly skip; toolbar stays N/A. + # Short non-reversible hash of the key — used to detect logout/relogin + # with the *same* key vs a different one, so the session counter + # continues for the former and resets for the latter. 
+ key_hash = _hashlib.sha256(key.encode("utf-8")).hexdigest()[:16] + try: + async with Client(key, BASE_URL) as client: + data, _hdrs, status_code = await client.usage(retries=1, backoff=1.0) + if status_code != 200: + return + try: + raw = _json.loads(data) + except Exception: + return + state.update_from_usage_response(raw, key_hash=key_hash) + try: + write_usage_file_cache(key, parse_usage(data)) + except Exception: + pass + try: + app.invalidate() + except Exception: + pass + except Exception: + # Network errors must not kill the refresher — just skip this + # tick and try again on the next interval. + return - state.last_command = cmd_name - state.last_status = status - state.last_duration = duration - state.refresh_credits_from_cache() + async def _usage_refresher() -> None: + while True: + if state.api_key_set: + await _do_usage_refresh() + try: + await _asyncio.wait_for( + _refresh_event.wait(), + timeout=SessionState.USAGE_REFRESH_INTERVAL, + ) + _refresh_event.clear() + except _asyncio.TimeoutError: + pass + + def _signal_refresh_from_thread() -> None: + """Request an immediate usage refresh from a non-loop thread. + + ``asyncio.Event.set`` is not thread-safe, so we hop back onto the + application's event loop. Used after the worker thread finishes + ``usage`` (data just arrived) or ``auth`` (api_key may have just + become set) so the toolbar updates without waiting for the next + scheduled 30s tick. + """ + try: + loop = app.loop # type: ignore[attr-defined] + if loop is not None: + loop.call_soon_threadsafe(_refresh_event.set) + except Exception: + pass + + # Track background tasks so we can cancel them cleanly on shutdown + # instead of letting them run until the process exits (they would keep + # firing app.invalidate() against a dead app and leak the asyncio loop + # if the REPL is ever embedded in a larger program). 
+ _bg_tasks: list[Any] = [] + + def _pre_run() -> None: + _bg_tasks.append(app.create_background_task(_ticker())) + _bg_tasks.append(app.create_background_task(_usage_refresher())) + + # ── Run inside patch_stdout so command output flows above the prompt ──── + def _restore_bg(): + if _set_black_bg: + try: + sys.stdout.write("\033]111\007") # reset bg to user default + sys.stdout.write("\033]110\007") # reset fg to user default + sys.stdout.flush() + except Exception: + pass + + # Pipe every stdout / stderr write into the virtual scrollback buffer. + # The renderer (FormattedTextControl on the output Window) reads from + # the buffer each frame. We don't touch the real terminal at all + # while the app runs — that's the alt buffer's job, and it'll be + # dismissed cleanly on exit. + def _on_buffer_write() -> None: + # Auto-follow: a write while user is at the bottom keeps them at + # the bottom (scroll_offset stays 0). A user who's scrolled up + # stays put — they explicitly asked to read history. + try: + app.invalidate() + except Exception: + pass + + sb_writer = ScrollbackWriter(scrollback, on_write=_on_buffer_write) + original_stdout, original_stderr = sys.stdout, sys.stderr + sys.stdout = sb_writer # type: ignore[assignment] + sys.stderr = sb_writer # type: ignore[assignment] + # Some callers (cli_utils.write_output) call ``sys.stdout.buffer.write(bytes)``. + # Expose a binary-decoding adapter so those routes still land in our + # scrollback as text. Truly binary output is decoded with errors=replace. + if not hasattr(sys.stdout, "buffer"): + sys.stdout.buffer = _BinaryAdapter(sys.stdout) # type: ignore[attr-defined] + if not hasattr(sys.stderr, "buffer"): + sys.stderr.buffer = _BinaryAdapter(sys.stderr) # type: ignore[attr-defined] + # err_console (rich.Console used by theme.py) caches a file= reference + # at module import time — point it at our buffer too. 
+ _orig_err_console_file = err_console.file + err_console.file = sb_writer # type: ignore[assignment] + try: + app.run(pre_run=_pre_run) + finally: + # Cancel background tasks (ticker + usage refresher) so they stop + # invalidating the now-dead app and release the loop they live on. + for task in _bg_tasks: + try: + task.cancel() + except Exception: + pass + sys.stdout = original_stdout + sys.stderr = original_stderr + try: + err_console.file = _orig_err_console_file + except Exception: + pass + _restore_bg() + set_repl_mode(False) diff --git a/src/scrapingbee_cli/theme.py b/src/scrapingbee_cli/theme.py index 2f839dc..12fdaf4 100644 --- a/src/scrapingbee_cli/theme.py +++ b/src/scrapingbee_cli/theme.py @@ -260,6 +260,12 @@ def _animate(self) -> None: sys.stderr.flush() def start(self) -> None: + # Disabled inside the REPL: the spinner's `\r`-rewrites would flow + # through patch_stdout and trigger a bottom-strip redraw on every + # frame, causing visible flicker. The REPL's toolbar conveys the + # "running" state instead. + if _repl_mode: + return if not sys.stderr.isatty(): return self._thread = threading.Thread(target=self._animate, daemon=True) @@ -425,7 +431,13 @@ def _run(self) -> None: # -- public -------------------------------------------------------------- def start(self) -> None: - if not _repl_mode: + # Disabled inside the REPL. The REPL's bottom toolbar already shows + # credits + a usage gauge; running this thread additionally would + # repaint the bottom strip every ~0.5s via `\r`-rewrites that flow + # through patch_stdout, which is exactly what we see as flicker + # during a scrape. (Direct CLI mode — `scrapingbee scrape ...` outside + # the REPL — still gets the live meter on stderr as before.) 
+ if _repl_mode: return # Print initial meter immediately if we have data if self._remaining is not None: From 70cd140f08671dbdb5d9262bfb31f200e9c2e9e3 Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Wed, 13 May 2026 18:28:26 +0530 Subject: [PATCH 07/15] fix(repl): snappy Esc + reliable Up after submit/erase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit History navigation - _submit now calls ``input_buffer.reset()`` instead of ``set_document(Document(""))`` so the history-navigation cursor (``working_index``) is also reset. Without this, after submitting a command the next Up press could continue browsing from wherever the user had last left off in history. - Up handler synchronously loads history strings into ``_working_lines`` when the buffer is fresh (len == 1). prompt_toolkit's ``load_history_if_not_yet_loaded`` schedules an *async* task that doesn't run before the first keypress, so without this the first Up after submit was a no-op and required two presses. - Up handler also jumps ``working_index`` to the end when the buffer is empty after browsing, so Up restarts from the newest entry rather than walking further back from the previous browse position. Esc latency - Drop ``ttimeoutlen`` (parser-level escape-sequence wait, default 0.5s) to 0.05s on both the main REPL Application and the :view pager Application. Modern terminals deliver escape sequences as one read so 50ms is plenty. - Drop ``timeoutlen`` (key-processor multi-key-binding wait, default 1.0s) to 0.05s on the pager — this was the main culprit behind the 2-3 second Esc delay there. - Bind ``escape`` in the pager with ``eager=True`` so it fires the moment the key processor sees it, bypassing partial-match search. Both attributes are set on the Application instance after construction because they aren't constructor parameters in this prompt_toolkit version (passing them to __init__ raises TypeError). 
--- src/scrapingbee_cli/interactive.py | 68 +++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index d9de2ff..802c93f 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -1387,11 +1387,19 @@ def _status_line(): kb = KeyBindings() @kb.add("q") - @kb.add("escape") @kb.add("c-c") def _exit(event): event.app.exit() + # Esc gets its own binding with ``eager=True`` so it fires immediately + # instead of waiting through prompt_toolkit's internal key-processor + # ``timeoutlen`` (the buffered-input default + any partial-match + # search across implicit bindings). Without eager the user perceives + # a multi-second pause between pressing Esc and the pager exiting. + @kb.add("escape", eager=True) + def _exit_esc(event): + event.app.exit() + @kb.add("r") def _toggle_raw(_e): # No-op if the content isn't JSON — pretty isn't available, so @@ -1453,6 +1461,15 @@ def _right(_e): full_screen=True, mouse_support=True, ) + # Shrink BOTH escape-related timeouts. ``ttimeoutlen`` is the parser- + # level wait for "is this Esc-byte the start of an escape sequence", + # default 0.5s. ``timeoutlen`` is the key-processor wait for "is this + # complete key the start of a multi-key binding", default 1.0s. + # Together with eager=True on the Esc-exit binding above, this makes + # Esc fire essentially instantly in the pager. 50ms is enough for + # any well-formed escape sequence from a modern terminal. + pager_app.ttimeoutlen = 0.05 + pager_app.timeoutlen = 0.05 # We're (almost certainly) called from inside the REPL's prompt_toolkit # event loop — a sync key-binding handler invoked `:view`. 
Calling @@ -2516,10 +2533,15 @@ def _submit(event): text = input_buffer.text stripped = text.strip() if not stripped: - input_buffer.set_document(Document(""), bypass_readonly=True) + # ``reset()`` clears the buffer AND the history-navigation + # cursor (``working_index``). A plain set_document keeps the + # cursor, so an Up press after an empty Enter would resume + # whatever the user was previously browsing in history rather + # than starting fresh from the most recent command. + input_buffer.reset() return if stripped.lower() in _QUIT_TOKENS: - input_buffer.set_document(Document(""), bypass_readonly=True) + input_buffer.reset() event.app.exit() return # Persist the submitted line into the FileHistory before we kick off @@ -2536,13 +2558,12 @@ def _submit(event): # Clear the buffer only after a successful parse — _execute returns # False for shlex errors so the user can fix their unclosed quote # in-place instead of having to retype the whole line. - # ``bypass_readonly=True`` is mandatory: _execute synchronously sets - # ``is_input_locked[0] = True`` before spawning the worker, which - # makes the Buffer read-only — a plain bypass_readonly=False - # set_document would be silently rejected, leaving the typed line - # stranded in the prompt after the command finishes. + # We use ``reset()`` (not ``set_document``) so the + # history-navigation cursor is reset; otherwise a subsequent Up + # press would continue browsing from the prior position instead + # of starting at the newest entry. if _execute(stripped): - input_buffer.set_document(Document(""), bypass_readonly=True) + input_buffer.reset() @kb.add("c-c") def _ctrl_c(event): @@ -2617,7 +2638,29 @@ def _esc(event): # ``~has_completions`` filter ensures we don't compete. @kb.add("up", filter=~has_completions) def _history_back(event): - event.current_buffer.history_backward() + buf = event.current_buffer + # prompt_toolkit loads history asynchronously via a background + # task scheduled at first render. 
After our ``buffer.reset()`` on + # submit, that task is cancelled and ``_working_lines`` is just + # ``[""]`` — the next Up press lands before the task re-runs, so + # ``history_backward`` has nothing to walk and is a no-op. Load + # the history strings synchronously here as a fallback so the + # first Up after a submit actually shows the newest entry. + try: + if len(buf._working_lines) <= 1: + strings = list(buf.history.get_strings()) + if strings: + for s in reversed(strings): + buf._working_lines.appendleft(s) + buf.working_index = len(buf._working_lines) - 1 + elif not buf.text and buf.working_index != len(buf._working_lines) - 1: + # User has browsed back and erased to empty: jump the + # cursor to the newest entry so this Up restarts there + # instead of continuing from the previous browse point. + buf.working_index = len(buf._working_lines) - 1 + except Exception: + pass + buf.history_backward() @kb.add("down", filter=~has_completions) def _history_forward(event): @@ -2726,6 +2769,11 @@ def _toggle_mouse_mode(_event): full_screen=True, mouse_support=True, ) + # 50ms escape-sequence timeout (default 500ms). Snappy Esc for + # cancel-completion etc. — modern terminals deliver escape sequences + # as one read, so 50ms is plenty. Set on the instance because + # ``ttimeoutlen`` isn't a constructor parameter. + app.ttimeoutlen = 0.05 # ── Periodic invalidate while a command is in flight ─────────────────── # The shimmer on the running command line + the elapsed-time counter From eef71a8d76ddbac0f8bc98b06b4f4fd8ef67809f Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Wed, 13 May 2026 21:53:55 +0530 Subject: [PATCH 08/15] feat(repl): in-place api key prompt, ! shell exec, honeycomb progress, fast Ctrl+C - API key entry now lives inside the REPL UI: prompt flips to `API key:` with a masked input on startup or after `logout` / `auth`. No more pre-app getpass; no more `run_in_terminal` suspend/resume jolt. 
- `!cmd` runs a shell command in a worker thread, gated by the existing unsafe-mode check. Output streams into scrollback; Ctrl+C terminates the child. - Ctrl+C during a scrape stops in a frame instead of waiting for the HTTP request: tracks the worker's asyncio loop via a monkey-patched `asyncio.run` and cancels in-flight tasks via `call_soon_threadsafe`. CancelledError is caught alongside KeyboardInterrupt. - Submitted command stays in the buffer if the run fails or is cancelled; only successful runs clear it. - Batch progress: brand-yellow honeycomb hexes that fill as you go, with a shimmering boundary cell driven by the REPL's 10 Hz ticker. Single live-updating line via `replace_last_n_lines` instead of one appended row per completion. Usage credit meter mirrors the brand-yellow filled/outline palette. - `:view` now also accepts `:view crawl` (alias for the crawl log) and `:view <path>` for arbitrary files. Meta-command echo is spliced ABOVE the meta's output, matching click-command echo order. - History Up after submit no longer inverts oldest/newest order. - `_validate_api_key` detects a running loop and offloads to a worker thread so REPL-mode `auth` no longer hits "asyncio.run cannot be called from a running event loop".
--- src/scrapingbee_cli/batch.py | 25 +- src/scrapingbee_cli/commands/auth.py | 18 +- src/scrapingbee_cli/crawl.py | 12 +- src/scrapingbee_cli/interactive.py | 1005 +++++++++++++++++++++++--- src/scrapingbee_cli/theme.py | 219 +++++- 5 files changed, 1136 insertions(+), 143 deletions(-) diff --git a/src/scrapingbee_cli/batch.py b/src/scrapingbee_cli/batch.py index d59c204..6883a3c 100644 --- a/src/scrapingbee_cli/batch.py +++ b/src/scrapingbee_cli/batch.py @@ -5,6 +5,7 @@ import asyncio import hashlib import os +import sys import time from collections.abc import Awaitable, Callable from dataclasses import dataclass @@ -597,10 +598,15 @@ async def run_one(i: int, inp: str) -> tuple[int, BatchResult]: if failure_count > 0: fail_pct = failure_count / completed * 100 if is_repl_mode(): - progress = format_honeycomb_trail( - completed, total, rps=rps_val, eta=eta_val, failure_pct=fail_pct + # Push the latest counts/rates into the shared progress + # state. ``update_progress_state`` renders immediately + # AND the REPL ticker will keep re-rendering at ~10 Hz + # so the boundary hex shimmers between completions. + from .theme import update_progress_state + update_progress_state( + completed, total, + rps=rps_val, eta=eta_val, failure_pct=fail_pct, ) - err_console.print(progress) else: parts = [f"[{completed}/{total}]"] if rps_val is not None: @@ -615,7 +621,18 @@ async def run_one(i: int, inp: str) -> tuple[int, BatchResult]: return i, result tasks = [run_one(i, inp) for i, inp in enumerate(inputs)] - ordered = await asyncio.gather(*tasks, return_exceptions=True) + try: + ordered = await asyncio.gather(*tasks, return_exceptions=True) + finally: + # Stop the REPL's ticker from re-rendering the progress widget + # now that the batch is done (or cancelled). Safe to call even + # when state was never set. 
+ if is_repl_mode(): + try: + from .theme import clear_progress_state + clear_progress_state() + except Exception: + pass results: list[BatchResult] = [] for i, item in enumerate(ordered): if isinstance(item, BaseException): diff --git a/src/scrapingbee_cli/commands/auth.py b/src/scrapingbee_cli/commands/auth.py index fce226a..ac6ea2b 100644 --- a/src/scrapingbee_cli/commands/auth.py +++ b/src/scrapingbee_cli/commands/auth.py @@ -83,8 +83,24 @@ async def _check() -> tuple[int, bytes]: data, _, status_code = await client.usage(retries=1, backoff=1.0) return status_code, data + def _run_check() -> tuple[int, bytes]: + return asyncio.run(_check()) + try: - status, data = asyncio.run(_check()) + # ``asyncio.run`` refuses to start when a loop is already running + # in the current thread. The REPL's ``auth`` flow runs us on the + # main thread (via ``run_in_terminal``) while prompt_toolkit's + # Application loop is still active — offload the coroutine to a + # short-lived worker thread in that case. From a plain CLI + # invocation no loop is running, so we just use ``asyncio.run`` + # directly. 
+ try: + asyncio.get_running_loop() + from concurrent.futures import ThreadPoolExecutor + with ThreadPoolExecutor(max_workers=1) as pool: + status, data = pool.submit(_run_check).result() + except RuntimeError: + status, data = _run_check() if status == 200: return True, "" # API returned an error — try to extract the message diff --git a/src/scrapingbee_cli/crawl.py b/src/scrapingbee_cli/crawl.py index 1122c14..50d54b4 100644 --- a/src/scrapingbee_cli/crawl.py +++ b/src/scrapingbee_cli/crawl.py @@ -664,7 +664,11 @@ def run_project_spider( ) log_path = _maybe_set_repl_log_file(settings) if log_path: - click.echo(f"REPL mode: full crawl log → {log_path}", err=True) + click.echo( + f"REPL mode: full crawl log → {log_path} " + f"(use `:view crawl` to scroll through it)", + err=True, + ) process = CrawlerProcess(settings) process.crawl(spider_name) process.start(install_signal_handlers=_install_signal_handlers()) @@ -732,7 +736,11 @@ def run_urls_spider( settings.set("CLOSESPIDER_PAGECOUNT", max_pages) log_path = _maybe_set_repl_log_file(settings) if log_path: - click.echo(f"REPL mode: full crawl log → {log_path}", err=True) + click.echo( + f"REPL mode: full crawl log → {log_path} " + f"(use `:view crawl` to scroll through it)", + err=True, + ) process = CrawlerProcess(settings) process.crawl( GenericScrapingBeeSpider, diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index 802c93f..9c6d994 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -79,7 +79,7 @@ "completion-menu.completion.current": f"bg:{BEE_YELLOW} #000000 bold", "completion-menu.meta.completion": f"bg:{_BG_CHIP} #886600", "completion-menu.meta.completion.current": f"bg:{BEE_YELLOW} #000000", - "auto-suggestion": "fg:#554400 italic", + "auto-suggestion": "fg:#777777 italic", } @@ -142,6 +142,217 @@ def closed(self) -> bool: # --------------------------------------------------------------------------- +try: + from 
prompt_toolkit.auto_suggest import AutoSuggest as _PTKAutoSuggest +except Exception: # pragma: no cover — prompt_toolkit should always be present + _PTKAutoSuggest = object # type: ignore[assignment,misc] + + +class BeeAutoSuggest(_PTKAutoSuggest): + """Context-aware ghost-text autosuggest for the REPL prompt. + + On each keystroke prompt_toolkit calls ``get_suggestion`` with the + current buffer; we look at the partial token under the cursor and + return a single greyed-out continuation (or ``None`` for silence). + + Sources used, in order: + - **First word** → match against known command names. + - **A flag** (token starts with ``-``) → match flags registered for + the current command. + - **Token after a choice/bool flag** → match valid choice values. + - **Free text otherwise** → match the start of a previous history + line that begins with the same prefix. + + Candidates are ranked by recency in command history (most-recently- + used wins → behaves like frequency for active users). If the + partial token doesn't prefix any known candidate, we return + ``None`` — typos get no suggestion, even if they happen to be + substrings of past commands. + + Accepting a suggestion (Right arrow / End, or Ctrl+F for the first + word in emacs-style bindings) is handled by prompt_toolkit's + built-in ``auto_suggest_apply`` key processors — no extra wiring + needed here. + """ + + def __init__( + self, + command_names, + command_flags, + bool_flags, + choice_flags, + history, + is_disabled=None, + ) -> None: + self._command_names = sorted(command_names) + self._command_flags = command_flags + self._bool_flags = bool_flags + self._choice_flags = choice_flags + self._history = history + # Optional callable; when it returns True we skip suggestions + # entirely. Used during first-run API key entry — we don't want + # history-based suggestions (which might leak a previously-typed + # secret) or command-name suggestions (irrelevant in that mode). 
+ self._is_disabled = is_disabled + # Cache history lines (newest-first). Refreshed lazily when the + # underlying length changes — cheap O(1) check, avoids re-listing + # the history on every keystroke. + self._cached_lines: list[str] = [] + self._cached_len = -1 + + def _refresh_history(self) -> None: + if self._history is None: + return + try: + lines = list(self._history.get_strings()) + except Exception: + return + if len(lines) != self._cached_len: + self._cached_len = len(lines) + self._cached_lines = lines + + def _rank_by_recency(self, candidates: list[str]) -> list[str]: + """Sort candidates by first occurrence in (newest-first) history. + Unseen candidates fall to the end, then ordered alphabetically.""" + self._refresh_history() + recency: dict[str, int] = {} + for i, line in enumerate(self._cached_lines): + for tok in line.split(): + if tok in candidates and tok not in recency: + recency[tok] = i + return sorted(candidates, key=lambda c: (recency.get(c, 10**9), c)) + + def get_suggestion(self, buffer, document): + from prompt_toolkit.auto_suggest import Suggestion + + try: + if self._is_disabled is not None and self._is_disabled(): + return None + text = document.text_before_cursor + if not text: + return None + words = text.split() + if not words: + return None + first = words[0] + + # Gate against typos at the command level. We only allow a + # suggestion if the first token is either a recognised command + # or a valid PREFIX of one — otherwise we'd risk surfacing + # history junk for a clear typo (the user's explicit ask). + first_is_known = first in self._command_flags + first_is_prefix = ( + not first_is_known + and any(c.startswith(first) for c in self._command_names) + ) + if not (first_is_known or first_is_prefix): + return None + + # 1) Prefer a full history-line continuation. Catches the most + # natural case: "scrape https://exam" → finish the URL + # and any flags the user last paired with it. 
+ self._refresh_history() + for line in self._cached_lines: + if line.startswith(text) and line != text: + return Suggestion(line[len(text):]) + + # 2) No matching history line. Suggest from the structured + # options (command names, flags, choice values). + has_trailing_space = text.endswith(" ") + last = words[-1] + on_first = (len(words) == 1) and not has_trailing_space + + if on_first: + cands = [ + c for c in self._command_names + if c.startswith(last) and c != last + ] + if not cands: + return None + best = self._rank_by_recency(cands)[0] + return Suggestion(best[len(last):]) + + # Multi-word — need a recognised command to suggest structure. + if not first_is_known: + return None + if has_trailing_space: + return None # no partial token to complete + + if last.startswith("-"): + flags = self._command_flags.get(first, []) + cands = [f for f in flags if f.startswith(last) and f != last] + if not cands: + return None + best = self._rank_by_recency(cands)[0] + return Suggestion(best[len(last):]) + + if len(words) >= 2: + prev = words[-2] + if prev in self._choice_flags: + cands = [ + v for v in self._choice_flags[prev] + if v.startswith(last) and v != last + ] + if not cands: + return None + best = self._rank_by_recency(cands)[0] + return Suggestion(best[len(last):]) + if prev in self._bool_flags: + for v in ("true", "false"): + if v.startswith(last.lower()) and v != last.lower(): + return Suggestion(v[len(last):]) + return None + return None + except Exception: + return None + + +def _make_capped_history(filename: str, max_entries: int = 10_000): + """Construct a ``FileHistory`` with the on-disk file pre-trimmed to + keep at most ``max_entries`` most-recent entries. + + prompt_toolkit's stock ``FileHistory`` appends forever — every + command you ever type lives in ``.history`` until you delete the + file manually. For long-running CLI users that file grows unbounded + and slows down the REPL's initial history-load. 
We keep the last + 10000 entries on disk (a few months of normal use, file stays + under ~2 MB). + + Trim runs once at construction. During the session, ``FileHistory`` + appends as normal — no per-write overhead. The file may briefly + exceed the cap mid-session; the excess is dropped on next startup. + """ + import datetime as _dt + import os as _os + + from prompt_toolkit.history import FileHistory + + if _os.path.exists(filename): + try: + tmp_history = FileHistory(filename) + strings = list(tmp_history.load_history_strings()) # newest-first + if len(strings) > max_entries: + keep_newest_first = strings[:max_entries] + keep_oldest_first = list(reversed(keep_newest_first)) + tmp = filename + ".tmp" + now = _dt.datetime.now() + try: + with open(tmp, "wb") as f: + for s in keep_oldest_first: + f.write(f"\n# {now}\n".encode("utf-8")) + for line in s.split("\n"): + f.write(f"+{line}\n".encode("utf-8")) + _os.replace(tmp, filename) + except Exception: + try: + _os.unlink(tmp) + except Exception: + pass + except Exception: + pass + return FileHistory(filename) + + def _split_fragments_to_width( line: list[tuple[str, str]], width: int ) -> list[list[tuple[str, str]]]: @@ -214,6 +425,35 @@ def append_fragments(self, fragments: list[tuple[str, str]]) -> None: drop = self.MAX_LINES // 10 del self.lines[:drop] + def replace_last_line(self, fragments: list[tuple[str, str]]) -> None: + """Overwrite the most recent line. Used for in-place progress + updates via the standard terminal ``\\r`` idiom — write + ``\\r\\n`` and the previous line gets replaced rather + than another row appended. + """ + with self._lock: + if self.lines: + self.lines[-1] = list(fragments) + else: + self.lines.append(list(fragments)) + + def replace_last_n_lines( + self, n: int, lines: list[list[tuple[str, str]]] + ) -> None: + """Replace the most recent ``n`` lines with the given ``lines``. + If fewer than ``n`` lines exist, the remainder is appended. 
+ Used for multi-line in-place progress widgets (e.g. the + 3-row honeycomb progress bar). + """ + with self._lock: + if len(self.lines) >= n and n > 0: + # Replace tail in place — same count, no shift. + self.lines[len(self.lines) - n:] = [list(f) for f in lines] + else: + # Not enough prior lines to replace; append. + for f in lines: + self.lines.append(list(f)) + def append_ansi_text(self, text: str) -> None: """Parse ANSI codes in ``text`` and append the resulting line(s). @@ -221,6 +461,13 @@ def append_ansi_text(self, text: str) -> None: trailing newline (e.g. an in-progress progress bar). We split on ``\\n``; the final post-split chunk goes into a pending buffer that gets prepended to the next write. + + Carriage-return (``\\r``) handling: anything before the last + ``\\r`` on a line is discarded (standard terminal "go to start + of line" semantics), AND the resulting line replaces the + previous line in scrollback instead of appending. This lets + callers do in-place progress updates by writing + ``\\r\\n`` repeatedly. """ from prompt_toolkit.formatted_text import ANSI, to_formatted_text @@ -233,11 +480,19 @@ def append_ansi_text(self, text: str) -> None: complete = chunks[:-1] for raw in complete: + had_cr = "\r" in raw + if had_cr: + # Everything before the last \r is overwritten — keep + # only what comes after it. 
+ raw = raw.rsplit("\r", 1)[1] try: fragments = list(to_formatted_text(ANSI(raw))) except Exception: fragments = [("", raw)] - self.append_fragments(fragments) + if had_cr: + self.replace_last_line(fragments) + else: + self.append_fragments(fragments) def flush_pending(self) -> None: """Commit any pending partial line as its own row.""" @@ -900,7 +1155,7 @@ def render() -> list[tuple[str, str]]: ) hint_chunk = [ ("class:toolbar.value", mode_label), - ("class:toolbar.hint", " · Tab to switch"), + ("class:toolbar.hint", " · Shift+Tab to switch"), ] LEADING = " " @@ -1250,11 +1505,12 @@ def _print_row(cmd: str, desc: str) -> None: for cmd, desc in [ (":help, :?", "Show this command list"), (":clear", "Clear the screen"), - (":view", "Scroll through the last command's full output"), + (":view", "Scroll the last command's output ('crawl' = crawl log, or pass a path)"), (":set K=V ...", "Set one or more session defaults"), (":unset K", "Remove a session default ('all' or '*' clears every)"), (":reset", "Clear every session default"), (":show", "Show current session defaults"), + ("!", "Run a shell command (requires unsafe mode)"), (":q, :quit", "Quit the REPL"), ]: _print_row(cmd, desc) @@ -1608,20 +1864,56 @@ def _handle_meta( if head_low == ":view": from pathlib import Path - cache_path = Path.home() / ".cache" / "scrapingbee-cli" / "last-output" - if not cache_path.exists(): - err_console.print(f" [{BEE_DIM}]no recent output to view[/]") + cache_dir = Path.home() / ".cache" / "scrapingbee-cli" + crawl_log = cache_dir / "crawl.log" + target_arg = rest.strip() + # `:view` → last command's output + # `:view crawl` → the crawl log written by the most recent + # `crawl` run in REPL mode + # `:view crawl ` → also alias-mode, but ONLY when the + # path after ``crawl`` resolves to the + # actual crawl.log on disk. 
This lets + # users copy the full hint line ("crawl + # /Users/.../crawl.log") into the + # prompt; random text after ``crawl`` + # falls through to "file not found" + # instead of silently opening the log. + # `:view ` → arbitrary file (must exist) + if not target_arg: + target_path = cache_dir / "last-output" + missing_msg = "no recent output to view" + elif target_arg.lower() == "crawl": + target_path = crawl_log + missing_msg = "no crawl log yet — run `crawl ...` first" + elif target_arg.lower().startswith("crawl "): + after = target_arg[len("crawl "):].strip() + try: + supplied_path = Path(after).expanduser().resolve(strict=False) + if supplied_path == crawl_log.resolve(strict=False): + target_path = crawl_log + missing_msg = "no crawl log yet — run `crawl ...` first" + else: + target_path = Path(target_arg).expanduser() + missing_msg = f"file not found: {target_arg}" + except Exception: + target_path = Path(target_arg).expanduser() + missing_msg = f"file not found: {target_arg}" + else: + target_path = Path(target_arg).expanduser() + missing_msg = f"file not found: {target_arg}" + if not target_path.exists(): + err_console.print(f" [{BEE_DIM}]{missing_msg}[/]") return "ok" try: - _open_pager(str(cache_path)) + _open_pager(str(target_path)) except FileNotFoundError: # File got deleted between exists() and read() — race with cleanup - err_console.print(f" [{BEE_DIM}]cached output no longer available[/]") + err_console.print(f" [{BEE_DIM}]file no longer available[/]") except Exception as e: err_console.print(f" [bold {BEE_RED}]pager error:[/] {e}") err_console.print( f" [{BEE_DIM}]full output saved at[/] " - f"[bold {BEE_YELLOW}]{cache_path}[/]" + f"[bold {BEE_YELLOW}]{target_path}[/]" ) return "ok" @@ -1849,6 +2141,42 @@ def run_repl(cli_group: Any, version: str, *, keep_bg: bool = False) -> None: set_repl_mode(True) + # ── Asyncio loop tracking for fast Ctrl+C ─────────────────────────────── + # Commands like ``scrape`` run ``asyncio.run(...)`` inside a worker 
+ # thread to drive aiohttp. While the loop is in ``select()`` waiting + # on a socket, ``PyThreadState_SetAsyncExc`` doesn't deliver an + # interrupt — it only fires at the next Python bytecode boundary, and + # no bytecode runs until ``select()`` returns (typically when the + # ScrapingBee API responds, which can be 30+ seconds). + # + # We monkey-patch ``asyncio.run`` for the duration of this REPL + # session so we can keep a handle to the worker's loop. The Ctrl+C + # handler then uses ``call_soon_threadsafe`` to cancel in-flight + # tasks — that wakes the selector immediately and raises + # ``CancelledError`` on the await, which propagates out cleanly + # (the worker's except clause turns it into "stopped"). + import asyncio as _asyncio_mod + + _active_worker_loop: list[Any] = [None] + _original_asyncio_run = _asyncio_mod.run + + def _tracking_loop_factory(): + loop = _asyncio_mod.new_event_loop() + _active_worker_loop[0] = loop + return loop + + def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): + try: + return _original_asyncio_run( + main, + debug=debug, + loop_factory=loop_factory or _tracking_loop_factory, + ) + finally: + _active_worker_loop[0] = None + + _asyncio_mod.run = _tracking_asyncio_run + # ── Click tree introspection ──────────────────────────────────────────── command_help, command_flags, bool_flags, choice_flags = _walk_click_tree(cli_group) command_names = sorted(command_flags.keys()) @@ -1866,7 +2194,7 @@ def run_repl(cli_group: Any, version: str, *, keep_bg: bool = False) -> None: history_path = str(Path.home() / ".config" / "scrapingbee-cli" / ".history") Path(history_path).parent.mkdir(parents=True, exist_ok=True) try: - history = FileHistory(history_path) + history = _make_capped_history(history_path, max_entries=10_000) except Exception: history = None # type: ignore[assignment] @@ -1899,79 +2227,71 @@ def run_repl(cli_group: Any, version: str, *, keep_bg: bool = False) -> None: scrollback = ScrollbackBuffer() rows = 
shutil.get_terminal_size((80, 24)).lines # kept for API-key prompt sizing - # ── First-run API key prompt ──────────────────────────────────────────── - # Inline masked input — banner already announced us, no need to repeat. - # Validates against the live `/usage` endpoint, saves to ~/.config/ - # scrapingbee-cli/.env, and updates os.environ so the rest of this - # process sees the new key (the .env file alone wouldn't help — most - # call sites read os.environ directly). - # - # Required, not skippable: almost every command in the CLI hits the - # ScrapingBee API, so launching the REPL without a key would just give - # the user an unusable shell. We loop until a valid key is entered or - # they Ctrl+C / Ctrl+D out (which exits the whole program, since with - # no key the REPL is dead weight). - if not state.api_key_set: - from .commands.auth import _masked_getpass, _validate_api_key - from .config import ENV_API_KEY, save_api_key_to_dotenv + # ── Multi-line in-place progress renderer ─────────────────────────────── + # Wired so batch operations (``scrape --input-file ...``) can update a + # 3-row honeycomb progress widget in place rather than appending a new + # row per completion. The renderer keeps track of how many lines the + # previous frame consumed so the next frame overwrites the same band. 
+ from .theme import set_progress_renderer as _set_progress_renderer - err_console.print( - f" [{BEE_DIM}]Enter your API key to get started — find it at " - f"[bold {BEE_YELLOW}]dashboard.scrapingbee.com/dashboard[/][{BEE_DIM}].[/]" - ) - err_console.print() - while not state.api_key_set: + _progress_line_count = [0] + + def _render_progress(rendered_lines: list[str]) -> None: + from prompt_toolkit.formatted_text import ANSI, to_formatted_text + + fragments_per_line: list[list[tuple[str, str]]] = [] + for raw in rendered_lines: try: - raw = _masked_getpass(" API key: ") - except (EOFError, KeyboardInterrupt): - err_console.print() - err_console.print( - f" [{BEE_DIM}]Exiting — an API key is required to use the CLI.[/]" - ) - if _set_black_bg: - try: - sys.stdout.write("\033]111\007") - sys.stdout.write("\033]110\007") - sys.stdout.flush() - except Exception: - pass - set_repl_mode(False) - return - raw = raw or "" - key = raw.strip() - # Pasted keys from password managers / clipboards often pick up - # a leading or trailing space. Strip silently but warn so the - # user knows we did — otherwise a key that "looks right" but - # fails to authenticate is bewildering. - if key and key != raw: - err_console.print( - f" [{BEE_DIM}]Note: stripped surrounding whitespace from your key.[/]" - ) - if not key: - err_console.print( - f" [bold {BEE_RED}]Empty key.[/] [{BEE_DIM}]Please paste your API key.[/]" - ) - continue - err_console.print(f" [{BEE_DIM}]Validating…[/]") - valid, err_msg = _validate_api_key(key) - if valid: - try: - save_api_key_to_dotenv(key) - except Exception as e: - err_console.print( - f" [bold {BEE_RED}]Could not save:[/] [{BEE_DIM}]{e}[/]" - ) - os.environ[ENV_API_KEY] = key - state.api_key_set = True - err_console.print(f" [bold {BEE_YELLOW}]✓[/] API key saved.") - else: - err_console.print( - f" [bold {BEE_RED}]Invalid:[/] [{BEE_DIM}]{err_msg or 'unknown error'}. 
Try again.[/]" - ) - # No clear / re-write needed — the API-key prompt happened in - # the real terminal, then app.run() will switch to the alt - # buffer and we get a clean screen automatically. The banner is - # already loaded in our scrollback buffer from earlier. + fragments_per_line.append(list(to_formatted_text(ANSI(raw)))) + except Exception: + fragments_per_line.append([("", raw)]) + n = len(fragments_per_line) + prev = _progress_line_count[0] + if prev > 0 and prev == n: + scrollback.replace_last_n_lines(prev, fragments_per_line) + else: + # First frame, or row-count changed (rare): append fresh and + # remember how many lines to overwrite next time. + for f in fragments_per_line: + scrollback.append_fragments(f) + _progress_line_count[0] = n + + _set_progress_renderer(_render_progress) + + # ── First-run API key state ───────────────────────────────────────────── + # When no API key is configured we open the REPL UI in a "first-run" + # mode: the bottom prompt changes from ``❯`` to ``API key: ``, the + # input field is masked via PasswordProcessor, and ``_submit`` routes + # to ``_handle_first_run_key`` (which validates against /usage and + # writes to ~/.config/scrapingbee-cli/.env). Once a key validates we + # flip the flag and the prompt transitions to normal command mode in + # place — no app restart, no screen flicker. + _first_run_needs_key = [not state.api_key_set] + if _first_run_needs_key[0]: + # Render the welcome lines into the scrollback area so the user + # sees them right below the banner while the input field shows + # ``API key:``. We use a throwaway rich Console to produce ANSI, + # then append to the scrollback buffer (the live ``err_console`` + # path doesn't work yet — patch_stdout isn't installed until + # ``app.run()`` starts). 
+ try: + from io import StringIO as _SIO + from rich.console import Console as _RC + + _buf = _SIO() + _c = _RC( + file=_buf, force_terminal=True, color_system="truecolor", + highlight=False, width=shutil.get_terminal_size((80, 24)).columns, + ) + _c.print( + f" [{BEE_DIM}]Welcome! Enter your API key to get started — " + f"find it at [bold {BEE_YELLOW}]dashboard.scrapingbee.com/dashboard[/]" + f"[{BEE_DIM}].[/]" + ) + _c.print() + scrollback.append_ansi_text(_buf.getvalue()) + except Exception: + pass # ── Input buffer ──────────────────────────────────────────────────────── # Locked while a worker thread is running a command so the user can't @@ -1982,18 +2302,32 @@ def run_repl(cli_group: Any, version: str, *, keep_bg: bool = False) -> None: # Ctrl+C handler to inject KeyboardInterrupt into the worker so the user # can stop a long scrape without exiting the REPL. current_worker: list[threading.Thread | None] = [None] + # Currently-running shell subprocess (when the user submits ``!cmd``). + # Ctrl+C uses this to terminate the child process directly — injecting + # KeyboardInterrupt into the worker thread alone doesn't fire while the + # thread is blocked reading the subprocess's stdout in a C-level read(). 
+ current_subprocess: list[Any] = [None] input_buffer = Buffer( history=history, completer=completer, complete_while_typing=False, - auto_suggest=AutoSuggestFromHistory(), + auto_suggest=BeeAutoSuggest( + command_names=command_names, + command_flags=command_flags, + bool_flags=bool_flags, + choice_flags=choice_flags, + history=history, + is_disabled=lambda: _first_run_needs_key[0], + ), multiline=False, read_only=Condition(lambda: is_input_locked[0]), ) def _line_prefix(line_no, _wrap_count): if line_no == 0: + if _first_run_needs_key[0]: + return [("class:promptmark", "API key: ")] return [("class:promptmark", "❯ ")] return [("", " ")] @@ -2009,8 +2343,34 @@ def _input_height(): return D.exact(0) return D(min=1, max=8) + # ``AppendAutoSuggestion`` is the input processor that renders ghost-text + # auto-suggestions after the cursor. Without it, ``buffer.suggestion`` + # is set correctly but never drawn — BufferControl alone only handles + # the typed text + lexer styling. ``HighlightMatchingBracketProcessor`` + # isn't applied so we don't add it. + # + # ``PasswordProcessor`` masks the input when ``_first_run_needs_key`` is + # True so an API key isn't visible on-screen. Wrapped in a + # ``ConditionalProcessor`` so masking flips off automatically once the + # key validates and we transition to normal command mode. 
+ from prompt_toolkit.layout.processors import ( + AppendAutoSuggestion, + ConditionalProcessor, + PasswordProcessor, + ) + input_window = Window( - content=BufferControl(buffer=input_buffer, lexer=_make_lexer()), + content=BufferControl( + buffer=input_buffer, + lexer=_make_lexer(), + input_processors=[ + ConditionalProcessor( + PasswordProcessor(), + Condition(lambda: _first_run_needs_key[0]), + ), + AppendAutoSuggestion(), + ], + ), get_line_prefix=_line_prefix, wrap_lines=True, height=_input_height, @@ -2240,6 +2600,179 @@ def _echo_to_scrollback(line: str) -> None: echo.append(line, style=BEE_DIM) err_console.print(echo) + # ── First-run API key validation ──────────────────────────────────────── + # Called from ``_submit`` on the main thread when ``_first_run_needs_key`` + # is True. The user just submitted the masked key — we validate it + # against the live /usage endpoint, persist on success, and flip the + # flag so subsequent submits route to ``_execute`` (normal commands). + def _handle_first_run_key(key_raw: str, raw_with_ws: str) -> None: + from .commands.auth import _validate_api_key + from .config import ENV_API_KEY, save_api_key_to_dotenv + + key = key_raw.strip() + # Pasted keys from password managers often pick up surrounding + # whitespace. Silently strip but warn so the user knows we did. 
+ if key and key != raw_with_ws.rstrip("\n"): + err_console.print( + f" [{BEE_DIM}]Note: stripped surrounding whitespace from your key.[/]" + ) + if not key: + err_console.print( + f" [bold {BEE_RED}]Empty key.[/] [{BEE_DIM}]Please paste your API key.[/]" + ) + return + err_console.print(f" [{BEE_DIM}]Validating…[/]") + valid, err_msg = _validate_api_key(key) + if valid: + try: + save_api_key_to_dotenv(key) + except Exception as e: + err_console.print( + f" [bold {BEE_RED}]Could not save:[/] [{BEE_DIM}]{e}[/]" + ) + os.environ[ENV_API_KEY] = key + state.api_key_set = True + _first_run_needs_key[0] = False + err_console.print(f" [bold {BEE_YELLOW}]✓[/] API key saved.") + # Toolbar credits/concurrency are stale (None); trigger a fresh + # /usage fetch so the bottom strip populates without waiting + # for the 30s tick. + _signal_refresh_from_thread() + try: + app.invalidate() + except Exception: + pass + else: + err_console.print( + f" [bold {BEE_RED}]Invalid:[/] [{BEE_DIM}]{err_msg or 'unknown error'}. Try again.[/]" + ) + + # ── Shell command execution (`!cmd` in the REPL) ──────────────────────── + # Runs in a worker thread so the REPL stays responsive. stdout+stderr + # are merged and streamed line-by-line through the patched + # ``sys.stdout`` (which writes into scrollback). Ctrl+C terminates the + # child process via ``current_subprocess[0].terminate()`` AND injects + # KeyboardInterrupt into the worker thread (so a hung read returns + # promptly). + def _execute_shell(shell_cmd: str, original_line: str, echo_idx: int) -> None: + import subprocess + + output_start_index = echo_idx + start = time.monotonic() + status_ref = ["ok"] + state.is_running = True + state.running_command = "shell" + state.running_command_text = original_line + state.run_start = start + + def _run() -> None: + try: + # Use the system shell so users can pipe / redirect / glob + # naturally. 
Merge stderr into stdout for unified streaming; + # any separation is the user's problem (they'd redirect + # 2>&1 themselves if they cared). + proc = subprocess.Popen( # noqa: S602 — gated by exec_gate + shell_cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + current_subprocess[0] = proc + try: + assert proc.stdout is not None + for chunk in iter(proc.stdout.readline, ""): + sys.stdout.write(chunk) + finally: + code = proc.wait() + current_subprocess[0] = None + if code != 0: + status_ref[0] = "fail" + err_console.print( + f" [{BEE_DIM}]exit code {code}[/]" + ) + except KeyboardInterrupt: + # Ctrl+C: stop the child if it's still running, then mark + # the command as cancelled in the footer. + proc = current_subprocess[0] + if proc is not None: + try: + proc.terminate() + except Exception: + pass + err_console.print(f" [{BEE_DIM}]stopped[/]") + status_ref[0] = "stopped" + except Exception as e: + err_console.print(f" [bold {BEE_RED}]error:[/] {e}") + status_ref[0] = "fail" + + def _finish() -> None: + duration = time.monotonic() - start + state.is_running = False + state.running_command = None + state.running_command_text = None + state.run_start = None + # Splice the dim echo line above the streamed output. 
+ try: + from prompt_toolkit.formatted_text import ( + ANSI as _ANSI, + to_formatted_text as _tft, + ) + from io import StringIO as _SIO + from rich.console import Console as _RC + + _buf = _SIO() + _c = _RC( + file=_buf, force_terminal=True, color_system="truecolor", + highlight=False, width=200, + ) + _echo_t = Text() + _echo_t.append("❯ ", style=BEE_DIM) + _echo_t.append(original_line, style=BEE_DIM) + _c.print(_echo_t, end="") + _echo_fragments = list(_tft(_ANSI(_buf.getvalue()))) + scrollback.insert_line(output_start_index, _echo_fragments) + except Exception: + pass + _print_command_footer(status_ref[0], duration) + state.last_command = "shell" + state.last_status = status_ref[0] + state.last_duration = duration + is_input_locked[0] = False + try: + app.invalidate() + except Exception: + pass + + is_input_locked[0] = True + try: + app.invalidate() + except Exception: + pass + + def _worker() -> None: + try: + _run() + finally: + current_worker[0] = None + try: + _finish() + except Exception: + state.is_running = False + state.running_command = None + state.running_command_text = None + state.run_start = None + is_input_locked[0] = False + try: + app.invalidate() + except Exception: + pass + + worker_thread = threading.Thread(target=_worker, daemon=True) + current_worker[0] = worker_thread + worker_thread.start() + # ── Command execution (synchronous, output flows via patched stdout) ──── def _execute(line: str) -> bool: """Run a single REPL submission: meta-command or click command. @@ -2262,6 +2795,14 @@ def _execute(line: str) -> bool: # the live shimmering line above the input is the only on-screen # representation while the command runs. # `:q` is handled at the key-binding layer so we don't get here for it. + # + # Snapshot scrollback length before running the meta-handler so we + # can splice the ``❯ line`` echo at this position afterwards. Without + # this, the echo lands AFTER any error/info the meta-handler + # printed (e.g. 
``file not found: foo`` then ``❯ :view foo``), which + # reads upside-down. Insert-at-position keeps the conversational + # order: command, then its output. + meta_echo_idx = scrollback.current_length() meta = _handle_meta( line, state, command_help, all_known_flags, bool_flags, choice_flags, scrollback=scrollback, @@ -2286,13 +2827,89 @@ def _execute(line: str) -> bool: app.invalidate() except Exception: pass - # Meta commands echo themselves with rich-styled output already; - # only echo the user's typed line for the record. - _echo_to_scrollback(line) + # Splice the dim echo line ABOVE whatever the meta-handler + # printed during its run. Fall back to appending if the + # rich-render or insert path fails. + try: + from prompt_toolkit.formatted_text import ( + ANSI as _ANSI, + to_formatted_text as _tft, + ) + from io import StringIO as _SIO + from rich.console import Console as _RC + _buf = _SIO() + _c = _RC( + file=_buf, force_terminal=True, color_system="truecolor", + highlight=False, width=200, + ) + _echo_t = Text() + _echo_t.append("❯ ", style=BEE_DIM) + _echo_t.append(line, style=BEE_DIM) + _c.print(_echo_t, end="") + _echo_fragments = list(_tft(_ANSI(_buf.getvalue()))) + scrollback.insert_line(meta_echo_idx, _echo_fragments) + except Exception: + _echo_to_scrollback(line) return True if meta == "quit": # belt-and-braces; key binding usually catches it return True + # `!shell command` — run a shell command in a worker thread, + # streaming output into scrollback. Gated by the same unsafe-mode + # check used by --post-process / --on-complete / schedule. 
+ if line.startswith("!"): + shell_cmd = line[1:].strip() + shell_echo_idx = scrollback.current_length() + if not shell_cmd: + err_console.print( + f" [{BEE_DIM}]usage: ![/]" + f"[bold {BEE_YELLOW}][/]" + ) + else: + from .exec_gate import ( + is_command_whitelisted, + is_exec_enabled, + is_whitelist_enabled, + ) + + if not is_exec_enabled(): + err_console.print( + f" [bold {BEE_RED}]Shell execution disabled.[/] " + f"[{BEE_DIM}]Enable it with `auth --unsafe` " + f"(requires SCRAPINGBEE_ALLOW_EXEC=1).[/]" + ) + elif is_whitelist_enabled() and not is_command_whitelisted(shell_cmd): + err_console.print( + f" [bold {BEE_RED}]Blocked:[/] " + f"[{BEE_DIM}]command not in whitelist or contains " + f"shell-injection patterns.[/]" + ) + else: + _execute_shell(shell_cmd, line, shell_echo_idx) + return True + # Echo the typed line above whatever error we just printed. + try: + from prompt_toolkit.formatted_text import ( + ANSI as _ANSI, + to_formatted_text as _tft, + ) + from io import StringIO as _SIO + from rich.console import Console as _RC + _buf = _SIO() + _c = _RC( + file=_buf, force_terminal=True, color_system="truecolor", + highlight=False, width=200, + ) + _echo_t = Text() + _echo_t.append("❯ ", style=BEE_DIM) + _echo_t.append(line, style=BEE_DIM) + _c.print(_echo_t, end="") + _echo_fragments = list(_tft(_ANSI(_buf.getvalue()))) + scrollback.insert_line(shell_echo_idx, _echo_fragments) + except Exception: + _echo_to_scrollback(line) + return True + # Tolerate users typing `scrapingbee ...` out of muscle memory. 
if line.lower().startswith("scrapingbee "): line = line[len("scrapingbee "):].strip() @@ -2323,6 +2940,29 @@ def _execute(line: str) -> bool: err_console.print(f" [bold {BEE_RED}]unknown:[/] {cmd_name}") return True + # Bare ``auth`` in the REPL (no flags) is best served by flipping + # the bottom prompt into first-run mode instead of routing through + # ``run_in_terminal`` — the suspend/resume cycle to read a key in + # the bare terminal feels jarring, and the masked in-place prompt + # is the same flow the user just learned at startup. Variants + # like ``auth --api-key KEY`` or ``auth --unsafe`` still go + # through click normally. + if cmd_name == "auth" and len(args) == 1: + _echo_to_scrollback(original_line) + _first_run_needs_key[0] = True + try: + input_buffer.reset() + except Exception: + pass + err_console.print( + f" [{BEE_DIM}]Enter your API key below.[/]" + ) + try: + app.invalidate() + except Exception: + pass + return True + args = state.apply_settings_to_args(args) # Mark the scrollback position where this command's output will @@ -2364,10 +3004,14 @@ def _run() -> None: except click.ClickException as e: e.show() status_ref[0] = "fail" - except KeyboardInterrupt: - # Ctrl+C while running — the keybinding injected this into us - # via PyThreadState_SetAsyncExc. Surface it as a deliberate - # stop in the footer rather than a generic failure. + except (KeyboardInterrupt, _asyncio_mod.CancelledError): + # Ctrl+C while running — the keybinding either cancelled + # our asyncio tasks (CancelledError propagates out of the + # await chain) or injected KeyboardInterrupt via + # PyThreadState_SetAsyncExc. Either way surface it as a + # deliberate stop in the footer rather than a generic + # failure. (CancelledError is a BaseException since + # Python 3.8 and won't be caught by ``except Exception``.) 
err_console.print(f" [{BEE_DIM}]stopped[/]") status_ref[0] = "stopped" except SystemExit as e: @@ -2442,6 +3086,26 @@ def _apply_post_cmd_state() -> None: state.max_concurrency = None state.current_concurrency = None state.last_usage_refresh_mono = None + # Flip back into first-run mode in place — the prompt + # transitions to ``API key: `` and the input is masked + # so the user can paste a new key without re-running + # ``auth`` (which would suspend the REPL via + # ``run_in_terminal`` and feel jarring). + _first_run_needs_key[0] = True + err_console.print( + f" [{BEE_DIM}]Enter a new API key to continue, or " + f"[bold {BEE_YELLOW}]:q[/][{BEE_DIM}] to exit.[/]" + ) + # Clear the input buffer only on success — failed or + # cancelled commands leave the line in place so the user + # can edit and re-run without re-typing. Buffer mutations + # have to run on the main thread (this callback is + # already marshalled there via call_soon_threadsafe). + if status_ref[0] == "ok": + try: + input_buffer.reset() + except Exception: + pass try: app.invalidate() except Exception: @@ -2540,41 +3204,81 @@ def _submit(event): # than starting fresh from the most recent command. input_buffer.reset() return + # First-run API key entry path — text in the buffer is the raw key + # the user just pasted. Validate against /usage and, on success, + # persist + transition to normal command mode in place. + if _first_run_needs_key[0]: + input_buffer.reset() + _handle_first_run_key(stripped, text) + return if stripped.lower() in _QUIT_TOKENS: input_buffer.reset() event.app.exit() return # Persist the submitted line into the FileHistory before we kick off - # execution. We do this manually (rather than letting prompt_toolkit - # do it via Buffer.validate_and_handle) because our custom Enter - # binding bypasses that path. The default up/down arrow bindings on - # Buffer pull from this same history, so commands the user runs - # become navigable on the next prompt. + # execution. 
``append_string`` is the right call (not + # ``store_string``): the latter only writes to disk, leaving the + # in-memory ``_loaded_strings`` stale, so newly-submitted commands + # don't show up on the next Up press until the REPL restarts and + # reloads from disk. ``append_string`` does both. if history is not None: try: - history.store_string(stripped) + history.append_string(stripped) except Exception: pass - # Clear the buffer only after a successful parse — _execute returns - # False for shlex errors so the user can fix their unclosed quote - # in-place instead of having to retype the whole line. - # We use ``reset()`` (not ``set_document``) so the - # history-navigation cursor is reset; otherwise a subsequent Up - # press would continue browsing from the prior position instead - # of starting at the newest entry. - if _execute(stripped): - input_buffer.reset() + # Don't clear the buffer here — we want the typed command to + # stay visible if it fails or is cancelled (Ctrl+C), so the user + # can edit and retry without re-typing. ``_finish`` clears it + # only when the command succeeded. Shlex parse errors return + # False from ``_execute`` and the text stays in place naturally. + _execute(stripped) @kb.add("c-c") def _ctrl_c(event): # If a worker thread is running, Ctrl+C stops that command rather - # than exiting the REPL. Uses PyThreadState_SetAsyncExc to inject - # KeyboardInterrupt into the worker — the inner _run catches it and - # surfaces a "stopped" footer. This is the documented mechanism for - # interrupting a misbehaving thread; for in-flight HTTP the - # exception fires when the request returns, which is acceptable. + # than exiting the REPL. We try two mechanisms in parallel: + # + # 1. Cancel all tasks on the worker's asyncio loop via + # ``call_soon_threadsafe``. This wakes the selector + # immediately and raises ``CancelledError`` on the in-flight + # await (e.g. an aiohttp request blocked on socket recv). 
+ # This is the only thing that produces a *fast* stop for + # network commands — without it, a long ScrapingBee request + # would hold the worker until it returns naturally. + # + # 2. Inject ``KeyboardInterrupt`` into the worker thread via + # ``PyThreadState_SetAsyncExc``. Fires at the next Python + # bytecode boundary; covers commands that aren't currently + # blocked in asyncio (sync post-processing, slow loops, ...). worker = current_worker[0] if state.is_running and worker is not None and worker.is_alive(): + loop = _active_worker_loop[0] + if loop is not None: + def _cancel_all_tasks() -> None: + try: + for task in _asyncio_mod.all_tasks(loop): + if not task.done(): + task.cancel() + except Exception: + pass + try: + loop.call_soon_threadsafe(_cancel_all_tasks) + except Exception: + pass + + # If a ``!shell`` command is running, terminate the subprocess + # directly — the worker thread is blocked in a C-level read() + # on the child's stdout pipe, so a Python-level + # KeyboardInterrupt won't fire until the read returns. + # ``terminate()`` sends SIGTERM; closing the pipe also frees + # the readline() loop. + proc = current_subprocess[0] + if proc is not None: + try: + proc.terminate() + except Exception: + pass + import ctypes tid = worker.ident @@ -2609,24 +3313,62 @@ def _ctrl_d(event): if not input_buffer.text: event.app.exit() - @kb.add("tab", filter=~has_completions) + # Right arrow / End accept the ghost-text suggestion. We're using + # ``Application`` directly (not ``PromptSession``), so the default + # ``load_auto_suggest_bindings`` are NOT in the merged binding set — + # without these, the ghost text appears but no key consumes it. + # (Ctrl-F is intentionally NOT bound — it would be redundant with Right + # arrow and a small minority of users expect it to mean "find".) 
+ @Condition + def _suggestion_at_eol() -> bool: + try: + buf = input_buffer + return ( + buf.suggestion is not None + and len(buf.suggestion.text) > 0 + and buf.document.is_cursor_at_the_end + ) + except Exception: + return False + + def _do_accept_suggestion(event): + buf = event.current_buffer + sug = buf.suggestion + if sug: + buf.insert_text(sug.text) + + kb.add("right", filter=_suggestion_at_eol, eager=True)(_do_accept_suggestion) + kb.add("end", filter=_suggestion_at_eol, eager=True)(_do_accept_suggestion) + kb.add( + "tab", + filter=~has_completions & _suggestion_at_eol, + eager=True, + )(_do_accept_suggestion) + + _not_first_run = Condition(lambda: not _first_run_needs_key[0]) + + @kb.add("tab", filter=~has_completions & ~_suggestion_at_eol & _not_first_run) def _tab_open(event): - # Tab on an EMPTY input → toggle Scroll/Select mode (no need for - # completions when there's nothing to complete). Tab while typing - # opens completions as before. - if not input_buffer.text: - _toggle_mouse_mode(event) - return + # Tab opens the completion popup when no ghost suggestion is + # visible. Shift+Tab is the mode toggle. Suppressed during the + # first-run API key prompt — command-name completions are + # irrelevant there. event.current_buffer.start_completion(select_first=False) @kb.add("tab", filter=has_completions) def _tab_next(event): event.current_buffer.complete_next() + # Shift+Tab — when the completion popup is open, navigate backwards; + # when it's not, toggle Scroll ↔ Select mouse mode. 
@kb.add("s-tab", filter=has_completions) - def _shift_tab(event): + def _shift_tab_in_completions(event): event.current_buffer.complete_previous() + @kb.add("s-tab", filter=~has_completions) + def _shift_tab_toggle_mode(event): + _toggle_mouse_mode(event) + @kb.add("escape", filter=has_completions, eager=True) def _esc(event): event.current_buffer.cancel_completion() @@ -2648,9 +3390,17 @@ def _history_back(event): # first Up after a submit actually shows the newest entry. try: if len(buf._working_lines) <= 1: + # ``get_strings()`` returns newest-first. prompt_toolkit's + # built-in ``_load_history`` calls ``appendleft`` for each + # yielded item in that order — newest gets pushed left + # FIRST, ending up closest to the current-edit slot at the + # right. Walking Up then visits newest before older. We + # mirror that exact order here so the first Up after a + # submit lands on the freshly-submitted command, not the + # oldest entry on disk. strings = list(buf.history.get_strings()) if strings: - for s in reversed(strings): + for s in strings: buf._working_lines.appendleft(s) buf.working_index = len(buf._working_lines) - 1 elif not buf.text and buf.working_index != len(buf._working_lines) - 1: @@ -2785,6 +3535,8 @@ def _toggle_mouse_mode(_event): async def _ticker(): import asyncio + from .theme import has_progress_state, tick_progress_render + idle_counter = 0 # Track terminal width and trigger a fresh invalidate on resize. # No manual resize-detection needed any more — in full_screen @@ -2795,6 +3547,19 @@ async def _ticker(): while True: await asyncio.sleep(0.1) + # Re-render the honeycomb progress widget while a batch is in + # flight so the boundary hex shimmers between completion + # events. ``tick_progress_render`` is a no-op when no batch + # state is set, so the cost is negligible when idle. 
+ if has_progress_state(): + try: + tick_progress_render() + except Exception: + pass + try: + app.invalidate() + except Exception: + pass if state.is_running: state.tick += 1 try: diff --git a/src/scrapingbee_cli/theme.py b/src/scrapingbee_cli/theme.py index 12fdaf4..a42cb88 100644 --- a/src/scrapingbee_cli/theme.py +++ b/src/scrapingbee_cli/theme.py @@ -71,6 +71,110 @@ def is_repl_mode() -> bool: return _repl_mode +# -- Multi-line progress renderer hook --------------------------------------- +# The REPL installs a renderer here at startup that knows how to replace +# the last N lines of its virtual scrollback in place. Batch operations +# call ``emit_progress_lines`` to update the honeycomb progress bar — +# in REPL mode it overwrites the previous frame; outside the REPL it +# falls back to printing the lines normally. + +_progress_renderer = None # type: ignore[var-annotated] + + +def set_progress_renderer(fn) -> None: + """Install a function ``fn(lines)`` where ``lines`` is a list of + ANSI-rendered strings. Called by the REPL to wire up in-place updates. + """ + global _progress_renderer # noqa: PLW0603 + _progress_renderer = fn + + +def emit_progress_lines(lines: list[str]) -> None: + """Emit a multi-line progress update. In REPL mode this overwrites + the previous frame; otherwise it falls back to writing to stderr. + ``lines`` is a list of already-rendered ANSI strings (one per row, + no trailing newlines). + """ + if _progress_renderer is not None: + try: + _progress_renderer(lines) + return + except Exception: + pass + # Fallback: plain stderr append. + for line in lines: + sys.stderr.write(line + "\n") + sys.stderr.flush() + + +# -- Shared progress state for the REPL ticker animation --------------------- +# batch.py calls ``update_progress_state`` on each completion to record +# latest counts/rates. 
The REPL ticker calls ``tick_progress_render`` at +# ~10 Hz so the in-progress (boundary) hex shimmers between frames even +# when no new completion has fired. ``clear_progress_state`` is called +# when the batch finishes so the ticker stops re-rendering. + +_progress_state: dict | None = None + + +def update_progress_state( + completed: int, + total: int, + *, + rps: float | None = None, + eta: str | None = None, + failure_pct: float | None = None, +) -> None: + global _progress_state # noqa: PLW0603 + _progress_state = { + "completed": completed, + "total": total, + "rps": rps, + "eta": eta, + "failure_pct": failure_pct, + } + tick_progress_render() + + +def clear_progress_state() -> None: + global _progress_state # noqa: PLW0603 + _progress_state = None + + +def has_progress_state() -> bool: + return _progress_state is not None + + +def tick_progress_render() -> None: + """Re-render the progress widget with the latest state. Safe to call + when no batch is in progress (becomes a no-op). The shimmer phase + is derived from ``time.monotonic()`` inside ``format_honeycomb_grid``. + """ + if _progress_state is None: + return + rows = format_honeycomb_grid( + completed=_progress_state["completed"], + total=_progress_state["total"], + rps=_progress_state["rps"], + eta=_progress_state["eta"], + failure_pct=_progress_state["failure_pct"], + animate=True, + ) + import io + from rich.console import Console as _RC + + rendered: list[str] = [] + for row in rows: + buf = io.StringIO() + _c = _RC( + file=buf, force_terminal=True, color_system="truecolor", + highlight=False, width=200, + ) + _c.print(row, end="") + rendered.append(buf.getvalue()) + emit_progress_lines(rendered) + + # -- Single-line bee frames -------------------------------------------------- # Each frame is a tuple of (segment, style) pairs rendered inline. 
@@ -462,20 +566,25 @@ def __exit__(self, *_: object) -> None: def format_honeycomb_meter(used: int, total: int) -> Text: - """Render a honeycomb-style credit meter. ⬡ = used, ⬢ = remaining.""" + """Render a honeycomb-style credit meter. + + Filled hex (⬢) = remaining credits (ScrapingBee brand yellow). + Outline hex (⬡) = used / consumed (dim grey). + Intuitive "fuel gauge" semantics — yellow shows what you have left. + """ width = 20 if total <= 0: pct = 0.0 else: pct = (total - used) / total remaining = total - used - filled = int(width * pct) # remaining portion (yellow) - empty = width - filled # used portion (dim) + filled = int(width * pct) # remaining portion (yellow, filled hex) + empty = width - filled # used portion (dim, outline hex) text = Text() text.append(" ") - text.append("⬡" * filled, style=f"bold {BEE_YELLOW}") - text.append("⬢" * empty, style="dim") + text.append("⬢" * filled, style=f"bold {BEE_YELLOW}") + text.append("⬡" * empty, style=f"dim {BEE_YELLOW}") text.append(f" {remaining:,} / {total:,} credits remaining", style="bold white") # Color the percentage based on health @@ -560,6 +669,89 @@ def print_completion_summary( # -- Honeycomb trail progress ------------------------------------------------ +def format_honeycomb_grid( + completed: int, + total: int, + *, + rps: float | None = None, + eta: str | None = None, + failure_pct: float | None = None, + animate: bool = False, +) -> list[Text]: + """3-row honeycomb progress bar for batch operations. + + Filled hex (⬢) = completed (ScrapingBee brand yellow, bold). + Outline hex (⬡) = remaining (brand yellow, dim — still brand-colored, + just lower-emphasis so the difference reads visually). Cells fill in + row order, left to right. + + Row layout (offset to look like a honeycomb): + Row 0: ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ + Row 1: ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ + Row 2: ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ ⬢ + + Returns a list of three Text objects, one per row. 
The third row also + carries the ``X/Y N req/s ETA …`` stats trailing the cells. + """ + # Single row of hexes — the terminal's line-height made a 3-row stack + # feel visually disconnected, and the user preferred a tighter + # single-line look. The multi-line plumbing (``replace_last_n_lines``, + # the ticker shimmer, the progress-state hook) is kept intact because + # it costs nothing and the single line is just ``n=1``. + width = 20 + if total <= 0: + filled = 0 + else: + filled = int(width * completed / total) + filled = min(filled, width) + + filled_style = f"bold {BEE_YELLOW}" + outline_style = f"dim {BEE_YELLOW}" + + # Boundary cell shimmer: the next-to-be-filled cell pulses between a + # mid-bright and a soft yellow so the user can see the batch is alive + # even when no completion has fired in the last few ms. Only active + # when ``animate=True`` (the REPL ticker passes that) and only when + # there is a still-empty cell at the front of the bar. + shimmer_styles: list[str] = [] + if animate and filled < width: + import math + import time as _time + + # 1.2 Hz pulse — slow enough to read, fast enough to feel alive. + phase = 0.5 + 0.5 * math.sin(_time.monotonic() * 2 * math.pi * 1.2) + if phase > 0.55: + shimmer_styles.append(f"bold {BEE_YELLOW}") + else: + shimmer_styles.append(f"{BEE_YELLOW}") + + def _render_row(row_text: Text) -> None: + if filled > 0: + row_text.append("⬢" * filled, style=filled_style) + if filled < width: + if shimmer_styles: + # First empty cell uses the shimmer style; the rest are + # the regular dim-yellow outline. + row_text.append("⬡", style=shimmer_styles[0]) + if (width - filled) > 1: + row_text.append("⬡" * (width - filled - 1), style=outline_style) + else: + row_text.append("⬡" * (width - filled), style=outline_style) + + row_text = Text() + row_text.append(" ") + _render_row(row_text) + # Stats trail directly off the single row. 
+ row_text.append(f" {completed}/{total}", style="bold white") + if rps is not None: + row_text.append(f" {rps:.1f} req/s", style="dim") + if eta is not None: + row_text.append(f" ETA {eta}", style="dim") + if failure_pct is not None and failure_pct > 0: + row_text.append(f" Failures: {failure_pct:.0f}%", style=f"bold {BEE_RED}") + return [row_text] + + def format_honeycomb_trail( completed: int, total: int, @@ -568,7 +760,9 @@ def format_honeycomb_trail( eta: str | None = None, failure_pct: float | None = None, ) -> Text: - """Bee flying across a honeycomb trail: ⬡⬡⬡\\(◉ω◉)/⬢⬢⬢""" + """Backward-compatible single-line variant. New code should use + :func:`format_honeycomb_grid` for the richer 3-row layout. + """ width = 25 if total <= 0: pos = 0 @@ -576,18 +770,11 @@ def format_honeycomb_trail( pos = int(width * completed / total) pos = min(pos, width) - trail_done = "⬡" * pos - trail_left = "⬢" * (width - pos) - - bee_frames = ["\\(◉ω◉)/", "᎑(◉ω◉)᎑", "/(◉ω◉)\\", "᎑(◉ω◉)᎑"] - bee = bee_frames[completed % len(bee_frames)] - text = Text() text.append(" ") - text.append(trail_done, style=f"bold {BEE_YELLOW}") - text.append(bee, style=f"bold {BEE_YELLOW}") - text.append(trail_left, style="dim") - text.append(f" {completed}/{total}", style="bold white") + text.append("⬢" * pos, style=f"bold {BEE_YELLOW}") + text.append("⬡" * (width - pos), style=f"dim {BEE_YELLOW}") + text.append(f" {completed}/{total}", style="bold white") if rps is not None: text.append(f" {rps:.1f} req/s", style="dim") if eta is not None: From 89b32a6a739e01de0a869eaae6efbb77388d9e57 Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Thu, 14 May 2026 00:28:18 +0530 Subject: [PATCH 09/15] feat(repl): swap banner to single-line SCRAPING BEE in ANSI Shadow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the compact smblock SCRAPING + stacked BEE block (10 logo rows total) with a single 6-row "SCRAPING BEE" wordmark in ANSI Shadow — yellow SCRAPING beside 
white BEE, mirroring the brand wordmark. Same letterforms as the legacy logo, just stitched onto one line of text instead of two so the banner takes less vertical space. SCRAPING rows are now padded to a uniform 62-column width so BEE starts at the same column on every row. Without the padding G's natural shape leaves a trailing space on rows 1, 2, 6 only — that shifted BEE one column right on rows 3, 4, 5 and the bottom of B / last E read as misaligned. --- src/scrapingbee_cli/interactive.py | 72 +++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 20 deletions(-) diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index 9c6d994..8b59172 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -1373,23 +1373,46 @@ def _escape_menu(event): # "ScrapingBee" rendered in the figlet ``smblock`` font — 4 rows × 32 cols, # roughly the same width as the "Web scraping from the terminal" tagline. # Same block-letter style as the old 6-row logo, just compact. -_SCRAPINGBEE_LOGO = [ - " ▞▀▖ ▗ ▛▀▖ ", - " ▚▄ ▞▀▖▙▀▖▝▀▖▛▀▖▄ ▛▀▖▞▀▌▙▄▘▞▀▖▞▀▖", - " ▖ ▌▌ ▖▌ ▞▀▌▙▄▘▐ ▌ ▌▚▄▌▌ ▌▛▀ ▛▀ ", - " ▝▀ ▝▀ ▘ ▝▀▘▌ ▀▘▘ ▘▗▄▘▀▀ ▝▀▘▝▀▘", +# ANSI Shadow letters for "SCRAPING" and "BEE", kept as separate halves +# so each can carry its own colour (yellow + white, matching the brand +# wordmark) when stitched together at render time. +# +# Note on widths: the rightmost letter ``G`` has a natural 1-column +# narrower silhouette on its top and bottom rows (its shape leaves a +# trailing space on rows 1, 2, 6 but extends to a full ``╗``/``║``/``╝`` +# on rows 3, 4, 5). Without explicit padding, that imbalance shifts +# BEE one column right on the middle rows when we concat them, which +# reads as a misaligned bottom-left/last-bottom-right on the BEE side. +# Each row below is normalised to the same width with a trailing space +# where the font naturally has one. 
+_SCRAPING_LETTERS = [ + "███████╗ ██████╗██████╗ █████╗ ██████╗ ██╗███╗ ██╗ ██████╗ ", + "██╔════╝██╔════╝██╔══██╗██╔══██╗██╔══██╗██║████╗ ██║██╔════╝ ", + "███████╗██║ ██████╔╝███████║██████╔╝██║██╔██╗ ██║██║ ███╗", + "╚════██║██║ ██╔══██╗██╔══██║██╔═══╝ ██║██║╚██╗██║██║ ██║", + "███████║╚██████╗██║ ██║██║ ██║██║ ██║██║ ╚████║╚██████╔╝", + "╚══════╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═══╝ ╚═════╝ ", +] +_BEE_LETTERS = [ + "██████╗ ███████╗███████╗", + "██╔══██╗██╔════╝██╔════╝", + "██████╔╝█████╗ █████╗ ", + "██╔══██╗██╔══╝ ██╔══╝ ", + "██████╔╝███████╗███████╗", + "╚═════╝ ╚══════╝╚══════╝", ] -# Legacy 6-row logos kept around in case we want to swap back later or -# use them elsewhere (e.g. a one-shot welcome screen). The pinned REPL -# banner uses the compact form above. -_BEE_LOGO = [ - " ██████╗ ███████╗███████╗", - " ██╔══██╗██╔════╝██╔════╝", - " ██████╔╝█████╗ █████╗ ", - " ██╔══██╗██╔══╝ ██╔══╝ ", - " ██████╔╝███████╗███████╗", - " ╚═════╝ ╚══════╝╚══════╝", +# Combined "SCRAPING BEE" wordmark on a single row of letterforms — 6 +# lines tall, ~90 cols wide. Replaces the prior 4-row smblock SCRAPING +# + 6-row BEE stack (10 logo rows) with this single 6-row version. +_SCRAPINGBEE_LOGO = [ + " " + s + " " + b for s, b in zip(_SCRAPING_LETTERS, _BEE_LETTERS) ] +# Column at which "BEE" begins inside each combined row, used by the +# pinned banner renderer to split the row into a yellow "SCRAPING" half +# and a white "BEE" half. +_BEE_OFFSET = 2 + len(_SCRAPING_LETTERS[0]) + 2 +# Legacy alias kept so any external callers still resolve. +_BEE_LOGO = _BEE_LETTERS def _render_banner(version: str) -> str: @@ -1416,10 +1439,15 @@ def _render_banner(version: str) -> str: width=200, # don't wrap the wide ASCII logo ) c.print() + # Each combined row is " ". Split + # at the known offset so the yellow/white wordmark colours mirror + # the brand mark (SCRAPING yellow, BEE white). 
for line in _SCRAPINGBEE_LOGO: - c.print(f"[bold {BEE_YELLOW}]{line}[/]") - for line in _BEE_LOGO: - c.print(f"[bold white]{line}[/]") + left = line[:_BEE_OFFSET] + right = line[_BEE_OFFSET:] + c.print( + f"[bold {BEE_YELLOW}]{left}[/][bold white]{right}[/]" + ) c.print() # Version c.print(f" [bold {BEE_YELLOW}]v{version}[/]") @@ -2488,11 +2516,15 @@ def mouse_handler(self, mouse_event): def _banner_render() -> list[tuple[str, str]]: out: list[tuple[str, str]] = [] - # SCRAPING logo in brand yellow. + # SCRAPING half in brand yellow, BEE half in white — matches the + # wordmark in the official brand assets. for i, logo_line in enumerate(_SCRAPINGBEE_LOGO): if i > 0: out.append(("", "\n")) - out.append((f"bold {BEE_YELLOW}", logo_line)) + left = logo_line[:_BEE_OFFSET] + right = logo_line[_BEE_OFFSET:] + out.append((f"bold {BEE_YELLOW}", left)) + out.append(("bold white", right)) # Spacer row out.append(("", "\n")) # v1.4.1 From d0788088ff85db4901cef4fb2600398d62d8f9ac Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Thu, 14 May 2026 00:42:45 +0530 Subject: [PATCH 10/15] feat(repl): request a usable terminal size at startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Best-effort XTERM Window Manipulation ("CSI 8 ; H ; W t") to bump the window to 100 cols × 30 rows when the current size is below that. Fits the 90-col banner with room for the toolbar + input. Only fires when the window is actually too small, so users on a large terminal aren't disrupted. Apple Terminal.app and SSH / tmux sessions ignore the sequence and the REPL silently proceeds. 
--- src/scrapingbee_cli/interactive.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index 8b59172..30dd292 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -2244,6 +2244,27 @@ def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): sys.stdout.write("\033]10;#EAEAEA\007") sys.stdout.flush() + # ── Request a usable terminal size (best-effort) ──────────────────────── + # The banner is 90 cols wide; with margins + input + toolbar the REPL + # really wants ~100 cols × ~30 rows. XTERM Window Manipulation + # sequence "CSI 8 ; H ; W t" asks the terminal to resize itself to + # the given rows/cols. Honoured by xterm (with allowWindowOps), + # iTerm2, kitty, alacritty, WezTerm, Windows Terminal, GNOME + # Terminal. macOS Terminal.app and SSH/tmux sessions ignore it — + # we silently accept whatever size we end up with. Only fires when + # the current size is below the target so a user who's already on a + # large window isn't disrupted. + try: + _cur_cols, _cur_rows = shutil.get_terminal_size((80, 24)) + _MIN_COLS, _MIN_ROWS = 100, 30 + if _cur_cols < _MIN_COLS or _cur_rows < _MIN_ROWS: + _new_cols = max(_cur_cols, _MIN_COLS) + _new_rows = max(_cur_rows, _MIN_ROWS) + sys.stdout.write(f"\033[8;{_new_rows};{_new_cols}t") + sys.stdout.flush() + except Exception: + pass + # Create the virtual scrollback buffer and seed it with the banner. # In full_screen mode we own the alt buffer entirely. 
The banner is # rendered as a FIXED Window at the top of the layout (not pushed into From b2fcb77c71a4fb9e87d08409a04a3c3f3c43125f Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Thu, 14 May 2026 00:42:58 +0530 Subject: [PATCH 11/15] chore: bump version to 1.5.0 Release covers the REPL overhaul series: in-place API key prompt, !shell exec, fast Ctrl+C cancellation, honeycomb batch progress with shimmering boundary cell, single-line SCRAPING BEE banner, terminal auto-resize at startup, and assorted polish. --- pyproject.toml | 2 +- src/scrapingbee_cli/__init__.py | 4 ++-- src/scrapingbee_cli/interactive.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index eaca5b7..4380313 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "scrapingbee-cli" -version = "1.4.1" +version = "1.5.0" description = "Command-line client for the ScrapingBee API: scrape pages (single or batch), crawl sites, check usage/credits, and use Google Search, Fast Search, Amazon, Walmart, YouTube, and ChatGPT from the terminal." 
readme = "README.md" license = "MIT" diff --git a/src/scrapingbee_cli/__init__.py b/src/scrapingbee_cli/__init__.py index dc7d57e..9ba2602 100644 --- a/src/scrapingbee_cli/__init__.py +++ b/src/scrapingbee_cli/__init__.py @@ -3,7 +3,7 @@ import platform import sys -__version__ = "1.4.1" +__version__ = "1.5.0" def user_agent_headers() -> dict[str, str]: @@ -12,7 +12,7 @@ def user_agent_headers() -> dict[str, str]: Returns a dict of headers: User-Agent: ScrapingBee/CLI User-Agent-Client: scrapingbee-cli - User-Agent-Client-Version: 1.4.1 + User-Agent-Client-Version: 1.5.0 User-Agent-Environment: python User-Agent-Environment-Version: 3.14.2 User-Agent-OS: Darwin arm64 diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index 30dd292..10406b1 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -2548,7 +2548,7 @@ def _banner_render() -> list[tuple[str, str]]: out.append(("bold white", right)) # Spacer row out.append(("", "\n")) - # v1.4.1 + # v1.5.0 out.append(("", "\n")) out.append((f"bold {BEE_YELLOW}", f" v{version}")) # Tagline From 269264b15d24024e74a58fdf1bc4be27752a457e Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Wed, 20 May 2026 09:09:22 +0530 Subject: [PATCH 12/15] feat(repl + crawl): pool-based screenshot crawl, REPL UX polish, bool-flag consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Non-REPL changes (affect `scrapingbee crawl` and the CLI outside the REPL too): crawl.py — screenshot crawl actually produces N files for --max-pages N Pristine v1.4.1 with `--max-pages 5 --screenshot-full-page true` saved only 1 PNG. Side-by-side test against pristine confirmed three stacked upstream/spider bugs: 1. `_requires_discovery_phase` only checked `screenshot`, missing `screenshot_full_page` and `screenshot_selector`. 
Those modes silently fell into the same-mode `parse()` path that runs link extraction on PNG bytes — yielding garbage URLs that crashed on dispatch. 2. `scrapy_scrapingbee`'s default errback calls `response.text` on binary 500 responses → `AttributeError` → killed the spider. Every `ScrapingBeeRequest` is now wired to our `_on_request_error` which logs the URL and continues. 3. The scheduler's LIFO ordering popped follow-discovery requests before the save requests yielded alongside them. With ~100 follow URLs per page, saves were never dequeued before `CLOSESPIDER_PAGECOUNT` bailed. Fix: `priority=10` on save requests + raise `CloseSpider` from `_push_saved_status` when `_save_count >= max_pages`, so the engine drops the rest of the queue immediately. crawl.py — pool-based discovery (binary / extract modes) Old flow paid one HTML discovery + one save per saved page ≈ 2× credits. New flow accumulates URLs into `_save_queue` while discovering; once the pool reaches `max_pages` we flip `_discovery_done`, dispatch one save per pooled URL in priority order, and stop discovering. For a `--max-pages 100 --screenshot-full-page true` run on a link-rich site that previously cost ~1000 credits, this is closer to ~510. A `spider_idle` handler flushes the pool when the site is smaller than the cap so small-site crawls still produce output. crawl.py — `--max-pages N` now means N SAVED pages Replaces the older `_fetch_count` cap with `_save_count` + `_save_pending`. `--max-pages N` previously could stop early when discovery requests counted against the cap; now it counts only successful saves and matches the help text. Includes save-failure backfill from the queue so flaky 5xx errors don't silently shrink the user's effective budget. crawl.py — `errback` + non-printable URL filter on every yielded request `scrapy_scrapingbee`'s default errback is the binary-500 landmine above. Our `_on_request_error` is now attached to every `ScrapingBeeRequest` in the spider. 
Additionally, links whose decoded path/query contains non-ASCII bytes (common when discovery extracts hrefs from a corrupted PNG response on crawler-test.com fixtures) are dropped at iteration time so they can't trip the upstream errback in the first place. commands/crawl.py — concurrency-warning shows the actual reason Was: `Warning: could not check plan concurrency. Defaulting to 1…` Now: `Warning: could not check plan concurrency (HTTP 429). Defaulting…` The `/usage` endpoint is rate-limited; without the reason, users couldn't distinguish a transient 429 from a real auth/network problem and would default to concurrency=1 unnecessarily. cli.py + cli_utils.py — `--flag true|false` accepted for every bool flag Scraping-side options (`--render-js true`, `--premium-proxy true`, …) already took explicit `true`/`false` while Click flags (`--verbose`, `--resume`, `--escalate-proxy`, etc.) were bare-only — inconsistent UX. An argv preprocessor (`normalize_bool_flag_args`) collects all `is_flag=True` option names from the click tree and rewrites `--verbose true` → `--verbose`, `--verbose false` → (dropped, default applies). Bare `--verbose` still works. Applied at both `cli.main()` entry and REPL dispatch so behaviour matches everywhere. REPL (new interactive mode — large but mostly self-contained): interactive.py / theme.py / batch.py / commands/*.py - Full-screen alt-buffer with banner, fixed status widget, virtual scrollback, bottom toolbar with live credit honeycomb. - Subprocess-per-crawl: Twisted's reactor is a process singleton and can't be reused; running crawls in a child process lets the REPL handle multiple consecutive crawls per session. - Crawl + batch share a unified fixed widget that shows banner-compact + honeycomb progress + URL line (crawl only). No more honeycomb rows leaking into scrollback. - Click-to-open paths: existing paths in scrollback are underlined brand-yellow; click opens in Finder / xdg-open / os.startfile. 
Detection handles paths with spaces, `:line:col` suffixes, and rejects URL `://path` false positives. - `:view` pager pretty-prints JSON (existing) and HTML (new via lxml); `r` toggles raw. - Multi-line paste preview: bracketed paste with newlines puts the pasted lines in a multi-line editable buffer (Up/Down navigate lines, Ctrl+J / Alt+Enter insert newline). Enter submits all, queueing rest via `_pending_commands`. Esc / Ctrl+C clear. - Tab completion: single-match inline-completes (bash-style), multi-match opens popup, ghost-text-word fallback when nothing to complete. Right accepts the next word of the ghost suggestion; End accepts the whole ghost suggestion. - Ctrl+C escalation: first press sends SIGTERM (graceful), second within 2 s sends SIGKILL — useful when Twisted is parked in a long screenshot fetch and SIGTERM lags. - Ctrl+R / Ctrl+S explicitly disabled (their default reverse-i-search writes into a hidden buffer we don't render — typing went to a black hole). - `auth --unsafe` intercepted in REPL with a "run outside" message; its multi-step disclaimer + masked-getpass fights our termios. - Bee facts list audited (9 corrected — Einstein quote, honey-as-sustenance myth, etc.) and rotation starts with a verb so quick commands don't flash trivia. commands/amazon.py, chatgpt.py, fast_search.py, google.py, scrape.py, usage.py, walmart.py, youtube.py — no net behavioural change vs main. The `LiveCreditTracker` / `MiniBeeSpinner` wrappers that were added earlier in the REPL branch have been removed (they were dead code), leaving only REPL-gated paths (`if is_repl_mode()` branches).
--- src/scrapingbee_cli/batch.py | 84 +- src/scrapingbee_cli/cli.py | 10 + src/scrapingbee_cli/cli_utils.py | 51 + src/scrapingbee_cli/commands/amazon.py | 80 +- src/scrapingbee_cli/commands/chatgpt.py | 23 +- src/scrapingbee_cli/commands/crawl.py | 71 +- src/scrapingbee_cli/commands/fast_search.py | 23 +- src/scrapingbee_cli/commands/google.py | 35 +- src/scrapingbee_cli/commands/scrape.py | 34 +- src/scrapingbee_cli/commands/usage.py | 43 +- src/scrapingbee_cli/commands/walmart.py | 72 +- src/scrapingbee_cli/commands/youtube.py | 62 +- src/scrapingbee_cli/crawl.py | 807 ++++++++-- src/scrapingbee_cli/interactive.py | 1582 ++++++++++++++++--- src/scrapingbee_cli/theme.py | 701 ++++---- 15 files changed, 2792 insertions(+), 886 deletions(-) diff --git a/src/scrapingbee_cli/batch.py b/src/scrapingbee_cli/batch.py index 6883a3c..21d27ca 100644 --- a/src/scrapingbee_cli/batch.py +++ b/src/scrapingbee_cli/batch.py @@ -19,7 +19,6 @@ from .client import Client, parse_usage from .config import BASE_URL, get_api_key from .theme import ( - LiveCreditTracker, echo_warning, err_console, format_honeycomb_trail, @@ -403,18 +402,36 @@ def _release_usage_lock(lf: object) -> None: def get_batch_usage(api_key_flag: str | None) -> dict: - """Return usage info (max_concurrency, credits) from a live API call. + """Return usage info (max_concurrency, credits). - When SCRAPINGBEE_USAGE_CACHE=1 is set (test environments only), the file - cache is used to avoid 429 errors from repeated calls in the same session. + Inside the REPL the file cache (12 s TTL) is consulted first so the + several REPL-side callers (background refresher, batch / crawl + pre-flight) share a single live call per window and stay under the + ``/usage`` rate limit. + + Direct CLI invocations (``scrapingbee crawl ...`` outside the REPL) + keep their original behaviour: a live call every time, unless the + legacy ``SCRAPINGBEE_USAGE_CACHE=1`` test escape hatch is set. 
""" key = get_api_key(api_key_flag) - if os.environ.get("SCRAPINGBEE_USAGE_CACHE") == "1": + try: + from .theme import is_repl_mode + _in_repl = is_repl_mode() + except Exception: + _in_repl = False + cache_opt_in = ( + _in_repl + or os.environ.get("SCRAPINGBEE_USAGE_CACHE") == "1" + ) + if cache_opt_in: cached = read_usage_file_cache(key) if cached is not None: return cached result = asyncio.run(_fetch_usage_async(key)) - write_usage_file_cache(key, result) + try: + write_usage_file_cache(key, result) + except Exception: + pass return result return asyncio.run(_fetch_usage_async(key)) @@ -549,6 +566,16 @@ async def run_batch_async( completed = 0 failure_count = 0 start_time = time.monotonic() + # Seed the REPL progress widget at 0/total so the user sees the + # honeycomb the moment the batch starts, not after the first item + # finishes. Without this, a slow first request can leave the user + # staring at silence for ~1s before any visual feedback. + if is_repl_mode() and show_progress and total > 0: + try: + from .theme import update_progress_state + update_progress_state(0, total, rps=None, eta=None, failure_pct=None) + except Exception: + pass async def run_one(i: int, inp: str) -> tuple[int, BatchResult]: nonlocal completed, failure_count @@ -1321,31 +1348,26 @@ def run_api_batch( output_file: str | None = None, extract_field: str | None = None, fields: str | None = None, - usage_info: dict | None = None, ) -> None: """Run a batch of single-item API calls and write results.""" - # In REPL mode show live credit updates every 20s during the batch. 
- initial_remaining = usage_info.get("credits") if usage_info else None - initial_total = usage_info.get("max_api_credit") if usage_info else None - with LiveCreditTracker(key, initial_remaining=initial_remaining, total=initial_total): - asyncio.run( - _run_api_batch_async( - key=key, - inputs=inputs, - concurrency=concurrency, - from_user=from_user, - skip_n=skip_n, - output_dir=output_dir, - verbose=verbose, - show_progress=show_progress, - api_call=api_call, - on_complete=on_complete, - output_format=output_format, - post_process=post_process, - update_csv_path=update_csv_path, - input_column=input_column, - output_file=output_file, - extract_field=extract_field, - fields=fields, - ) + asyncio.run( + _run_api_batch_async( + key=key, + inputs=inputs, + concurrency=concurrency, + from_user=from_user, + skip_n=skip_n, + output_dir=output_dir, + verbose=verbose, + show_progress=show_progress, + api_call=api_call, + on_complete=on_complete, + output_format=output_format, + post_process=post_process, + update_csv_path=update_csv_path, + input_column=input_column, + output_file=output_file, + extract_field=extract_field, + fields=fields, ) + ) diff --git a/src/scrapingbee_cli/cli.py b/src/scrapingbee_cli/cli.py index f1bb8c0..4b97aee 100644 --- a/src/scrapingbee_cli/cli.py +++ b/src/scrapingbee_cli/cli.py @@ -191,6 +191,16 @@ def main() -> None: sys.exit(0) _handle_scraping_config() + # Let users write ``--verbose true`` / ``--verbose false`` in + # addition to the bare ``--verbose`` shortcut, so all boolean + # options behave like the scraping-side ones (--render-js, etc.). 
+ try: + from .cli_utils import collect_bool_flag_names, normalize_bool_flag_args + _bool_flags = collect_bool_flag_names(cli) + sys.argv[1:] = normalize_bool_flag_args(sys.argv[1:], _bool_flags) + except Exception: + pass + try: cli.main(standalone_mode=False) except click.ClickException as e: diff --git a/src/scrapingbee_cli/cli_utils.py b/src/scrapingbee_cli/cli_utils.py index bf19dba..60a166f 100644 --- a/src/scrapingbee_cli/cli_utils.py +++ b/src/scrapingbee_cli/cli_utils.py @@ -107,6 +107,57 @@ def _maybe_repl_preview(data: bytes) -> tuple[bytes, str | None, str | None]: return preview.encode("utf-8"), summary, full_path +def collect_bool_flag_names(cli_group: click.Group) -> set[str]: + """Walk a click group + every subcommand and return the set of all + option strings declared as ``is_flag=True``. Used by + ``normalize_bool_flag_args`` to extend bool flags so they ALSO + accept ``true``/``false`` values for consistency with the + scraping-side flags that already take string bools + (``--render-js true`` etc.). 
+ """ + flags: set[str] = set() + try: + for cmd in cli_group.commands.values(): + for p in cmd.params: + if getattr(p, "is_flag", False): + for opt in p.opts: + flags.add(opt) + except Exception: + pass + return flags + + +def normalize_bool_flag_args( + args: list[str], flag_names: set[str] +) -> list[str]: + """Pre-parse boolean flags so they accept an explicit true/false + value in addition to the bare flag form: + ``--verbose true`` → ``--verbose`` (value dropped, flag kept) + ``--verbose false`` → flag dropped entirely (default = False) + ``--verbose`` → unchanged + ``--no-verbose`` → unchanged (Click's own ``--no-x`` form) + """ + _TRUE = {"true", "1", "yes", "on"} + _FALSE = {"false", "0", "no", "off"} + out: list[str] = [] + i = 0 + while i < len(args): + tok = args[i] + if tok in flag_names and i + 1 < len(args): + next_lv = args[i + 1].strip().lower() + if next_lv in _TRUE: + out.append(tok) + i += 2 + continue + if next_lv in _FALSE: + # Skip the flag entirely; default value applies. + i += 2 + continue + out.append(tok) + i += 1 + return out + + class NormalizedChoice(click.Choice): """Choice type that accepts both hyphens and underscores. 
diff --git a/src/scrapingbee_cli/commands/amazon.py b/src/scrapingbee_cli/commands/amazon.py index c4b4cfb..7a01a1c 100644 --- a/src/scrapingbee_cli/commands/amazon.py +++ b/src/scrapingbee_cli/commands/amazon.py @@ -3,7 +3,6 @@ from __future__ import annotations import asyncio -from contextlib import nullcontext import click from click_option_group import optgroup @@ -30,7 +29,6 @@ ) from ..client import Client from ..config import BASE_URL, get_api_key -from ..theme import MiniBeeSpinner, is_repl_mode AMAZON_SORT_BY = [ "most-recent", @@ -149,7 +147,6 @@ async def api_call(client, a): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), - usage_info=usage_info, ) return @@ -158,23 +155,21 @@ async def api_call(client, a): raise SystemExit(1) async def _single() -> None: - _spinner = MiniBeeSpinner("amazon-product") if is_repl_mode() else nullcontext() - with _spinner: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.amazon_product( - asin, - device=device, - domain=domain, - country=country, - zip_code=zip_code, - language=language, - currency=currency, - add_html=parse_bool(add_html), - light_request=parse_bool(light_request), - screenshot=parse_bool(screenshot), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.amazon_product( + asin, + device=device, + domain=domain, + country=country, + zip_code=zip_code, + language=language, + currency=currency, + add_html=parse_bool(add_html), + light_request=parse_bool(light_request), + screenshot=parse_bool(screenshot), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) from ..credits import amazon_credits @@ -324,7 +319,6 @@ async def api_call(client, q): output_file=obj.get("output_file") or None, 
extract_field=obj.get("extract_field"), fields=obj.get("fields"), - usage_info=usage_info, ) return @@ -333,29 +327,27 @@ async def api_call(client, q): raise SystemExit(1) async def _single() -> None: - _spinner = MiniBeeSpinner("amazon-search") if is_repl_mode() else nullcontext() - with _spinner: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.amazon_search( - query, - start_page=start_page, - pages=pages, - sort_by=norm_val(sort_by), - device=device, - domain=domain, - country=country, - zip_code=zip_code, - language=language, - currency=currency, - category_id=category_id, - merchant_id=merchant_id, - autoselect_variant=parse_bool(autoselect_variant), - add_html=parse_bool(add_html), - light_request=parse_bool(light_request), - screenshot=parse_bool(screenshot), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.amazon_search( + query, + start_page=start_page, + pages=pages, + sort_by=norm_val(sort_by), + device=device, + domain=domain, + country=country, + zip_code=zip_code, + language=language, + currency=currency, + category_id=category_id, + merchant_id=merchant_id, + autoselect_variant=parse_bool(autoselect_variant), + add_html=parse_bool(add_html), + light_request=parse_bool(light_request), + screenshot=parse_bool(screenshot), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) from ..credits import amazon_credits diff --git a/src/scrapingbee_cli/commands/chatgpt.py b/src/scrapingbee_cli/commands/chatgpt.py index 351ee8c..7ac63cd 100644 --- a/src/scrapingbee_cli/commands/chatgpt.py +++ b/src/scrapingbee_cli/commands/chatgpt.py @@ -3,7 +3,6 @@ from __future__ import annotations import asyncio -from contextlib import nullcontext import click @@ -25,7 +24,6 @@ ) from ..client import Client from ..config import 
BASE_URL, get_api_key -from ..theme import MiniBeeSpinner, is_repl_mode @click.command() @@ -117,7 +115,6 @@ async def api_call(client, p): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), - usage_info=usage_info, ) return @@ -128,17 +125,15 @@ async def api_call(client, p): prompt_str = " ".join(prompt) async def _single() -> None: - _spinner = MiniBeeSpinner("chatgpt") if is_repl_mode() else nullcontext() - with _spinner: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.chatgpt( - prompt_str, - search=parse_bool(search), - add_html=parse_bool(add_html), - country_code=country_code, - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.chatgpt( + prompt_str, + search=parse_bool(search), + add_html=parse_bool(add_html), + country_code=country_code, + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) write_output( data, diff --git a/src/scrapingbee_cli/commands/crawl.py b/src/scrapingbee_cli/commands/crawl.py index 70681ed..b124869 100644 --- a/src/scrapingbee_cli/commands/crawl.py +++ b/src/scrapingbee_cli/commands/crawl.py @@ -24,7 +24,6 @@ run_project_spider, run_urls_spider, ) -from ..theme import LiveCreditTracker def _crawl_build_params( @@ -438,10 +437,15 @@ def crawl_cmd( concurrency = resolve_batch_concurrency(obj["concurrency"], usage_info, 1) from_concurrency = obj["concurrency"] > 0 plan_concurrency = usage_info.get("max_concurrency") or 0 - except Exception: + except Exception as e: + # The /usage endpoint is rate-limited; bursts of crawl runs can + # trip it. Surface the actual reason so the user can tell apart + # "rate limited, retry in a moment" from real network / auth + # problems. 
+ reason = str(e).strip() or type(e).__name__ click.echo( - "Warning: could not check plan concurrency. Defaulting to 1 concurrent request. " - "Use --concurrency to set explicitly.", + f"Warning: could not check plan concurrency ({reason}). " + "Defaulting to 1 concurrent request. Use --concurrency to set explicitly.", err=True, ) usage_info = None @@ -540,30 +544,43 @@ def crawl_cmd( allowed_list: list[str] | None = None if allowed_domains: allowed_list = [d.strip() for d in allowed_domains.split(",") if d.strip()] - _initial_remaining = usage_info.get("credits") if usage_info else None - _initial_total = usage_info.get("max_api_credit") if usage_info else None try: - with LiveCreditTracker( - key, initial_remaining=_initial_remaining, total=_initial_total - ): - run_urls_spider( - urls, - key, - scrape_params=scrape_params or None, - custom_headers=custom_headers or None, - max_depth=max_depth, - max_pages=max_pages, - concurrency=concurrency, - output_dir=out_dir, - allowed_domains=allowed_list, - allow_external_domains=allow_external_domains, - download_delay=download_delay, - autothrottle_enabled=autothrottle or None, - resume=obj.get("resume", False), - include_pattern=include_pattern, - exclude_pattern=exclude_pattern, - save_pattern=save_pattern, - ) + # ``known_total`` enables a batch-style honeycomb + # progress bar in the REPL widget. Used when the total + # is bounded up front: + # - sitemap mode (--from-sitemap) gives an exact list + # - max_depth=1 stops at the seed URLs themselves + # - --max-pages N caps the crawl, even when + # link-following could otherwise discover more + # For genuinely open-ended crawls (max_pages=0) we fall + # back to a rolling "fetching: " line driven by + # the spider signal handlers. 
+ _kt: int | None = None + if from_sitemap: + _kt = len(urls) + elif max_depth == 1: + _kt = len(urls) + elif max_pages and max_pages > 0: + _kt = max_pages + run_urls_spider( + urls, + key, + scrape_params=scrape_params or None, + custom_headers=custom_headers or None, + max_depth=max_depth, + max_pages=max_pages, + concurrency=concurrency, + output_dir=out_dir, + allowed_domains=allowed_list, + allow_external_domains=allow_external_domains, + download_delay=download_delay, + autothrottle_enabled=autothrottle or None, + resume=obj.get("resume", False), + include_pattern=include_pattern, + exclude_pattern=exclude_pattern, + save_pattern=save_pattern, + known_total=_kt, + ) except ValueError as e: click.echo(str(e), err=True) raise SystemExit(1) diff --git a/src/scrapingbee_cli/commands/fast_search.py b/src/scrapingbee_cli/commands/fast_search.py index 9f2b372..776b340 100644 --- a/src/scrapingbee_cli/commands/fast_search.py +++ b/src/scrapingbee_cli/commands/fast_search.py @@ -3,7 +3,6 @@ from __future__ import annotations import asyncio -from contextlib import nullcontext import click from click_option_group import optgroup @@ -26,7 +25,6 @@ ) from ..client import Client from ..config import BASE_URL, get_api_key -from ..theme import MiniBeeSpinner, is_repl_mode @click.command("fast-search") @@ -110,7 +108,6 @@ async def api_call(client, q): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), - usage_info=usage_info, ) return @@ -119,17 +116,15 @@ async def api_call(client, q): raise SystemExit(1) async def _single() -> None: - _spinner = MiniBeeSpinner("fast-search") if is_repl_mode() else nullcontext() - with _spinner: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.fast_search( - query, - page=page, - country_code=country_code, - language=language, - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + async with Client(key, 
BASE_URL) as client: + data, headers, status_code = await client.fast_search( + query, + page=page, + country_code=country_code, + language=language, + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) from ..credits import fast_search_credits diff --git a/src/scrapingbee_cli/commands/google.py b/src/scrapingbee_cli/commands/google.py index eec7ae1..2ce4c51 100644 --- a/src/scrapingbee_cli/commands/google.py +++ b/src/scrapingbee_cli/commands/google.py @@ -3,7 +3,6 @@ from __future__ import annotations import asyncio -from contextlib import nullcontext import click from click_option_group import optgroup @@ -30,7 +29,6 @@ ) from ..client import Client from ..config import BASE_URL, get_api_key -from ..theme import MiniBeeSpinner, is_repl_mode def _warn_empty_organic(data: bytes, search_type: str | None) -> None: @@ -181,7 +179,6 @@ async def api_call(client, q): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), - usage_info=usage_info, ) return @@ -190,23 +187,21 @@ async def api_call(client, q): raise SystemExit(1) async def _single() -> None: - _spinner = MiniBeeSpinner("google") if is_repl_mode() else nullcontext() - with _spinner: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.google_search( - query, - search_type=norm_val(search_type), - country_code=country_code, - device=device, - page=page, - language=language, - nfpr=parse_bool(nfpr), - extra_params=extra_params, - add_html=parse_bool(add_html), - light_request=parse_bool(light_request), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.google_search( + query, + search_type=norm_val(search_type), + country_code=country_code, + device=device, + page=page, + language=language, + nfpr=parse_bool(nfpr), + 
extra_params=extra_params, + add_html=parse_bool(add_html), + light_request=parse_bool(light_request), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) _warn_empty_organic(data, search_type) from ..credits import google_credits diff --git a/src/scrapingbee_cli/commands/scrape.py b/src/scrapingbee_cli/commands/scrape.py index 38fd818..b19881a 100644 --- a/src/scrapingbee_cli/commands/scrape.py +++ b/src/scrapingbee_cli/commands/scrape.py @@ -5,7 +5,6 @@ import asyncio import json import os -from contextlib import nullcontext import click from click_option_group import optgroup @@ -39,7 +38,7 @@ from ..client import Client, pretty_json from ..config import BASE_URL, get_api_key from ..crawl import _preferred_extension_from_scrape_params -from ..theme import LiveCreditTracker, MiniBeeSpinner, echo_error, is_repl_mode +from ..theme import echo_error, is_repl_mode def _apply_chunking(url: str, data: bytes, chunk_size: int, chunk_overlap: int) -> bytes: @@ -705,10 +704,7 @@ def _ndjson_cb(result): if failed: raise SystemExit(1) - _rem = usage_info.get("credits") if usage_info else None - _tot = usage_info.get("max_api_credit") if usage_info else None - with LiveCreditTracker(key, initial_remaining=_rem, total=_tot): - asyncio.run(_batch()) + asyncio.run(_batch()) return if not url and not scraping_config: @@ -720,20 +716,18 @@ def _ndjson_cb(result): async def _single() -> None: scrape_url = url or "" # empty when using --scraping-config (API uses config's URL) - _spinner = MiniBeeSpinner("scrape") if is_repl_mode() else nullcontext() - with _spinner: - async with Client(key, BASE_URL, timeout=client_timeout) as client: - if escalate_proxy: - data, resp_headers, status_code = await scrape_with_escalation( - client, - scrape_url, - scrape_kwargs, - verbose=obj["verbose"], - ) - else: - data, resp_headers, status_code = await client.scrape( - scrape_url, **scrape_kwargs - ) + async with 
Client(key, BASE_URL, timeout=client_timeout) as client: + if escalate_proxy: + data, resp_headers, status_code = await scrape_with_escalation( + client, + scrape_url, + scrape_kwargs, + verbose=obj["verbose"], + ) + else: + data, resp_headers, status_code = await client.scrape( + scrape_url, **scrape_kwargs + ) if not scrape_kwargs.get("transparent_status_code") and status_code >= 400: if is_repl_mode(): echo_error(f"Error: HTTP {status_code}") diff --git a/src/scrapingbee_cli/commands/usage.py b/src/scrapingbee_cli/commands/usage.py index a358f53..b140b59 100644 --- a/src/scrapingbee_cli/commands/usage.py +++ b/src/scrapingbee_cli/commands/usage.py @@ -4,7 +4,6 @@ import asyncio import json as _json -from contextlib import nullcontext import click @@ -12,7 +11,7 @@ from ..cli_utils import _output_options, store_common_options from ..client import Client, parse_usage, pretty_json from ..config import BASE_URL, get_api_key -from ..theme import MiniBeeSpinner, is_repl_mode +from ..theme import is_repl_mode @click.command() @@ -30,28 +29,26 @@ def usage_cmd(obj: dict, **kwargs) -> None: backoff = float(obj.get("backoff") or 2.0) async def _run() -> None: - _spinner = MiniBeeSpinner("usage") if is_repl_mode() else nullcontext() - with _spinner: - async with Client(key, BASE_URL) as client: - data, _, status_code = await client.usage(retries=retries, backoff=backoff) - if status_code != 200: - click.echo( - f"API returned status {status_code}: {data.decode('utf-8', errors='replace')}", - err=True, - ) - raise SystemExit(1) - # Warm the shared file cache so concurrent batch subprocesses skip the API call. 
- write_usage_file_cache(key, parse_usage(data)) - - if is_repl_mode(): - _show_repl_usage(data) + async with Client(key, BASE_URL) as client: + data, _, status_code = await client.usage(retries=retries, backoff=backoff) + if status_code != 200: + click.echo( + f"API returned status {status_code}: {data.decode('utf-8', errors='replace')}", + err=True, + ) + raise SystemExit(1) + # Warm the shared file cache so concurrent batch subprocesses skip the API call. + write_usage_file_cache(key, parse_usage(data)) + + if is_repl_mode(): + _show_repl_usage(data) + else: + output_file = obj.get("output_file") + if output_file: + with open(output_file, "w", encoding="utf-8") as f: + f.write(pretty_json(data) + "\n") else: - output_file = obj.get("output_file") - if output_file: - with open(output_file, "w", encoding="utf-8") as f: - f.write(pretty_json(data) + "\n") - else: - click.echo(pretty_json(data)) + click.echo(pretty_json(data)) asyncio.run(_run()) diff --git a/src/scrapingbee_cli/commands/walmart.py b/src/scrapingbee_cli/commands/walmart.py index 6b43918..a9100a2 100644 --- a/src/scrapingbee_cli/commands/walmart.py +++ b/src/scrapingbee_cli/commands/walmart.py @@ -3,7 +3,6 @@ from __future__ import annotations import asyncio -from contextlib import nullcontext import click from click_option_group import optgroup @@ -31,7 +30,6 @@ ) from ..client import Client from ..config import BASE_URL, get_api_key -from ..theme import MiniBeeSpinner, is_repl_mode WALMART_SORT_BY = ["best-match", "price-low", "price-high", "best-seller"] @@ -165,7 +163,6 @@ async def api_call(client, q): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), - usage_info=usage_info, ) return @@ -174,27 +171,25 @@ async def api_call(client, q): raise SystemExit(1) async def _single() -> None: - _spinner = MiniBeeSpinner("walmart-search") if is_repl_mode() else nullcontext() - with _spinner: - async with Client(key, BASE_URL) as client: - data, 
headers, status_code = await client.walmart_search( - query, - start_page=start_page, - min_price=min_price, - max_price=max_price, - sort_by=norm_val(sort_by), - device=device, - domain=domain, - fulfillment_speed=norm_val(fulfillment_speed), - fulfillment_type=norm_val(fulfillment_type), - delivery_zip=delivery_zip, - store_id=store_id, - add_html=parse_bool(add_html), - light_request=parse_bool(light_request), - screenshot=parse_bool(screenshot), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.walmart_search( + query, + start_page=start_page, + min_price=min_price, + max_price=max_price, + sort_by=norm_val(sort_by), + device=device, + domain=domain, + fulfillment_speed=norm_val(fulfillment_speed), + fulfillment_type=norm_val(fulfillment_type), + delivery_zip=delivery_zip, + store_id=store_id, + add_html=parse_bool(add_html), + light_request=parse_bool(light_request), + screenshot=parse_bool(screenshot), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) from ..credits import walmart_credits @@ -307,7 +302,6 @@ async def api_call(client, pid): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), - usage_info=usage_info, ) return @@ -316,21 +310,19 @@ async def api_call(client, pid): raise SystemExit(1) async def _single() -> None: - _spinner = MiniBeeSpinner("walmart-product") if is_repl_mode() else nullcontext() - with _spinner: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.walmart_product( - product_id, - device=device, - domain=domain, - delivery_zip=delivery_zip, - store_id=store_id, - add_html=parse_bool(add_html), - light_request=parse_bool(light_request), - screenshot=parse_bool(screenshot), - retries=int(obj.get("retries") or 3), - 
backoff=float(obj.get("backoff") or 2.0), - ) + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.walmart_product( + product_id, + device=device, + domain=domain, + delivery_zip=delivery_zip, + store_id=store_id, + add_html=parse_bool(add_html), + light_request=parse_bool(light_request), + screenshot=parse_bool(screenshot), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) from ..credits import walmart_credits diff --git a/src/scrapingbee_cli/commands/youtube.py b/src/scrapingbee_cli/commands/youtube.py index f5ee828..b41e436 100644 --- a/src/scrapingbee_cli/commands/youtube.py +++ b/src/scrapingbee_cli/commands/youtube.py @@ -4,7 +4,6 @@ import asyncio import re -from contextlib import nullcontext import click from click_option_group import optgroup @@ -29,7 +28,6 @@ ) from ..client import Client from ..config import BASE_URL, get_api_key -from ..theme import MiniBeeSpinner, is_repl_mode YOUTUBE_UPLOAD_DATE = ["today", "last-hour", "this-week", "this-month", "this-year"] @@ -252,7 +250,6 @@ async def api_call(client, q): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), - usage_info=usage_info, ) return @@ -261,29 +258,27 @@ async def api_call(client, q): raise SystemExit(1) async def _single() -> None: - _spinner = MiniBeeSpinner("youtube-search") if is_repl_mode() else nullcontext() - with _spinner: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.youtube_search( - query, - upload_date=norm_val(upload_date), - type=type_, - duration=duration, - sort_by=norm_val(sort_by), - hd=parse_bool(hd), - is_4k=parse_bool(is_4k), - subtitles=parse_bool(subtitles), - creative_commons=parse_bool(creative_commons), - live=parse_bool(live), - is_360=parse_bool(is_360), - is_3d=parse_bool(is_3d), - hdr=parse_bool(hdr), - location=parse_bool(location), - 
vr180=parse_bool(vr180), - purchased=parse_bool(purchased), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.youtube_search( + query, + upload_date=norm_val(upload_date), + type=type_, + duration=duration, + sort_by=norm_val(sort_by), + hd=parse_bool(hd), + is_4k=parse_bool(is_4k), + subtitles=parse_bool(subtitles), + creative_commons=parse_bool(creative_commons), + live=parse_bool(live), + is_360=parse_bool(is_360), + is_3d=parse_bool(is_3d), + hdr=parse_bool(hdr), + location=parse_bool(location), + vr180=parse_bool(vr180), + purchased=parse_bool(purchased), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) data = _normalize_youtube_search(data) write_output( @@ -367,7 +362,6 @@ async def api_call(client, vid): output_file=obj.get("output_file") or None, extract_field=obj.get("extract_field"), fields=obj.get("fields"), - usage_info=usage_info, ) return @@ -376,14 +370,12 @@ async def api_call(client, vid): raise SystemExit(1) async def _single() -> None: - _spinner = MiniBeeSpinner("youtube-metadata") if is_repl_mode() else nullcontext() - with _spinner: - async with Client(key, BASE_URL) as client: - data, headers, status_code = await client.youtube_metadata( - _extract_video_id(video_id), - retries=int(obj.get("retries") or 3), - backoff=float(obj.get("backoff") or 2.0), - ) + async with Client(key, BASE_URL) as client: + data, headers, status_code = await client.youtube_metadata( + _extract_video_id(video_id), + retries=int(obj.get("retries") or 3), + backoff=float(obj.get("backoff") or 2.0), + ) check_api_response(data, status_code) write_output( data, diff --git a/src/scrapingbee_cli/crawl.py b/src/scrapingbee_cli/crawl.py index 50d54b4..0ee5d73 100644 --- a/src/scrapingbee_cli/crawl.py +++ b/src/scrapingbee_cli/crawl.py @@ -21,7 +21,7 @@ from 
scrapy_scrapingbee import ScrapingBeeRequest from . import user_agent_headers -from .batch import _batch_subdir_for_extension, extension_for_crawl, extension_from_url_path +from .batch import _batch_subdir_for_extension, extension_for_crawl if TYPE_CHECKING: from scrapy import Request @@ -30,6 +30,126 @@ MIDDLEWARE_PRIORITY = 725 +class _CrawlerReactorAlreadyUsed(RuntimeError): + """Raised when Twisted's reactor has already been started + stopped + in this Python process and can't be re-used for another crawl. The + REPL surfaces a friendly message asking the user to restart the + session, rather than letting Scrapy's raw error bubble up. + """ + + +def stop_running_reactor() -> bool: + """Thread-safely stop the running Twisted reactor if it's currently + running a crawl. Returns True if a stop was scheduled, False if no + reactor is currently running (so the caller can fall through to its + other Ctrl+C paths). + + Used by the REPL's Ctrl+C handler — the Twisted reactor in the + worker thread is blocked in a C-level ``epoll``/``kqueue``/``select`` + waiting on sockets, so neither ``PyThreadState_SetAsyncExc`` nor + ``asyncio.Task.cancel`` reaches it. ``reactor.callFromThread`` is + the blessed cross-thread escape hatch: it wakes the selector via + the reactor's self-pipe and schedules the callback on the reactor + thread, where ``reactor.stop()`` can run safely. + """ + try: + from twisted.internet import reactor # type: ignore[import-not-found] + except Exception: + return False + if not getattr(reactor, "running", False): + return False + try: + reactor.callFromThread(reactor.stop) + return True + except Exception: + return False + + +def _ensure_reactor_usable() -> None: + """Sanity check before we hand a new crawl to Twisted. 
+ + Twisted's reactor is a process-wide singleton — once ``reactor.run()`` + returns (either naturally or because the user cancelled the crawl) + the reactor's ``_startedBefore`` flag stays True, and calling + ``run()`` again raises ``ReactorNotRestartable``. The REPL invokes + ``run_urls_spider`` / ``run_project_spider`` in a worker thread per + command, so the second crawl in a REPL session always trips this. + + We INSPECT the reactor via ``sys.modules`` rather than importing + ``twisted.internet.reactor`` ourselves — a bare import triggers the + default reactor (SelectReactor on macOS) to install eagerly, which + then conflicts with Scrapy's ``TWISTED_REACTOR`` setting that wants + ``AsyncioSelectorReactor``. The result was every crawl failing + immediately with ``RuntimeError: The installed reactor … does not + match`` before any signal could fire. + + Detect the dead-reactor state early and raise a clean error the + REPL can render as "Restart the REPL to crawl again" instead of a + multi-line Twisted traceback. (A true fix would spawn each crawl + in a subprocess; that's a follow-up.) + """ + import sys as _sys + reactor = _sys.modules.get("twisted.internet.reactor") + if reactor is None: + return # No reactor has been installed yet, nothing to check. + if getattr(reactor, "_startedBefore", False): + raise _CrawlerReactorAlreadyUsed( + "Crawls in this REPL session have ended. Twisted's reactor " + "is single-shot per process — please run ``:q`` and relaunch " + "scrapingbee to crawl again." + ) + + +def _target_url_from_request(request) -> str: + """Extract the user-facing target URL from a Scrapy request. + + ``scrapy-scrapingbee`` rewrites outgoing requests so they hit + ``app.scrapingbee.com/api/v1/?api_key=…&url=…``. Stick that URL in + the REPL's live status line and the user sees their API key in + plain text plus a totally unhelpful host — they want their own + target URL. 
The request's ``meta["scrapingbee"]["url"]`` (set by + the middleware before it rewrites the request) is the cleanest + source; if that's missing we fall back to decoding the ``url`` + query param from ``request.url``, and to ``request.url`` itself if + even that fails (so the line stays populated rather than going + blank). + + Output is always a clean printable string — non-printable bytes + that sometimes show up in target URLs (e.g. screenshot-mode pages + with binary blobs in the path) are stripped so the status widget + never renders mojibake. + """ + raw = "" + try: + meta_url = (request.meta or {}).get("scrapingbee_target_url") + if meta_url: + raw = meta_url + except Exception: + pass + if not raw: + raw = getattr(request, "url", "") or "" + if "app.scrapingbee.com" in raw and "url=" in raw: + try: + from urllib.parse import parse_qs, unquote, urlparse + qs = parse_qs(urlparse(raw).query) + target = qs.get("url", [None])[0] + if target: + raw = unquote(target, errors="replace") + except Exception: + pass + if isinstance(raw, bytes): + raw = raw.decode("utf-8", errors="replace") + # Keep only ASCII printable code points (32–126). URLs are + # supposed to be 7-bit ASCII with %-encoding for everything + # else; anything outside that range here is decoded garbage + # (sites like crawler-test.com host pages with deliberate + # binary blobs in their paths for scraper-stress-testing). + # ``isprintable()`` alone passes too much through — combining + # marks, zero-width chars, exotic whitespace all render weird + # in the status widget. + return "".join(ch for ch in raw if 32 <= ord(ch) <= 126) + + def _install_signal_handlers() -> bool: """Whether Scrapy / Twisted should install Unix signal handlers. @@ -52,24 +172,38 @@ def _install_signal_handlers() -> bool: def _maybe_set_repl_log_file(settings) -> str | None: - """In REPL mode, also pipe Scrapy logs to a file on disk. 
- - The REPL's virtual scrollback caps at ~10K lines and drops the oldest - 10% when full, so long crawls lose history. Setting ``LOG_FILE`` makes - Scrapy *also* write its full log stream to the given path (terminal - output stays — LOG_FILE adds a file sink, doesn't replace stderr). - Returns the log path so the caller can surface it in the UI, or None - if logging-to-file wasn't enabled (non-REPL or on filesystem failure). + """In REPL mode (or a REPL-spawned subprocess), pipe the full Scrapy + log to a file on disk and silence the noisy ``py.warnings`` logger + so the in-flight crawl UI isn't drowned in deprecation tracebacks. + + The REPL's virtual scrollback caps at ~10K lines and drops the + oldest 10% when full, so long crawls would otherwise lose their + history. ``LOG_FILE`` mirrors everything Scrapy emits (at the + configured ``LOG_LEVEL``) to ``~/.cache/scrapingbee-cli/crawl.log``; + the user can open it any time with ``:view crawl``. + + ``py.warnings`` is the logger Scrapy uses to forward Python + ``warnings.warn`` calls. Multi-line deprecation tracebacks (Scrapy + nagging about old middleware APIs etc.) belong in the file, not on + screen — we raise THAT specific logger to ERROR so those entries + stop reaching the terminal stream while the rest of Scrapy's + routine logging continues at its configured level. 
""" try: from .theme import is_repl_mode - if not is_repl_mode(): + in_repl = is_repl_mode() or os.environ.get("SCRAPINGBEE_FROM_REPL") == "1" + if not in_repl: return None log_dir = Path.home() / ".cache" / "scrapingbee-cli" log_dir.mkdir(parents=True, exist_ok=True) log_path = log_dir / "crawl.log" settings.set("LOG_FILE", str(log_path)) settings.set("LOG_FILE_APPEND", False) # fresh log per run + try: + import logging as _logging + _logging.getLogger("py.warnings").setLevel(_logging.ERROR) + except Exception: + pass return str(log_path) except Exception: return None @@ -78,24 +212,6 @@ def _maybe_set_repl_log_file(settings) -> str | None: DEFAULT_MAX_DEPTH = 0 DEFAULT_MAX_PAGES = 0 -# URL extensions that will never contain HTML links — skip discovery re-requests for these. -_NON_HTML_URL_EXTENSIONS = frozenset( - { - "jpg", - "jpeg", - "png", - "gif", - "webp", - "svg", - "ico", # images - "pdf", - "zip", # binary downloads - "css", - "js", # web assets - } -) - - def _normalize_url(url: str) -> str: """Strip fragment and trailing slash for deduplication.""" parsed = urlparse(url) @@ -174,9 +290,13 @@ def _requires_discovery_phase(scrape_params: dict[str, Any]) -> bool: if _param_truthy(scrape_params, "return_page_text"): return True # Raw screenshot (no JSON wrapper) → binary PNG, no extractable links. - if _param_truthy(scrape_params, "screenshot") and not _param_truthy( - scrape_params, "json_response" - ): + # All three screenshot params produce PNG output unless wrapped in JSON. 
+ screenshot_requested = ( + _param_truthy(scrape_params, "screenshot") + or _param_truthy(scrape_params, "screenshot_full_page") + or scrape_params.get("screenshot_selector") + ) + if screenshot_requested and not _param_truthy(scrape_params, "json_response"): return True return False @@ -262,9 +382,16 @@ def __init__( include_pattern: str | None = None, exclude_pattern: str | None = None, save_pattern: str | None = None, + known_total: int | None = None, **kwargs: Any, ) -> None: super().__init__(name=name, **kwargs) + # Optional: when the caller knows up front how many pages will be + # fetched (e.g. sitemap mode), we surface a batch-style honeycomb + # progress bar in the REPL. Left None for open-ended crawls. + self._known_total: int | None = ( + int(known_total) if known_total and known_total > 0 else None + ) self.start_urls = start_urls or [] self.scrape_params = scrape_params or {} self.custom_headers = custom_headers @@ -286,7 +413,255 @@ def __init__( self._exclude_re = re.compile(exclude_pattern) if exclude_pattern else None self._save_re = re.compile(save_pattern) if save_pattern else None self._save_count = 0 + # Save requests that have been dispatched but not yet completed. + # Used together with ``_save_count`` to enforce ``max_pages`` + # tightly even when several discovery callbacks fire saves + # before the first save completes (without this we overshoot + # the cap by ``concurrency``). + self._save_pending = 0 + # Pool-based discovery for binary modes (screenshot / extract / + # ai / return_page_text). Discovery callbacks accumulate URLs + # into ``_save_queue`` without firing a save per page; once the + # queue contains >= ``max_pages`` candidates we flip + # ``_discovery_done`` and dispatch all save requests in one go, + # then drop any further discoveries that come back late. This + # avoids paying for an HTML discovery per saved page when a + # handful of pages already expose more URLs than the cap. 
+ # ``_save_queue_next`` is the index of the next un-dispatched + # URL in ``_save_queue`` — used by ``_on_save_error`` to backfill + # from the remainder of the pool when a dispatched save fails, + # so a few errors don't leave the user with < max_pages files + # despite there being more candidates available. + self._save_queue: list[str] = [] + self._save_queue_set: set[str] = set() + self._save_queue_next: int = 0 + self._discovery_done: bool = False self._fetch_count = 0 + # Live-status counters surfaced to the REPL via theme._crawl_status. + # Only populated under REPL mode; the signal handlers below early- + # exit otherwise so the standalone CLI path stays unchanged. + self._queued_count = 0 + # Counted at signal time (response_received), independent of the + # parse callbacks that increment ``_fetch_count`` later in the + # pipeline. Used for the dim-row "X fetched" indicator and the + # honeycomb progress widget so the count advances the instant a + # response lands, not when its body is parsed. + self._response_count = 0 + + @classmethod + def from_crawler(cls, crawler, *args, **kwargs): + """Standard Scrapy hook — instantiate the spider AND wire signal + handlers that push live status into ``theme._crawl_status`` so + the REPL's dim row can show the current URL + fetched count + in real time. Outside REPL mode the handlers are no-ops. + """ + spider = super().from_crawler(crawler, *args, **kwargs) + try: + from scrapy import signals as _scrapy_signals + from .theme import is_repl_mode + + # Stash the crawler so signal handlers can dispatch new + # requests via ``crawler.engine.crawl`` (needed from + # ``spider_idle`` to flush the pool when discovery exhausts + # without saturating). + spider._crawler = crawler + + # The pool-based discovery flow needs to flush queued URLs + # at spider_idle (when discovery exhausts before reaching + # ``max_pages``). Wire this regardless of REPL mode — it's + # a credit-saving optimisation, not a UI feature. 
+ crawler.signals.connect( + spider._on_spider_idle, signal=_scrapy_signals.spider_idle + ) + + # Register signal handlers when running inside the REPL + # (legacy in-process path) OR when the parent REPL spawned + # us as a subprocess and set the status-file env var (the + # new subprocess-per-crawl path). The handlers themselves + # call ``update_crawl_status`` which atomically mirrors + # state to the file if the env var is set. + _want_status = is_repl_mode() or bool( + os.environ.get("SCRAPINGBEE_CRAWL_STATUS_FILE") + ) + if _want_status: + crawler.signals.connect( + spider._on_spider_opened, signal=_scrapy_signals.spider_opened + ) + crawler.signals.connect( + spider._on_request_scheduled, + signal=_scrapy_signals.request_scheduled, + ) + crawler.signals.connect( + spider._on_request_reached, + signal=_scrapy_signals.request_reached_downloader, + ) + crawler.signals.connect( + spider._on_response_received, + signal=_scrapy_signals.response_received, + ) + crawler.signals.connect( + spider._on_spider_closed, signal=_scrapy_signals.spider_closed + ) + except Exception: + pass + return spider + + # ── Live-status signal handlers (REPL mode only) ────────────────────── + def _on_spider_opened(self, spider) -> None: + try: + from .theme import update_crawl_status, update_progress_state + update_crawl_status( + current_url=None, fetched=0, queued=0, saved=0, phase="discovering", + ) + # If we already know the total (sitemap mode), seed the + # progress widget at 0/total so the user sees the bar from + # frame one. 
+ if self._known_total is not None and self._known_total > 0: + update_progress_state(0, self._known_total) + except Exception: + pass + + def _on_request_scheduled(self, request, spider) -> None: + try: + self._queued_count += 1 + from .theme import update_crawl_status + update_crawl_status(queued=self._queued_count) + except Exception: + pass + + def _on_request_reached(self, request, spider) -> None: + try: + from .theme import update_crawl_status + # Scrapy sees the outgoing proxy URL + # (``app.scrapingbee.com/api/v1/?api_key=…&url=…``) — that's + # leaky (API key) and not what the user thinks of as "their" + # URL. Pull the target out of the ``url`` query param so the + # status widget reads naturally: ``fetching: https://example.com``. + display_url = _target_url_from_request(request) + update_crawl_status(current_url=display_url) + except Exception: + pass + + def _on_response_received(self, response, request, spider) -> None: + try: + self._response_count += 1 + from .theme import update_crawl_status, update_progress_state + update_crawl_status( + fetched=self._response_count, + saved=self._save_count, + phase="fetching", + ) + if self._known_total is not None and self._known_total > 0: + update_progress_state( + min(self._response_count, self._known_total), + self._known_total, + ) + except Exception: + pass + + def _on_spider_closed(self, spider, reason) -> None: + try: + from .theme import clear_crawl_status, clear_progress_state + clear_crawl_status() + clear_progress_state() + except Exception: + pass + + def _on_spider_idle(self, spider) -> None: + """Flush the pool when discovery exhausts before saturation. + + Pool-based binary mode only dispatches saves once the queue + reaches ``max_pages``. If the site is smaller than the cap (or + ``max_pages`` is 0 / unlimited), the queue never reaches the + threshold and would never trigger save dispatch — the spider + would close with the pool full and zero files saved. 
+ + ``spider_idle`` fires when the scheduler is empty and no + requests are in flight. We use it to commit whatever URLs we + gathered: dispatch save requests for every queued URL (capped + at ``max_pages`` if set), then raise ``DontCloseSpider`` so + Scrapy waits for the saves to complete before shutting down. + + Only relevant for binary-mode crawls (the same-mode and + HTML-save-pattern flows save in place, no pool involved). + """ + if self._discovery_done: + return + if not _requires_discovery_phase(self.scrape_params): + return + if not self._save_queue: + return + # Resolve the engine BEFORE latching ``_discovery_done`` — if + # the engine isn't available (very unlikely by the time + # spider_idle fires, but worth being defensive), bail without + # leaving the flag set, so a later idle tick gets another + # chance instead of permanently skipping flush. + engine = getattr(getattr(self, "_crawler", None), "engine", None) + if engine is None: + return + self._discovery_done = True + budget = ( + min(self.max_pages, len(self._save_queue)) + if self.max_pages + else len(self._save_queue) + ) + for url in self._save_queue[:budget]: + self._save_pending += 1 + self._save_queue_next += 1 + try: + engine.crawl(self._make_save_request(url), spider) + except Exception: + if self._save_pending > 0: + self._save_pending -= 1 + from scrapy.exceptions import DontCloseSpider + raise DontCloseSpider + + def _push_saved_status(self) -> None: + """Re-push the live ``saved`` count after a successful save, + and tear the spider down once we've hit ``max_pages``. + + ``_on_response_received`` (Scrapy signal) fires BEFORE the + ``parse``/``_parse_save_only`` callback writes the file, so the + widget's ``saved`` count always lags by one until the next + response arrives. With ``--max-pages N`` the spider closes + before that next response, leaving a stale ``N fetched + N-1 saved`` reading on screen until ``_on_spider_closed`` + clears the widget. 
Calling this right after the save commits + keeps the display honest. + + Once the cap is reached we also raise ``CloseSpider`` so the + engine drops anything still queued (e.g. the ~N follow-up + discoveries that the seed callback already yielded). Without + this the spider would happily keep fetching no-op pages until + the framework safety cap ``CLOSESPIDER_PAGECOUNT`` kicks in — + burning credits the user expects ``--max-pages`` to bound. + """ + try: + from .theme import update_crawl_status + update_crawl_status(saved=self._save_count) + except Exception: + pass + if self.max_pages != 0 and self._save_count >= self.max_pages: + from scrapy.exceptions import CloseSpider + raise CloseSpider("max_pages") + + def _on_request_error(self, failure) -> None: + """Swallow request-level errors so one bad URL doesn't kill the + whole crawl. ``scrapy_scrapingbee`` ships an errback that + crashes on binary error responses (``response.text`` raises + ``AttributeError`` when the body isn't decodable as text — + which happens any time the API returns a non-200 in screenshot + mode). Attaching our own errback to every request short- + circuits that and just logs the failure. + """ + try: + req = getattr(failure, "request", None) + url = getattr(req, "url", "?") if req is not None else "?" + exc = type(failure.value).__name__ if hasattr(failure, "value") else "error" + self.logger.warning("Skipped %s (%s)", url, exc) + except Exception: + pass + return None def _allowed_netlocs_set(self) -> set[str]: if self._allowed_netlocs is not None: @@ -308,16 +683,29 @@ def _url_allowed(self, url: str) -> bool: return not allowed or netloc in allowed def start_requests(self) -> Iterator[Request]: + # Two flows: + # 1. "Same-mode": one request per page; the response is both saved + # and parsed for outgoing links. Works only when scrape_params + # yield HTML/JSON-with-body (no screenshot/extract/etc). + # 2. 
"Discovery-first": fetch each page in HTML mode for link + # extraction, and (if it should be saved) fire a SECOND + # request with the user's full scrape_params to obtain the + # saved artifact (PNG, extract-rules JSON, etc). + # Discovery-first is required whenever the user asks for binary or + # non-link-bearing output, AND whenever --save-pattern is set + # (so the cheap HTML pass can find links without spending the full + # per-page cost on every crawled URL). + use_discovery_flow = self._save_re is not None or _requires_discovery_phase( + self.scrape_params + ) for url in self.start_urls: normalized = _normalize_url(url) if normalized in self.seen_urls: continue - if self.max_pages != 0 and self._fetch_count >= self.max_pages: + if self.max_pages != 0 and self._save_count >= self.max_pages: continue self.seen_urls.add(normalized) - # When --save-pattern is set, use discovery params for initial crawl - # (HTML for link finding). Full params only for save-worthy pages. - if self._save_re: + if use_discovery_flow: params = _params_for_discovery(self.scrape_params) callback = self._parse_crawl_and_save else: @@ -329,6 +717,7 @@ def start_requests(self) -> Iterator[Request]: headers=self.custom_headers, meta={"depth": 0}, callback=callback, + errback=self._on_request_error, ) def _response_headers_dict(self, response: Response) -> dict: @@ -397,20 +786,18 @@ def closed(self, reason: str) -> None: _save_batch_meta(abs_dir, len(self._url_file_map), len(self._url_file_map), 0) - def _iter_follow_requests( - self, - response: Response, - params: dict[str, Any], - callback: Any, - ) -> Any: - """Yield ScrapingBeeRequests for allowed, same-domain - (or allowed-domains) links from response.""" + def _iter_follow_urls(self, response: Response) -> Any: + """Yield ``(url, next_depth)`` for each link in ``response`` that + passes the spider's URL filters (scheme, ASCII, domain + allow-list, include/exclude regex, dedup). 
Centralised so the + same filter chain is used by both the request-yielding flow + (``_iter_follow_requests``) and the pool-based discovery flow + (``_parse_crawl_and_save`` for binary modes). + """ depth = response.meta.get("depth", 0) if self.max_depth != 0 and depth >= self.max_depth: return - # max_pages = max pages fetched from API (credits spent) - if self.max_pages != 0 and self._fetch_count >= self.max_pages: - return + from urllib.parse import unquote as _unquote for href in _extract_hrefs_from_response(response): if not href or href.startswith(("#", "mailto:", "javascript:")): continue @@ -418,6 +805,24 @@ def _iter_follow_requests( parsed = urlparse(full_url) if parsed.scheme not in ("http", "https"): continue + # Skip URLs whose decoded path/query carries non-printable + # or non-ASCII bytes. Such URLs (common on the + # crawler-test.com fixture pages) trip a known + # ``scrapy_scrapingbee`` bug: when ScrapingBee's API + # returns 500 for the malformed URL, the library's errback + # tries to format the error using ``response.text`` — + # which raises ``AttributeError`` on a binary + # screenshot-mode response and kills the whole spider. + # Filtering them out keeps the crawl going. + try: + _path_tail = _unquote( + (parsed.path or "") + (parsed.query or ""), + errors="replace", + ) + if not all(32 <= ord(ch) <= 126 for ch in _path_tail): + continue + except Exception: + continue if not self._url_allowed(full_url): continue if self._include_re and not self._include_re.search(full_url): @@ -428,90 +833,243 @@ def _iter_follow_requests( if normalized in self.seen_urls: continue self.seen_urls.add(normalized) + yield full_url, depth + 1 + + def _iter_follow_requests( + self, + response: Response, + params: dict[str, Any], + callback: Any, + ) -> Any: + """Yield ScrapingBeeRequests for allowed links from response. + Used by the same-mode ``parse()`` flow (HTML crawl) and by the + HTML-save-pattern branch of ``_parse_crawl_and_save``. 
+ """ + # max_pages = max saved pages. Stop queueing follow-ups once + # the budget (already-saved + in-flight saves) is committed. + if ( + self.max_pages != 0 + and self._save_count + self._save_pending >= self.max_pages + ): + return + for full_url, next_depth in self._iter_follow_urls(response): yield ScrapingBeeRequest( full_url, params=params, headers=self.custom_headers, - meta={"depth": depth + 1}, + meta={"depth": next_depth}, callback=callback, + errback=self._on_request_error, ) + def _make_save_request(self, url: str) -> ScrapingBeeRequest: + """Build a save request (full ``scrape_params``) for ``url``. + Used in the pool-based discovery flow once we've accumulated + enough candidate URLs. Caller is responsible for incrementing + ``_save_pending`` before yielding. + """ + return ScrapingBeeRequest( + url, + params=dict(self.scrape_params), + headers=self.custom_headers, + callback=self._parse_save_only, + errback=self._on_save_error, + dont_filter=True, + priority=10, + ) + def parse(self, response: Response, **kwargs: object) -> Any: - """Save response, then yield follow requests. If no links found in response, - yield a discovery request (same URL with HTML-only params) to extract links.""" + """Same-mode callback: the response is both saved and parsed for + outgoing links. Only used when scrape_params return HTML or + json_response with a parseable body — binary/extract modes are + routed through ``_parse_crawl_and_save`` from ``start_requests``. 
+ """ + from scrapy.exceptions import CloseSpider + self._fetch_count += 1 self.logger.info("Fetched %s (%d bytes)", response.url, len(response.body)) - # Only save if URL matches --save-pattern (or no pattern set) - if not self._save_re or self._save_re.search(response.url): - try: - self._save_response(response) - except Exception as e: - self.logger.warning("Failed to save %s: %s", response.url, e) + try: + self._save_response(response) + self._save_count += 1 + self._push_saved_status() + except CloseSpider: + # The cap-reached signal from _push_saved_status MUST + # propagate to Scrapy's engine — catching it as a generic + # exception below would silence the shutdown and let the + # already-queued follow requests keep firing. + raise + except Exception as e: + self.logger.warning("Failed to save %s: %s", response.url, e) try: hrefs = _extract_hrefs_from_response(response) except Exception: hrefs = [] if hrefs: - yield from self._iter_follow_requests(response, dict(self.scrape_params), self.parse) - else: - # Skip discovery re-request for URLs that are clearly binary/non-HTML resources - # (images, PDFs, CSS, JS, etc.) — they will never contain links. - url_ext = extension_from_url_path(response.url) - if url_ext in _NON_HTML_URL_EXTENSIONS: - return - discovery_params = _params_for_discovery(self.scrape_params) - yield ScrapingBeeRequest( - response.url, - params=discovery_params, - headers=self.custom_headers, - meta=response.meta, - callback=self._parse_discovery_links_only, - dont_filter=True, + yield from self._iter_follow_requests( + response, dict(self.scrape_params), self.parse ) def _parse_crawl_and_save(self, response: Response, **kwargs: object) -> Any: - """Used when --save-pattern is set. Receives HTML (discovery params), - extracts links, follows them, and fires a save request for matching pages.""" + """Discovery-first callback. Two flows live here: + + * **Binary / extract modes** (``_requires_discovery_phase``): + POOL-BASED. 
Each discovery response contributes its own URL + and its outbound links to ``_save_queue``. We do NOT fire a + save per page. Once the queue reaches ``max_pages`` we flip + ``_discovery_done``, dispatch one save request per queued + URL up to the cap, and stop discovering. Save credits paid + per pre-cap discovery: 0. Compare the old "save+follow each + page" flow, which paid one full-param fetch per saved page + PLUS one HTML discovery per saved page — roughly 2× credits. + + * **HTML save-pattern mode**: SAVE-IN-PLACE. The response IS + the HTML we want to save (the user's ``scrape_params`` + already yield HTML), so we write it directly and follow + links. No separate save request needed. + """ + from scrapy.exceptions import CloseSpider as _CloseSpider + self._fetch_count += 1 self.logger.info("Fetched %s (%d bytes) [crawl]", response.url, len(response.body)) - # If this page matches --save-pattern, fire a separate request with full params to save - if self._save_re and self._save_re.search(response.url): + binary_mode = _requires_discovery_phase(self.scrape_params) + + if not binary_mode: + # ── HTML save-pattern flow (unchanged) ─────────────────── + save_this = (self._save_re is None) or bool( + self._save_re.search(response.url) + ) + within_cap = ( + self.max_pages == 0 + or self._save_count + self._save_pending < self.max_pages + ) + if save_this and within_cap: + try: + self._save_response(response) + self._save_count += 1 + self._push_saved_status() + except _CloseSpider: + raise + except Exception as e: + self.logger.warning("Failed to save %s: %s", response.url, e) + try: + hrefs = _extract_hrefs_from_response(response) + except Exception: + hrefs = [] + if hrefs: + yield from self._iter_follow_requests( + response, + _params_for_discovery(self.scrape_params), + self._parse_crawl_and_save, + ) + return + + # ── Binary / extract mode: pool-based discovery ────────────── + if self._discovery_done: + # A late-arriving discovery response after saturation. 
The + # save dispatches for the first ``max_pages`` URLs are + # already in flight; this page contributes nothing new. + return + + # Add the current URL to the save queue (if it passes the + # save filter) so the seed and every successfully-discovered + # page becomes a save candidate. ``seen_urls`` and the pool + # both dedup on the normalized form so a trailing-slash + # difference between the seed and an extracted link doesn't + # produce two entries for the same logical page. + if (self._save_re is None) or bool(self._save_re.search(response.url)): + norm = _normalize_url(response.url) + if norm not in self._save_queue_set: + self._save_queue.append(response.url) + self._save_queue_set.add(norm) + + # Extract links from this page and grow both queues. + new_discovery_targets: list[tuple[str, int]] = [] + for full_url, next_depth in self._iter_follow_urls(response): + new_discovery_targets.append((full_url, next_depth)) + if (self._save_re is None) or bool(self._save_re.search(full_url)): + norm = _normalize_url(full_url) + if norm not in self._save_queue_set: + self._save_queue.append(full_url) + self._save_queue_set.add(norm) + + # Saturation: pool has enough candidates → stop discovery, + # dispatch saves for the first ``max_pages`` URLs in queue + # order (seed first, then breadth-first by discovery). The + # remaining URLs stay in the queue as reserves — ``_on_save_error`` + # pulls from them if a dispatched save fails. + if self.max_pages and len(self._save_queue) >= self.max_pages: + self._discovery_done = True + for url in self._save_queue[: self.max_pages]: + self._save_pending += 1 + self._save_queue_next += 1 + yield self._make_save_request(url) + return + + # Still hungry — yield discoveries for the newly-extracted URLs. 
+ discovery_params = _params_for_discovery(self.scrape_params) + for full_url, next_depth in new_discovery_targets: yield ScrapingBeeRequest( - response.url, - params=dict(self.scrape_params), + full_url, + params=discovery_params, headers=self.custom_headers, - meta=response.meta, - callback=self._parse_save_only, - dont_filter=True, - ) - # Extract links from HTML and follow them - try: - hrefs = _extract_hrefs_from_response(response) - except Exception: - hrefs = [] - if hrefs: - yield from self._iter_follow_requests( - response, - _params_for_discovery(self.scrape_params), - self._parse_crawl_and_save, + meta={"depth": next_depth}, + callback=self._parse_crawl_and_save, + errback=self._on_request_error, ) def _parse_save_only(self, response: Response, **kwargs: object) -> Any: """Save the response (fetched with full params). No link following.""" + from scrapy.exceptions import CloseSpider + self.logger.info("Fetched %s (%d bytes) [save]", response.url, len(response.body)) try: self._save_response(response) self._save_count += 1 + self._push_saved_status() + except CloseSpider: + raise except Exception as e: self.logger.warning("Failed to save %s: %s", response.url, e) - - def _parse_discovery_links_only(self, response: Response, **kwargs: object) -> Any: - """Handle HTML response from discovery request: extract links and follow (no save).""" - self.logger.info("Fetched %s (%d bytes) [discovery]", response.url, len(response.body)) + finally: + # ``finally`` runs even when CloseSpider is re-raised, so the + # pending counter is still decremented cleanly during shutdown. + if self._save_pending > 0: + self._save_pending -= 1 + + def _on_save_error(self, failure) -> None: + """Errback for save requests — decrement the pending counter, + log, and backfill from the pool if the user's cap isn't yet + committed. Without backfill, a handful of network failures + would silently shrink the user's effective ``max_pages``. 
+ """ + if self._save_pending > 0: + self._save_pending -= 1 + # If we have reserves in ``_save_queue`` AND the cap (already- + # saved + still-in-flight) hasn't been committed yet, dispatch + # a replacement save. Only relevant when discovery is done + # (i.e. we've already started flushing the queue). try: - yield from self._iter_follow_requests(response, dict(self.scrape_params), self.parse) - except Exception as e: - self.logger.warning("Discovery failed for %s: %s", response.url, e) + if ( + self._discovery_done + and self.max_pages + and self._save_queue_next < len(self._save_queue) + and self._save_count + self._save_pending < self.max_pages + ): + engine = getattr( + getattr(self, "_crawler", None), "engine", None + ) + if engine is not None: + url = self._save_queue[self._save_queue_next] + self._save_queue_next += 1 + self._save_pending += 1 + try: + engine.crawl(self._make_save_request(url), self) + except Exception: + if self._save_pending > 0: + self._save_pending -= 1 + except Exception: + pass + return self._on_request_error(failure) def _fetch_sitemap_urls(url: str, *, api_key: str | None = None, depth: int = 0) -> list[str]: @@ -662,6 +1220,11 @@ def run_project_spider( download_delay=download_delay, autothrottle_enabled=autothrottle_enabled, ) + from .theme import is_repl_mode as _is_repl_mode + _repl_log_active = _is_repl_mode() or os.environ.get("SCRAPINGBEE_FROM_REPL") == "1" + if _repl_log_active: + # Verbose file log, quiet stream — see run_urls_spider for why. 
+ settings.set("LOG_LEVEL", "INFO") log_path = _maybe_set_repl_log_file(settings) if log_path: click.echo( @@ -669,7 +1232,15 @@ def run_project_spider( f"(use `:view crawl` to scroll through it)", err=True, ) + _ensure_reactor_usable() process = CrawlerProcess(settings) + if _repl_log_active: + import logging as _logging + for _h in _logging.getLogger().handlers: + if isinstance(_h, _logging.FileHandler): + continue + if isinstance(_h, _logging.StreamHandler): + _h.setLevel(_logging.WARNING) process.crawl(spider_name) process.start(install_signal_handlers=_install_signal_handlers()) finally: @@ -693,6 +1264,7 @@ def run_urls_spider( include_pattern: str | None = None, exclude_pattern: str | None = None, save_pattern: str | None = None, + known_total: int | None = None, ) -> None: """Run the built-in generic spider: start from URLs and follow links. By default only same-domain links are followed; use allowed_domains or @@ -731,9 +1303,31 @@ def run_urls_spider( download_delay=download_delay, autothrottle_enabled=autothrottle_enabled, ) - settings.set("LOG_LEVEL", "WARNING") + # In REPL mode we want the *file* log to be verbose (so ``:view crawl`` + # is actually useful) while keeping the *stream* output quiet (so the + # REPL scrollback isn't drowned in per-request INFO chatter). We do + # that by raising LOG_LEVEL to INFO globally and then bumping ONLY + # the StreamHandler back up to WARNING after CrawlerProcess wires up + # the handlers (see below). Outside REPL there's no file log, so the + # stream handler picks up LOG_LEVEL directly — keep that at WARNING. 
+ from .theme import is_repl_mode as _is_repl_mode + _repl_log_active = _is_repl_mode() or os.environ.get("SCRAPINGBEE_FROM_REPL") == "1" + settings.set("LOG_LEVEL", "INFO" if _repl_log_active else "WARNING") if max_pages > 0: - settings.set("CLOSESPIDER_PAGECOUNT", max_pages) + # The authoritative cap is the spider's ``_save_count >= + # max_pages`` check (in both ``_iter_follow_requests`` and the + # per-page save dispatch in ``_parse_crawl_and_save``). Scrapy's + # ``CLOSESPIDER_PAGECOUNT`` counts EVERY response — in the + # discovery-flow modes that fire one HTML pass plus one save + # request per page, the response count can easily reach + # ``max_pages × N`` where N depends on how many hrefs a typical + # page exposes. Set the framework cap to a generous multiple + # so it never fires before the spider's own cap stops queuing. + use_discovery_flow = bool(save_pattern) or _requires_discovery_phase( + scrape_params or {} + ) + framework_cap = max_pages * 20 if use_discovery_flow else max_pages + settings.set("CLOSESPIDER_PAGECOUNT", framework_cap) log_path = _maybe_set_repl_log_file(settings) if log_path: click.echo( @@ -741,7 +1335,21 @@ def run_urls_spider( f"(use `:view crawl` to scroll through it)", err=True, ) + _ensure_reactor_usable() process = CrawlerProcess(settings) + # CrawlerProcess just configured the root logger with handlers + # honouring LOG_LEVEL. In REPL mode we asked for INFO so the file + # captures everything, but the StreamHandler also got INFO and + # would spam the REPL scrollback. Demote ONLY the StreamHandler + # (not the FileHandler, which is a StreamHandler subclass) so the + # file stays verbose while stderr stays clean. 
+ if _repl_log_active: + import logging as _logging + for _h in _logging.getLogger().handlers: + if isinstance(_h, _logging.FileHandler): + continue + if isinstance(_h, _logging.StreamHandler): + _h.setLevel(_logging.WARNING) process.crawl( GenericScrapingBeeSpider, start_urls=urls, @@ -757,5 +1365,6 @@ def run_urls_spider( include_pattern=include_pattern, exclude_pattern=exclude_pattern, save_pattern=save_pattern, + known_total=known_total, ) process.start(install_signal_handlers=_install_signal_handlers()) diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index 10406b1..526ceac 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -765,7 +765,17 @@ def __init__(self) -> None: # without a modifier, but wheel scroll stops). Alt+S toggles. self.mouse_mode: str = "scroll" - def apply_settings_to_args(self, args: list[str]) -> list[str]: + def apply_settings_to_args( + self, args: list[str], accepted: set[str] | None = None + ) -> list[str]: + """Append session defaults to ``args`` for any flag that: + - is not already present on the command line, AND + - is accepted by the target command (when ``accepted`` is given). + + Without the ``accepted`` filter, session defaults would leak into + commands that don't take them (e.g. ``--json-response`` into + ``usage``), causing "No such option" errors. 
+ """ if not self.settings: return args present = {a for a in args if a.startswith("--")} @@ -774,6 +784,8 @@ def apply_settings_to_args(self, args: list[str]) -> list[str]: flag = f"--{key}" if flag in present: continue + if accepted is not None and flag not in accepted: + continue out.extend([flag, value]) return out @@ -998,95 +1010,22 @@ def render() -> list[tuple[str, str]]: width = shutil.get_terminal_size((80, 24)).columns segs: list[tuple[str, str]] = [("class:toolbar", " ")] - # --- In-flight: running label + elapsed + rotating usage stats ─── - # Layout: ``running · 12.3s`` pinned on the left, ``Ctrl+C to stop`` - # pinned on the right, and a rotating stat (Used Session / Concurrency - # / Next Update) in the middle. The rotation cycles every 5s so the - # user can monitor credits being consumed during a long scrape - # without leaving the command. - if state.is_running: - segs.append(("class:toolbar.label", "running")) - if state.run_start is not None: - elapsed = time.monotonic() - state.run_start - segs.append(("class:toolbar", f" · {elapsed:.1f}s")) - - # Build rotating stat chunks (subset of the idle toolbar's info). 
- stat_chunks: list[list[tuple[str, str]]] = [] - if state.api_key_set and state.credits is not None: - stat_chunks.append([ - ("class:toolbar.label", "Available "), - ("class:toolbar.value", _format_credits(state.credits)), - ]) - scu = state.session_credits_used if state.api_key_set else None - stat_chunks.append([ - ("class:toolbar.label", "Used (Session) "), - ("class:toolbar.value", _format_credits(scu) if scu is not None else "N/A"), - ]) - if state.api_key_set and state.max_concurrency is not None: - cur = state.current_concurrency if state.current_concurrency is not None else 0 - stat_chunks.append([ - ("class:toolbar.label", "Concurrency "), - ("class:toolbar.value", f"{cur}/{state.max_concurrency}"), - ]) - if state.api_key_set: - nxt = state.seconds_until_next_refresh - if nxt is not None: - stat_chunks.append([ - ("class:toolbar.label", "Next Update "), - ("class:toolbar.value", f"{nxt}s"), - ]) - - stop_hint = "Ctrl+C to stop" - stop_hint_len = len(stop_hint) - so_far = sum(len(t) for _, t in segs) - # Reserve room for: " · ..." + right-aligned stop hint - available = max(0, width - so_far - stop_hint_len - 6) - - # Pick the stat chunk for this rotation tick — only if it fits. 
- if stat_chunks and available > 8: - idx = int(time.monotonic() / 5) % len(stat_chunks) - chunk = stat_chunks[idx] - chunk_len = sum(len(t) for _, t in chunk) - if chunk_len + 5 <= available: - segs.append(("class:toolbar", " · ")) - segs.extend(chunk) - - # Setting chips still show below if any room remains - if state.settings: - so_far = sum(len(t) for _, t in segs) - budget = max(0, width - so_far - stop_hint_len - 4) - shown = 0 - for k, v in state.settings.items(): - chip = f" {k}={v} " - if budget < len(chip) + 2 and shown > 0: - break - segs.append(("class:toolbar", " ")) - segs.append(("class:toolbar.chip", chip)) - budget -= len(chip) + 2 - shown += 1 - remaining = len(state.settings) - shown - if remaining > 0: - segs.append(("class:toolbar", " ")) - segs.append(("class:toolbar.hint", f"+{remaining} more")) - - # Right-align "Ctrl+C to stop" hint - used = sum(len(t) for _, t in segs) - if width - used > stop_hint_len + 4: - segs.append(("class:toolbar", " " * max(2, width - used - stop_hint_len - 2))) - segs.append(("class:toolbar.hint", stop_hint)) - return segs - - # --- Idle: build all fields, then either render statically or paginate - # When the joined toolbar text exceeds the terminal width we'd - # otherwise emit a line longer than the screen — the terminal soft- - # wraps it into a phantom 2nd row that prompt_toolkit doesn't know - # about, leaving a ghost-toolbar in scrollback on resize. To keep - # everything visible without scrolling jitter, we greedy-pack fields - # into "pages" that each fit, then cycle pages every PAGE_SECONDS. - # Each page is rendered statically — no per-frame motion — so it - # reads cleanly and doesn't waste redraws. + # Unified toolbar pipeline for both idle and in-flight modes: + # build fields → greedy-pack into pages → render the current + # page with a pinned hint on the right. 
While running we + # prepend a live ``12.3s`` elapsed-time field so the user can + # see how long the command has been going; the bee verb that + # used to live here now alternates with bee facts in the dim + # row above the input. fields: list[list[tuple[str, str]]] = [] + if state.is_running and state.run_start is not None: + elapsed = time.monotonic() - state.run_start + fields.append([ + ("class:toolbar.label", "Elapsed "), + ("class:toolbar.value", f"{elapsed:.1f}s"), + ]) + # Available Credits avail: list[tuple[str, str]] = [("class:toolbar.label", "Available Credits ")] if state.api_key_set and state.credits is not None: @@ -1133,19 +1072,22 @@ def render() -> list[tuple[str, str]]: # ✓/✗ footer are already visible in the scrollback echo, so a # toolbar copy doesn't add information and just consumes width.) - # Session setting chips + # Session setting chips — one chunk PER setting so the pagination + # loop below can split them across pages. Long values (e.g. a + # multi-step ``--js-scenario`` JSON blob) are truncated so a + # single chip never overflows the toolbar line. if state.settings: - chip_segs: list[tuple[str, str]] = [] + _MAX_CHIP_VALUE = 28 for k, v in state.settings.items(): - if chip_segs: - chip_segs.append(("class:toolbar", " ")) - chip_segs.append(("class:toolbar.chip", f" {k}={v} ")) - fields.append(chip_segs) - - # Hint chunk — surfaces the active mouse mode and how to switch. - # Replaces the older "tab · ↑↓ · :help · :q" cheat-sheet, since the - # mode toggle is the one keybinding the user might actually need - # to *change* during a session. The other shortcuts are in :help. + display_v = v if len(v) <= _MAX_CHIP_VALUE else v[: _MAX_CHIP_VALUE - 1] + "…" + fields.append([("class:toolbar.chip", f" {k}={display_v} ")]) + + # Hint chunk pinned bottom-right. Always shows the active mouse + # mode label (Scroll / Select) so the user can see what mouse + # behaviour they have at a glance — even while a command is + # running. 
The Shift+Tab toggle is documented in ``:help`` to + # keep this strip clean. While running we additionally append + # ``Ctrl+C to stop`` so the cancel affordance stays visible. if not state.api_key_set: hint_text = "type `auth` to set API key" hint_chunk: list[tuple[str, str]] = [("class:toolbar.hint", hint_text)] @@ -1153,10 +1095,9 @@ def render() -> list[tuple[str, str]]: mode_label = ( "Scroll mode" if state.mouse_mode == "scroll" else "Select mode" ) - hint_chunk = [ - ("class:toolbar.value", mode_label), - ("class:toolbar.hint", " · Shift+Tab to switch"), - ] + hint_chunk = [("class:toolbar.value", mode_label)] + if state.is_running: + hint_chunk.append(("class:toolbar.hint", " · Ctrl+C to stop")) LEADING = " " SEP = " · " @@ -1533,16 +1474,34 @@ def _print_row(cmd: str, desc: str) -> None: for cmd, desc in [ (":help, :?", "Show this command list"), (":clear", "Clear the screen"), - (":view", "Scroll the last command's output ('crawl' = crawl log, or pass a path)"), + (":view", "Scroll the last command's output (auto-picks crawl.log after crawl; pass a path to view any file)"), (":set K=V ...", "Set one or more session defaults"), (":unset K", "Remove a session default ('all' or '*' clears every)"), (":reset", "Clear every session default"), - (":show", "Show current session defaults"), + (":show, :list", "Show current session defaults"), ("!", "Run a shell command (requires unsafe mode)"), (":q, :quit", "Quit the REPL"), ]: _print_row(cmd, desc) err_console.print() + err_console.print(f" [{BEE_DIM}]Shortcuts[/]") + for cmd, desc in [ + ("Tab", "Complete (inline if 1 match, popup if many, ghost word otherwise)"), + ("Shift+Tab", "Cycle popup back / toggle Scroll ↔ Select mode"), + ("Esc", "Close the completion popup"), + ("→", "Accept the next word of the ghost suggestion"), + ("End", "Accept the whole ghost suggestion"), + ("↑ / ↓", "Walk history (single-line) / move cursor (multi-line)"), + ("PgUp / PgDn", "Scroll the scrollback buffer up / down"), + 
("Ctrl+Home/End","Jump to top / bottom of scrollback"), + ("Ctrl+J", "Insert a newline (multi-line compose; also Alt/Option+Enter)"), + ("Ctrl+W", "Delete the word before the cursor (also Alt/Option+⌫)"), + ("Click", "Open a highlighted path in Finder / default app"), + ("Ctrl+C", "Stop running command / cancel queue / clear multi-line / exit when idle"), + ("Ctrl+D", "Exit the REPL (when no command is running)"), + ]: + _print_row(cmd, desc) + err_console.print() def _print_command_header(args: list[str]) -> None: @@ -1603,11 +1562,11 @@ def _open_pager(path: str) -> None: raw_text = Path(path).read_text(encoding="utf-8", errors="replace") - # If the cached output is valid JSON, prepare a pretty-printed - # version up-front. We default to pretty mode so the user sees the - # human-readable form first; `r` toggles raw if they need to grep - # the original bytes. When the content isn't JSON, pretty is - # unavailable and we stick with raw. + # If the cached output is valid JSON or recognisable HTML, prepare + # a pretty-printed version up-front. We default to pretty mode so + # the user sees the human-readable form first; ``r`` toggles raw + # if they need to grep the original bytes. When the content + # matches neither, pretty is unavailable and we stick with raw. pretty_text: str | None try: pretty_text = json.dumps( @@ -1615,6 +1574,20 @@ def _open_pager(path: str) -> None: ) except Exception: pretty_text = None + if pretty_text is None: + # Cheap heuristic: looks like HTML if a leading non-whitespace + # chunk starts with ``<``. lxml accepts both well-formed XML + # and tag-soup HTML, so this stays fast and lenient. 
+ stripped = raw_text.lstrip() + if stripped.startswith("<"): + try: + from lxml import etree as _etree, html as _lxml_html + tree = _lxml_html.fromstring(raw_text) + pretty_text = _etree.tostring( + tree, pretty_print=True, encoding="unicode", method="html" + ) + except Exception: + pretty_text = None mode = ["pretty" if pretty_text is not None else "raw"] @@ -1878,7 +1851,7 @@ def _handle_meta( sys.stderr.write("\033[2J\033[H") sys.stderr.flush() return "ok" - if head_low == ":show": + if head_low in (":show", ":list"): if not state.settings: err_console.print(f" [{BEE_DIM}]No session defaults set.[/]") else: @@ -1894,38 +1867,27 @@ def _handle_meta( cache_dir = Path.home() / ".cache" / "scrapingbee-cli" crawl_log = cache_dir / "crawl.log" + last_output = cache_dir / "last-output" target_arg = rest.strip() - # `:view` → last command's output - # `:view crawl` → the crawl log written by the most recent - # `crawl` run in REPL mode - # `:view crawl ` → also alias-mode, but ONLY when the - # path after ``crawl`` resolves to the - # actual crawl.log on disk. This lets - # users copy the full hint line ("crawl - # /Users/.../crawl.log") into the - # prompt; random text after ``crawl`` - # falls through to "file not found" - # instead of silently opening the log. - # `:view ` → arbitrary file (must exist) + # `:view` → whatever the most-recent command produced. + # ``crawl`` writes a Scrapy log to crawl.log; every + # other API command (scrape, google, batch items, + # …) writes its response body to last-output. So + # the routing key is ``state.last_command``. + # `:view crawl` → backwards-compat shortcut for crawl.log; still + # useful when the user just wants to peek at the + # log without having re-run crawl most recently. + # `:view ` → arbitrary file (must exist). 
if not target_arg: - target_path = cache_dir / "last-output" - missing_msg = "no recent output to view" + if state.last_command == "crawl" and crawl_log.exists(): + target_path = crawl_log + missing_msg = "no crawl log yet — run `crawl ...` first" + else: + target_path = last_output + missing_msg = "no recent output to view" elif target_arg.lower() == "crawl": target_path = crawl_log missing_msg = "no crawl log yet — run `crawl ...` first" - elif target_arg.lower().startswith("crawl "): - after = target_arg[len("crawl "):].strip() - try: - supplied_path = Path(after).expanduser().resolve(strict=False) - if supplied_path == crawl_log.resolve(strict=False): - target_path = crawl_log - missing_msg = "no crawl log yet — run `crawl ...` first" - else: - target_path = Path(target_arg).expanduser() - missing_msg = f"file not found: {target_arg}" - except Exception: - target_path = Path(target_arg).expanduser() - missing_msg = f"file not found: {target_arg}" else: target_path = Path(target_arg).expanduser() missing_msg = f"file not found: {target_arg}" @@ -2054,7 +2016,7 @@ def _make_completer( meta_cmds = [ ":help", ":?", ":clear", ":view", ":set", ":unset", ":reset", ":show", - ":q", ":quit", + ":list", ":q", ":quit", ] # Precompute the union of every flag known to any command. Used as a @@ -2092,18 +2054,31 @@ def get_completions(self, document, complete_event): flags_for_cmd = ( command_flags[cmd_name] if cmd_known else _all_known_flags ) - last = words[-1] if words else "" - prev = words[-2] if len(words) >= 2 else "" + ends_with_space = text.endswith(" ") + last_word = words[-1] if words else "" + # When the buffer ends with a space the user has *finished* + # typing the previous arg and is starting a new one. The + # "current partial" is empty; the "previous arg" (used for + # bool/choice value suggestions) shifts to the last typed + # word. Earlier this was off-by-one and would cause Tab to + # replace the wrong span — e.g. 
``--verbose `` + Tab would + # corrupt to ``---verbose``. + if ends_with_space: + last = "" + prev = last_word + else: + last = last_word + prev = words[-2] if len(words) >= 2 else "" - if text.endswith(" ") and prev in bool_flags: + if ends_with_space and prev in bool_flags: yield Completion("true", display_meta="enable") yield Completion("false", display_meta="disable") return - if text.endswith(" ") and prev in choice_flags: + if ends_with_space and prev in choice_flags: for v in choice_flags[prev]: yield Completion(v) return - if len(words) >= 2 and not last.startswith("-"): + if (not ends_with_space) and len(words) >= 2 and not last.startswith("-"): if prev in bool_flags: for v in ("true", "false"): if v.startswith(last.lower()): @@ -2114,7 +2089,13 @@ def get_completions(self, document, complete_event): if v.startswith(last.lower()): yield Completion(v, start_position=-len(last)) return - if last.startswith("-"): + # Flag completions: either the user is typing a partial flag + # (``--ver``), or they're at a trailing space ready for a + # new flag (``last == ""`` here matches every flag). In both + # cases start_position is ``-len(last)`` — which is 0 in + # the trailing-space case, so flags get inserted at the + # cursor without disturbing previous text. + if last.startswith("-") or (ends_with_space and last == ""): meta_label = "" if cmd_known else "(unknown command)" for flag in flags_for_cmd: if flag.startswith(last): @@ -2184,16 +2165,30 @@ def run_repl(cli_group: Any, version: str, *, keep_bg: bool = False) -> None: # ``CancelledError`` on the await, which propagates out cleanly # (the worker's except clause turns it into "stopped"). import asyncio as _asyncio_mod + import threading as _threading_mod _active_worker_loop: list[Any] = [None] _original_asyncio_run = _asyncio_mod.run + _main_thread = _threading_mod.main_thread() def _tracking_loop_factory(): loop = _asyncio_mod.new_event_loop() + # CRITICAL: only track loops that belong to *worker* threads. 
The + # main thread's loop is prompt_toolkit's own — cancelling tasks + # on it kills the entire REPL. ``app.run()`` calls + # ``asyncio.run`` (which routes through us here), so without this + # guard the very first call at REPL startup registers the main + # loop as the "worker" loop and any subsequent Ctrl+C tears the + # REPL down with a CancelledError. + if _threading_mod.current_thread() is _main_thread: + return loop _active_worker_loop[0] = loop return loop def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): + # Same guard on the cleanup side — only clear the worker-loop + # ref if THIS call was a worker-thread call. If we're on the main + # thread we never touched the ref in the first place. try: return _original_asyncio_run( main, @@ -2201,7 +2196,8 @@ def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): loop_factory=loop_factory or _tracking_loop_factory, ) finally: - _active_worker_loop[0] = None + if _threading_mod.current_thread() is not _main_thread: + _active_worker_loop[0] = None _asyncio_mod.run = _tracking_asyncio_run @@ -2256,7 +2252,7 @@ def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): # large window isn't disrupted. try: _cur_cols, _cur_rows = shutil.get_terminal_size((80, 24)) - _MIN_COLS, _MIN_ROWS = 100, 30 + _MIN_COLS, _MIN_ROWS = 150, 50 if _cur_cols < _MIN_COLS or _cur_rows < _MIN_ROWS: _new_cols = max(_cur_cols, _MIN_COLS) _new_rows = max(_cur_rows, _MIN_ROWS) @@ -2281,31 +2277,14 @@ def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): # 3-row honeycomb progress widget in place rather than appending a new # row per completion. The renderer keeps track of how many lines the # previous frame consumed so the next frame overwrites the same band. + # Install a no-op progress renderer in the REPL. ``emit_progress_lines`` + # would otherwise fall back to writing the honeycomb directly to + # stderr — which lands in scrollback via patch_stdout and causes + # duplicate rows. 
The fixed ``crawl_status_window`` widget renders + # the live honeycomb directly from ``_progress_state``, so the + # scrollback path is no longer needed in REPL mode. from .theme import set_progress_renderer as _set_progress_renderer - - _progress_line_count = [0] - - def _render_progress(rendered_lines: list[str]) -> None: - from prompt_toolkit.formatted_text import ANSI, to_formatted_text - - fragments_per_line: list[list[tuple[str, str]]] = [] - for raw in rendered_lines: - try: - fragments_per_line.append(list(to_formatted_text(ANSI(raw)))) - except Exception: - fragments_per_line.append([("", raw)]) - n = len(fragments_per_line) - prev = _progress_line_count[0] - if prev > 0 and prev == n: - scrollback.replace_last_n_lines(prev, fragments_per_line) - else: - # First frame, or row-count changed (rare): append fresh and - # remember how many lines to overwrite next time. - for f in fragments_per_line: - scrollback.append_fragments(f) - _progress_line_count[0] = n - - _set_progress_renderer(_render_progress) + _set_progress_renderer(lambda _lines: None) # ── First-run API key state ───────────────────────────────────────────── # When no API key is configured we open the REPL UI in a "first-run" @@ -2356,6 +2335,23 @@ def _render_progress(rendered_lines: list[str]) -> None: # KeyboardInterrupt into the worker thread alone doesn't fire while the # thread is blocked reading the subprocess's stdout in a C-level read(). current_subprocess: list[Any] = [None] + # Monotonic timestamp of the most recent Ctrl+C while a command was + # running. Lets the next Ctrl+C escalate from SIGTERM → SIGKILL if + # the user is impatient (subprocess didn't exit within 2 s). + _last_ctrl_c_time: list[float] = [0.0] + # Queue of pending commands. Populated when ``_submit`` receives a + # buffer with newlines (typically from a multi-line paste) — only + # the first line runs immediately, the rest wait their turn. + # ``_ticker`` drains the queue once the input lock clears. 
+ _pending_commands: list[str] = [] + # ``_multiline_visible[0]`` toggles the input buffer between single- + # line and multi-line mode. Default False (single-line). Multi-line + # paste flips it True so the pasted commands stick in the buffer + # (otherwise prompt_toolkit's single-line buffer would strip the + # newlines on insert). The user can then edit each line and press + # Enter to submit the whole batch — ``_submit`` already splits + # multi-line text into the queue. Reset on submit / Ctrl+C. + _multiline_visible: list[bool] = [False] input_buffer = Buffer( history=history, @@ -2369,16 +2365,21 @@ def _render_progress(rendered_lines: list[str]) -> None: history=history, is_disabled=lambda: _first_run_needs_key[0], ), - multiline=False, + multiline=Condition(lambda: _multiline_visible[0]), read_only=Condition(lambda: is_input_locked[0]), ) - def _line_prefix(line_no, _wrap_count): - if line_no == 0: - if _first_run_needs_key[0]: - return [("class:promptmark", "API key: ")] - return [("class:promptmark", "❯ ")] - return [("", " ")] + def _line_prefix(line_no, wrap_count): + # ``❯`` marks the START of a logical command line — both the + # first line and any subsequent line introduced by an explicit + # newline (multi-line paste or Alt+Enter). Visual wraps of a + # single long command get the continuation indent instead, so + # one long command stays visually one command. + if wrap_count > 0: + return [("", " ")] + if line_no == 0 and _first_run_needs_key[0]: + return [("class:promptmark", "API key: ")] + return [("class:promptmark", "❯ ")] # While a command is in flight we collapse the input window's height to # 0 — instead of hiding it via ConditionalContainer. 
Hiding via Conditional @@ -2451,6 +2452,402 @@ def _running_text() -> list[tuple[str, str]]: filter=Condition(lambda: state.is_running), ) + # ── Bee-blurb row (only while a command is running) ───────────────────── + # A single dim italic line just above the input that alternates every + # ~5 seconds between a bee fact ("Did you know? Bees have 5 eyes.") + # and a bee verb ("pollinating…"). Adds personality during long + # scrapes / crawls without competing with the shimmering command + # line right above it. Hidden when idle so the prompt is the only + # thing below the scrollback. + def _bee_fact_text() -> list[tuple[str, str]]: + if not state.is_running: + return [] + from .theme import current_bee_blurb + + blurb = current_bee_blurb(state.tick) + return [(f"italic {BEE_DIM}", f" {blurb}")] + + # Shared FormattedTextControl that forwards wheel-scroll events to + # the scrollback buffer. Used for every fixed-area Window (banner, + # crawl status, bee facts) so the user can scroll regardless of + # where their mouse pointer is — without this, mouse events that + # land on the fixed widgets get dropped because those windows + # don't have their own scroll handler. + from prompt_toolkit.mouse_events import ( + MouseEventType as _MET, + MouseModifier as _MM, + ) + from prompt_toolkit.layout.controls import FormattedTextControl as _PTFTC + + # ── Path detection for Ctrl/Alt+Click open ─────────────────────────────── + # Matches just the *start* of a path candidate — absolute (``/``), + # home-relative (``~/``), or directory-relative (``./``, ``../``). + # The lookbehind excludes word chars and ``:`` so URLs like + # ``http://...`` don't match their ``//path`` suffix as a path + # start. From each matched start, ``_find_path_at`` greedily + # extends to end of line and trims back at whitespace / slash + # boundaries until it finds the longest substring that exists on + # disk. 
This is what lets real-world paths with spaces work — + # ``/Applications/Some App.app``, ``~/Library/Application Support/...``, + # ``/var/folders/.../Screenshot 2026-05-18 at 11.44.12 PM.png``. + _PATH_START_RE = re.compile(r"(? '\"\t" + + def _resolve_path_str(raw: str) -> str: + if raw.startswith("~/"): + return os.path.expanduser(raw) + if raw.startswith(("./", "../")): + return os.path.abspath(raw) + return raw + + def _resolve_clicked_path(raw: str) -> str | None: + """Backwards-compat single-string resolver: return the resolved + absolute path if it exists, else ``None``. + """ + resolved = _resolve_path_str(raw) + return resolved if os.path.exists(resolved) else None + + def _open_path(path: str) -> None: + """Open ``path`` with the OS default handler (Finder, Explorer, + ``xdg-open``). Non-blocking; failures are silently swallowed so + a broken handler doesn't crash the REPL. + """ + import platform + import subprocess + system = platform.system() + try: + if system == "Darwin": + subprocess.Popen(["open", path]) + elif system == "Windows": + os.startfile(path) # type: ignore[attr-defined] + else: + subprocess.Popen(["xdg-open", path]) + except Exception: + pass + + # The scrollback renderer caches the visible visual rows here so the + # click handler can find what text was at the click position without + # re-running expensive layout calculations. + _last_scrollback_view: dict[str, list] = {"rows": []} + + class _ScrollForwardingFTC(_PTFTC): + """Wheel forwarder + optional click → path opener. + + ``click_handler`` is invoked on every MOUSE_DOWN event, with or + without a modifier. When it returns ``NotImplemented`` the event + falls through to default mouse handling. 
+ """ + + _click_handler = None + + def set_click_handler(self, handler) -> None: + self._click_handler = handler + + def mouse_handler(self, mouse_event): + et = mouse_event.event_type + if et == _MET.SCROLL_UP: + scrollback.scroll_up(1) + try: + app.invalidate() + except Exception: + pass + return None + if et == _MET.SCROLL_DOWN: + scrollback.scroll_down(1) + try: + app.invalidate() + except Exception: + pass + return None + if et == _MET.MOUSE_DOWN and self._click_handler is not None: + # Plain click opens highlighted paths. The scrollback is + # read-only so a click has no other purpose there. The + # click handler returns NotImplemented when the click + # didn't land on a path, which falls through to default + # mouse handling — drag-to-select still works in Select + # mode (toggle with Shift+Tab) because that mode turns + # mouse capture off entirely. + try: + return self._click_handler(mouse_event) + except Exception: + pass + return NotImplemented + + # ── Path-existence cache for render-time linkification ─────────────────── + # Path detection runs on every invalidate (10 Hz ticker + every + # keystroke), so a naive ``os.path.exists`` per match would issue + # thousands of stat() syscalls per second. Cache the result for + # 30 s — long enough to be cheap, short enough that a file written + # during a crawl shows up as clickable within half a minute. 
+ _path_exists_cache: dict[str, tuple[float, bool]] = {} + _PATH_EXISTS_TTL = 30.0 + + def _path_exists_cached(path: str) -> bool: + now = time.monotonic() + hit = _path_exists_cache.get(path) + if hit is not None and (now - hit[0]) < _PATH_EXISTS_TTL: + return hit[1] + try: + exists = os.path.exists(path) + except Exception: + exists = False + _path_exists_cache[path] = (now, exists) + if len(_path_exists_cache) > 512: + cutoff = sorted( + _path_exists_cache.items(), key=lambda kv: kv[1][0] + )[:128] + for k, _ in cutoff: + _path_exists_cache.pop(k, None) + return exists + + def _find_path_at(text: str, start: int) -> tuple[int, str | None]: + """Greedy-then-shrink: starting at ``start``, take everything up + to end-of-line / clear delimiter, then trim back at whitespace + and slash boundaries until the substring exists on disk. + Returns ``(end_index, raw_match)`` or ``(start, None)`` if no + prefix resolves to an existing path. + """ + end = start + while end < len(text) and text[end] not in '\n\r"\'<>|`': + end += 1 + while end > start: + candidate = text[start:end].rstrip(_PATH_TRIM_CHARS) + if len(candidate) < 2: + return (start, None) + resolved = _resolve_path_str(candidate) + if _path_exists_cached(resolved): + return (start + len(candidate), candidate) + # Shrink at the rightmost of whitespace or colon — both are + # common boundaries between a real path and trailing text: + # "/tmp/foo bar baz" → trim at last space + # "/tmp/foo.py:42:10" → trim at the colon (line/col suffix) + # Then fall back to the last slash if neither produced a hit. 
+ last_space = max(candidate.rfind(" "), candidate.rfind("\t")) + last_colon = candidate.rfind(":") + last_punct = max(last_space, last_colon) + if last_punct > 0: + end = start + last_punct + continue + last_slash = candidate.rfind("/") + if last_slash > 0: + end = start + last_slash + continue + return (start, None) + return (start, None) + + def _existing_paths_in(text: str): + """Yield ``(start, end, raw)`` for every existing path substring + in ``text``. Non-overlapping; resumes scanning past each match. + """ + i = 0 + while i < len(text): + m = _PATH_START_RE.search(text, i) + if not m: + break + start = m.start() + end, raw = _find_path_at(text, start) + if raw is not None: + yield (start, end, raw) + i = end + else: + # No existing path here — advance past the ``/`` so we + # don't infinite-loop on the same candidate start. + i = m.end() + + def _scrollback_click_handler(mouse_event): + """Resolve a click on the scrollback to a path open. + Looks at the visual row at click.y and the existing path-like + substring spanning click.x — opens it if found. + """ + rows = _last_scrollback_view.get("rows") or [] + pos = mouse_event.position + y, x = pos.y, pos.x + if y < 0 or y >= len(rows): + return NotImplemented + text = "".join(t for _, t in rows[y]) + for start, end, raw in _existing_paths_in(text): + if start <= x < end: + _open_path(_resolve_path_str(raw)) + return None + return NotImplemented + + def _styled_with_links( + fragments: list[tuple[str, str]], + ) -> list[tuple[str, str]]: + """Re-emit each fragment with brand-yellow + underline applied + to any path-like substring that exists on disk. The detection + runs on the concatenated text of the row so paths split across + style boundaries (e.g. when ANSI styling colours just the + filename) still get caught. 
+ """ + if not fragments: + return fragments + text = "".join(t for _, t in fragments) + if "/" not in text and "~" not in text: + return fragments + # Build an offset map: position → (fragment_index, char_offset_in_fragment). + # Used to split fragments at path boundaries. + spans = list(_existing_paths_in(text)) + if not spans: + return fragments + # Walk fragments + spans together, splitting where needed. + out: list[tuple[str, str]] = [] + cursor = 0 # absolute offset in concatenated text + span_iter = iter(spans) + cur_span = next(span_iter, None) + for style, frag_text in fragments: + if not frag_text: + out.append((style, frag_text)) + continue + frag_end = cursor + len(frag_text) + i = 0 + while i < len(frag_text): + # Skip past consumed spans. + while cur_span is not None and cur_span[1] <= cursor + i: + cur_span = next(span_iter, None) + if cur_span is None or cur_span[0] >= frag_end: + out.append((style, frag_text[i:])) + i = len(frag_text) + break + span_start, span_end, _raw = cur_span + local_start = max(0, span_start - cursor) + local_end = min(len(frag_text), span_end - cursor) + if local_start > i: + out.append((style, frag_text[i:local_start])) + link_style = f"{style} underline fg:{BEE_YELLOW}".strip() + out.append((link_style, frag_text[local_start:local_end])) + i = local_end + cursor = frag_end + return out + + bee_fact_window = ConditionalContainer( + content=Window( + content=_ScrollForwardingFTC(_bee_fact_text), + height=D.exact(1), + ), + filter=Condition(lambda: state.is_running), + ) + + # ── Crawl status line (fixed Window, not scrollback) ──────────────────── + # Originally we rendered this via ``emit_progress_lines`` which + # APPENDS / REPLACES tail rows of the scrollback. That works for + # batch (writes between ticks are file writes), but crawl pumps + # Scrapy logs into stderr → scrollback constantly. 
Every Scrapy + # log line invalidated the "last N lines are mine" assumption, + # causing the widget to multiply into ghost copies interleaved + # with logs. A fixed layout Window sits at a known position and + # gets re-rendered each frame — no scrollback noise. + def _has_crawl_status_safe() -> bool: + try: + from .theme import has_crawl_status + return has_crawl_status() + except Exception: + return False + + def _has_active_job_status() -> bool: + """True when the fixed task widget should be visible — either a + crawl is in flight (``_crawl_status``) or a batch is reporting + progress (``_progress_state``). Used as the ConditionalContainer + filter for ``crawl_status_window``.""" + if _has_crawl_status_safe(): + return True + try: + from .theme import has_progress_state + return has_progress_state() + except Exception: + return False + + def _crawl_status_text() -> list[tuple[str, str]]: + """Build the fragments for the active-job status widget pinned + right below the (compact) banner. + + Layout: + - Honeycomb progress bar + counter, when ``_progress_state`` + is set (crawl-with-known-total or any batch). + - ``: (X fetched[, Y saved])`` line ONLY when a + crawl is in flight (``_crawl_status`` is set). Batch has no + per-item URL to show, so its widget is honeycomb-only. + """ + from . import theme as _theme # live module reference + from .theme import BEE_WHITE, format_honeycomb_grid, get_crawl_status + cs = get_crawl_status() + ps = getattr(_theme, "_progress_state", None) + if cs is None and ps is None: + return [] + + frags: list[tuple[str, str]] = [] + + # Honeycomb row when progress total is known. 
+ if ps is not None: + try: + rows = format_honeycomb_grid( + completed=ps["completed"], + total=ps["total"], + rps=ps.get("rps"), + eta=ps.get("eta"), + failure_pct=ps.get("failure_pct"), + animate=True, + ) + for i, row_text in enumerate(rows): + if i > 0 or (cs is not None): + frags.append(("", "\n")) + if i == 0 and cs is None: + # First (and usually only) honeycomb row for + # batch-only mode — no preceding \n. + pass + frags.extend(_text_to_fragments(row_text)) + if cs is not None: + # Separator between honeycomb and URL row. + frags.append(("", "\n")) + except Exception: + pass + + # URL / fetched-count line — crawl only. + if cs is not None: + phase = cs.get("phase") or "fetching" + url = cs.get("current_url") + fetched = cs.get("fetched") or 0 + saved = cs.get("saved") or 0 + if url and len(url) > 80: + url = url[:48] + "…" + url[-25:] + frags.append(("", " ")) + frags.append((f"bold {BEE_YELLOW}", f"{phase}: ")) + if url: + frags.append((BEE_WHITE, url)) + else: + frags.append((f"{BEE_DIM}", "…")) + suffix = f" ({fetched} fetched" + if saved: + suffix += f", {saved} saved" + suffix += ")" + frags.append((f"{BEE_DIM}", suffix)) + return frags + + def _crawl_status_height() -> "D": + """Compute widget height based on what's shown. 
+ Cases: + • crawl only (no progress) → 1 row (URL line) + • crawl + progress (known total) → 2 rows + • batch only (progress, no crawl URL) → 1 row (honeycomb only) + """ + cs_set = _has_crawl_status_safe() + try: + from .theme import has_progress_state + ps_set = has_progress_state() + except Exception: + ps_set = False + if cs_set and ps_set: + return D.exact(2) + return D.exact(1) + + crawl_status_window = ConditionalContainer( + content=Window( + content=_ScrollForwardingFTC(_crawl_status_text), + height=_crawl_status_height, + ), + filter=Condition(_has_active_job_status), + ) + # ── Scrollback Window — virtual buffer rendered as the top section ───── # This Window fills the vertical space above the running line / input / # toolbar. It renders whatever ScrollbackBuffer says is visible based @@ -2464,11 +2861,30 @@ def _scrollback_render() -> list[tuple[str, str]]: _app = _get_app() if getattr(_app, "is_running", False): size = _app.output.get_size() - # Reserve rows for the full banner + everything below the - # scrollback in the layout: banner_visual + spacer_top(1) + # Reserve rows for the banner + everything below the + # scrollback in the layout: banner_height + spacer_top(1) # + separator(1) + running_or_input(1) + spacer_bottom(1) - # + toolbar(1) = banner_visual + 5. - reserved = _banner_visual_height + 5 + # + toolbar(1) = banner + 5. Banner is now dynamic + # (full ASCII when idle, single line during crawl / + # batch), so we ask ``_banner_height`` for the live + # value rather than using the static visual height. + banner_h = 1 if _active_job_in_progress() else _banner_visual_height + reserved = banner_h + 5 + if state.is_running: + # bee_fact_window row above the (collapsed) input. + reserved += 1 + if _has_active_job_status(): + # The active-job status widget is pinned right under + # the banner — 2 rows when both crawl URL and + # honeycomb are shown, otherwise 1 row (URL-only + # crawl, or honeycomb-only batch). 
+ cs_set = _has_crawl_status_safe() + try: + from .theme import has_progress_state + ps_set = has_progress_state() + except Exception: + ps_set = False + reserved += 2 if (cs_set and ps_set) else 1 height = max(1, size.rows - reserved) width = max(1, size.columns) except Exception: @@ -2479,6 +2895,14 @@ def _scrollback_render() -> list[tuple[str, str]]: # full-width row never accidentally pushes the cursor onto the # next terminal row (which some terminals do at col == width). visual_rows = scrollback.get_visible_visual(height, max(1, width - 1)) + # Re-style each row so path-like substrings that exist on disk + # are rendered in brand-yellow with an underline — a visible + # affordance for the Ctrl/Alt+Click open-in-Finder feature. + visual_rows = [_styled_with_links(row) for row in visual_rows] + # Cache so the modifier+click handler on the scrollback Window + # can look up what text was at the click position without + # recomputing wrap/scroll math. + _last_scrollback_view["rows"] = visual_rows out: list[tuple[str, str]] = [] for i, row in enumerate(visual_rows): if i > 0: @@ -2486,39 +2910,14 @@ def _scrollback_render() -> list[tuple[str, str]]: out.extend(row) return out - # FormattedTextControl subclass that routes mouse wheel / trackpad - # scroll events to our virtual buffer. prompt_toolkit's default mouse - # mode (1000) captures button events but NOT motion, so the terminal - # still handles drag-select natively (or with a modifier — Option on - # Mac, Shift on most Linux terminals — depending on the terminal). - from prompt_toolkit.mouse_events import MouseEventType - from prompt_toolkit.layout.controls import FormattedTextControl as _PTFTC - - class _ScrollbackControl(_PTFTC): - def mouse_handler(self, mouse_event): - et = mouse_event.event_type - # 1 line per wheel/trackpad event keeps motion smooth — trackpads - # send a flurry of small events per gesture, so a tight step - # tracks the user's finger movement closely. 
Larger steps (3+) - # feel jumpy / snap-y. - if et == MouseEventType.SCROLL_UP: - scrollback.scroll_up(1) - try: - app.invalidate() - except Exception: - pass - return None - if et == MouseEventType.SCROLL_DOWN: - scrollback.scroll_down(1) - try: - app.invalidate() - except Exception: - pass - return None - return NotImplemented - + # The scrollback window uses the same scroll-forwarding control as + # the rest of the fixed-area widgets so a wheel event anywhere on + # screen feeds the scrollback buffer. The click_handler hook + # additionally opens existing path-like substrings on click. + _scrollback_ftc = _ScrollForwardingFTC(_scrollback_render) + _scrollback_ftc.set_click_handler(_scrollback_click_handler) scrollback_window = Window( - content=_ScrollbackControl(_scrollback_render), + content=_scrollback_ftc, # We pre-wrap content ourselves (see _split_fragments_to_width) so # each line passed to prompt_toolkit is already ≤ terminal width. # Disable prompt_toolkit's own line-wrapping so it doesn't try to @@ -2536,6 +2935,16 @@ def mouse_handler(self, mouse_event): _banner_visual_height = len(_SCRAPINGBEE_LOGO) + 5 # logo + 5 text rows def _banner_render() -> list[tuple[str, str]]: + # While a long-running command (crawl / batch scrape) is in + # flight, collapse the ASCII wordmark to a single-line + # ``ScrapingBee v1.5.0`` so the freed rows above scrollback can + # show the live task widget — URL, fetched count, honeycomb + # progress bar. The big banner returns once the run ends. + if _active_job_in_progress(): + line = Text() + line.append(" ScrapingBee ", style=f"bold {BEE_YELLOW}") + line.append(f"v{version}", style="bold white") + return _text_to_fragments(line) out: list[tuple[str, str]] = [] # SCRAPING half in brand yellow, BEE half in white — matches the # wordmark in the official brand assets. 
@@ -2563,11 +2972,47 @@ def _banner_render() -> list[tuple[str, str]]: out.append((f"{BEE_DIM}", " to quit")) return out + def _active_job_in_progress() -> bool: + """True while a crawl or batch is running — used to collapse + the banner so the live task widget gets prominent placement.""" + if _has_crawl_status_safe(): + return True + try: + from .theme import has_progress_state + return has_progress_state() + except Exception: + return False + + def _text_to_fragments(t: "Text") -> list[tuple[str, str]]: + """Render a rich Text object to the (style, text) fragment list + prompt_toolkit's ``FormattedTextControl`` expects.""" + try: + from prompt_toolkit.formatted_text import ( + ANSI as _ANSI, + to_formatted_text as _tft, + ) + from io import StringIO as _SIO + from rich.console import Console as _RC + + buf = _SIO() + _c = _RC( + file=buf, force_terminal=True, color_system="truecolor", + highlight=False, width=200, + ) + _c.print(t, end="") + return list(_tft(_ANSI(buf.getvalue()))) + except Exception: + return [("", t.plain)] + def _banner_height() -> "D": + # Compact one-liner while a crawl / batch is active; full ASCII + # banner otherwise. + if _active_job_in_progress(): + return D.exact(1) return D.exact(_banner_visual_height) banner_window = Window( - content=FormattedTextControl(_banner_render), + content=_ScrollForwardingFTC(_banner_render), height=_banner_height, wrap_lines=False, always_hide_cursor=True, @@ -2612,10 +3057,12 @@ def _hr_render() -> list[tuple[str, str]]: main_split = HSplit( [ banner_window, + crawl_status_window, scrollback_window, spacer_top, separator, running_window, + bee_fact_window, input_window, spacer_bottom, toolbar_window, @@ -2793,6 +3240,14 @@ def _finish() -> None: state.last_status = status_ref[0] state.last_duration = duration is_input_locked[0] = False + # Clear the typed ``!cmd`` only when it ran cleanly. 
A + # non-zero exit or Ctrl+C-stopped run leaves the line in the + # buffer so the user can tweak it and retry without retyping. + if status_ref[0] == "ok": + try: + input_buffer.reset() + except Exception: + pass try: app.invalidate() except Exception: @@ -2826,6 +3281,280 @@ def _worker() -> None: current_worker[0] = worker_thread worker_thread.start() + # ── Crawl execution via subprocess ────────────────────────────────────── + # Twisted's reactor is a single-shot process-wide singleton. Once + # ``reactor.run()`` has been entered and returned, the same Python + # process can never call it again. Running each crawl in a fresh + # ``python -m scrapingbee_cli.cli crawl ...`` subprocess gives us + # a brand-new reactor per crawl, so the user can issue many + # crawls in one REPL session. + # + # IPC for live status: the parent sets ``SCRAPINGBEE_CRAWL_STATUS_FILE`` + # to a per-pid path; the child's spider signal handlers atomically + # mirror ``theme._crawl_status`` to that file via + # ``_maybe_mirror_to_status_file``. A polling thread on the parent + # reads the file every 100 ms and forwards updates into the + # parent's own ``_crawl_status`` so the layout-window crawl status + # display keeps showing live URL / fetched count. + def _execute_crawl_subprocess( + crawl_args: list[str], original_line: str, echo_idx: int + ) -> None: + import os as _os + import subprocess + + output_start_index = echo_idx + start = time.monotonic() + status_ref = ["ok"] + state.is_running = True + state.running_command = "crawl" + state.running_command_text = original_line + state.run_start = start + + status_file = ( + Path.home() / ".cache" / "scrapingbee-cli" + / f"crawl-status-{_os.getpid()}.json" + ) + try: + status_file.parent.mkdir(parents=True, exist_ok=True) + except Exception: + pass + + # Clear any leftover state file from a prior run. 
+ try: + status_file.unlink() + except Exception: + pass + + # Pre-populate the parent's _crawl_status so the layout window + # shows "starting…" the instant the user submits, rather than + # waiting for the child to fire its first signal. + try: + from .theme import update_crawl_status + update_crawl_status( + current_url=None, fetched=0, queued=0, saved=0, phase="starting", + ) + except Exception: + pass + + _stop_poll = threading.Event() + + def _poll_status_file() -> None: + """Watch the child's status JSON file and forward updates + into the parent's in-memory state. 100 ms cadence so URL + + counter changes feel live in the fixed status widget. + + The payload also carries the child's progress state + (``progress_total``, ``progress_completed``) when a known + total is in play — sitemap mode, ``--max-depth 1``, or + ``--max-pages N``. That's how the parent's fixed widget + learns to show the honeycomb above the URL line. + """ + import json as _json + from .theme import update_crawl_status, update_progress_state + + last_mtime = 0.0 + while not _stop_poll.wait(0.1): + try: + stat = status_file.stat() + except FileNotFoundError: + continue + except Exception: + continue + if stat.st_mtime == last_mtime: + continue + last_mtime = stat.st_mtime + try: + with open(status_file, encoding="utf-8") as fh: + data = _json.load(fh) + update_crawl_status( + current_url=data.get("current_url"), + fetched=data.get("fetched"), + queued=data.get("queued"), + saved=data.get("saved"), + phase=data.get("phase"), + ) + pt = data.get("progress_total") + pc = data.get("progress_completed") + if isinstance(pt, int) and pt > 0 and isinstance(pc, int): + # ``update_progress_state`` no-ops on the + # scrollback render path when ``_crawl_status`` + # is set, so we just set state — the fixed + # widget will pick it up next frame. 
+ update_progress_state( + pc, + pt, + rps=data.get("progress_rps"), + eta=data.get("progress_eta"), + failure_pct=data.get("progress_failure_pct"), + ) + except Exception: + pass + try: + app.invalidate() + except Exception: + pass + + poll_thread = threading.Thread(target=_poll_status_file, daemon=True) + poll_thread.start() + + def _run() -> None: + try: + env = _os.environ.copy() + env["SCRAPINGBEE_CRAWL_STATUS_FILE"] = str(status_file) + # Mark the child as "spawned by REPL" so it can adjust + # output (no colors / no spinner) if we ever want that. + env["SCRAPINGBEE_FROM_REPL"] = "1" + cmd = [sys.executable, "-m", "scrapingbee_cli.cli"] + crawl_args + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + env=env, + ) + current_subprocess[0] = proc + try: + assert proc.stdout is not None + for chunk in iter(proc.stdout.readline, ""): + sys.stdout.write(chunk) + finally: + code = proc.wait() + current_subprocess[0] = None + if code != 0: + # ``terminate()`` from Ctrl+C exits with -SIGTERM + # (-15 on POSIX). A second Ctrl+C escalates to + # ``proc.kill()`` which exits with -SIGKILL (-9). + # Treat any of these as a deliberate stop rather + # than a failure so the footer reads ■ stopped. + if code in (-15, -9, -2): + status_ref[0] = "stopped" + else: + status_ref[0] = "fail" + err_console.print( + f" [{BEE_DIM}]exit code {code}[/]" + ) + except KeyboardInterrupt: + proc = current_subprocess[0] + if proc is not None: + try: + proc.terminate() + except Exception: + pass + err_console.print(f" [{BEE_DIM}]stopped[/]") + status_ref[0] = "stopped" + except Exception as e: + err_console.print(f" [bold {BEE_RED}]error:[/] {e}") + status_ref[0] = "fail" + + def _finish() -> None: + duration = time.monotonic() - start + state.is_running = False + state.running_command = None + state.running_command_text = None + state.run_start = None + # Stop polling BEFORE clearing in-memory state. 
Join the poll + # thread so it can't race past the event check, read the file + # one last time, and resurrect ``_crawl_status`` after we + # cleared it — the bug that left the crawl status window + # visible after Ctrl+C. + _stop_poll.set() + try: + poll_thread.join(timeout=0.5) + except Exception: + pass + try: + from .theme import clear_crawl_status, clear_progress_state + clear_crawl_status() + clear_progress_state() + except Exception: + pass + try: + status_file.unlink() + except Exception: + pass + # Splice the dim echo line above the streamed output. + try: + from prompt_toolkit.formatted_text import ( + ANSI as _ANSI, + to_formatted_text as _tft, + ) + from io import StringIO as _SIO + from rich.console import Console as _RC + + _buf = _SIO() + _c = _RC( + file=_buf, force_terminal=True, color_system="truecolor", + highlight=False, width=200, + ) + _echo_t = Text() + _echo_t.append("❯ ", style=BEE_DIM) + _echo_t.append(original_line, style=BEE_DIM) + _c.print(_echo_t, end="") + _echo_fragments = list(_tft(_ANSI(_buf.getvalue()))) + scrollback.insert_line(output_start_index, _echo_fragments) + except Exception: + pass + _print_command_footer(status_ref[0], duration) + state.last_command = "crawl" + state.last_status = status_ref[0] + state.last_duration = duration + is_input_locked[0] = False + # Buffer mutations have to run on the prompt_toolkit main + # loop thread — this ``_finish`` is on the worker thread, + # and calling ``input_buffer.reset()`` from here directly + # doesn't actually propagate to the displayed input + # (which is why the typed crawl command was still + # appearing in the prompt after ``✓ 28.10s``). Marshal + # the clear + invalidate through ``call_soon_threadsafe``. 
+ def _apply_finish_state() -> None: + if status_ref[0] == "ok": + try: + input_buffer.reset() + except Exception: + pass + try: + app.invalidate() + except Exception: + pass + + try: + loop = getattr(app, "loop", None) + if loop is not None: + loop.call_soon_threadsafe(_apply_finish_state) + else: + _apply_finish_state() + except Exception: + _apply_finish_state() + + is_input_locked[0] = True + try: + app.invalidate() + except Exception: + pass + + def _worker() -> None: + try: + _run() + finally: + current_worker[0] = None + try: + _finish() + except Exception: + state.is_running = False + state.running_command = None + state.running_command_text = None + state.run_start = None + is_input_locked[0] = False + try: + app.invalidate() + except Exception: + pass + + worker_thread = threading.Thread(target=_worker, daemon=True) + current_worker[0] = worker_thread + worker_thread.start() + # ── Command execution (synchronous, output flows via patched stdout) ──── def _execute(line: str) -> bool: """Run a single REPL submission: meta-command or click command. @@ -2903,6 +3632,14 @@ def _execute(line: str) -> bool: scrollback.insert_line(meta_echo_idx, _echo_fragments) except Exception: _echo_to_scrollback(line) + # Successful meta command — clear the input so the prompt is + # ready for the next entry. Failed parses / typos take the + # ``meta is None`` path (unknown command) which leaves the + # buffer in place for the user to edit. + try: + input_buffer.reset() + except Exception: + pass return True if meta == "quit": # belt-and-braces; key binding usually catches it return True @@ -2997,9 +3734,8 @@ def _execute(line: str) -> bool: # the bottom prompt into first-run mode instead of routing through # ``run_in_terminal`` — the suspend/resume cycle to read a key in # the bare terminal feels jarring, and the masked in-place prompt - # is the same flow the user just learned at startup. 
Variants - # like ``auth --api-key KEY`` or ``auth --unsafe`` still go - # through click normally. + # is the same flow the user just learned at startup. ``auth + # --api-key KEY`` is non-interactive and still goes through click. if cmd_name == "auth" and len(args) == 1: _echo_to_scrollback(original_line) _first_run_needs_key[0] = True @@ -3016,7 +3752,42 @@ def _execute(line: str) -> bool: pass return True - args = state.apply_settings_to_args(args) + # ``auth --unsafe`` opens a multi-step disclaimer + masked-key + # prompt that fights our alt-buffer / termios state when invoked + # through ``run_in_terminal``. The flow appears to exit the REPL + # and leaves the terminal non-reactive while it blocks on + # synchronous stdin reads. Redirect the user to run it from a + # plain shell, where its interactive prompts work correctly. + if cmd_name == "auth" and "--unsafe" in args: + _echo_to_scrollback(original_line) + err_console.print( + f" [bold {BEE_YELLOW}]auth --unsafe[/] must be run from a " + f"plain shell, not inside the REPL." + ) + err_console.print( + f" [{BEE_DIM}]exit the REPL ([bold {BEE_YELLOW}]:q[/][{BEE_DIM}]) " + f"then run:[/] [bold]scrapingbee auth --unsafe[/]" + ) + return True + + # Only inject session defaults that the target command actually + # accepts; otherwise ``:set --json-response true`` would also + # apply to ``usage``, which rejects it as an unknown option. + args = state.apply_settings_to_args( + args, accepted=set(command_flags.get(cmd_name, [])) + ) + # Let users type ``--verbose true|false`` (etc.) in the REPL + # too — same normalisation as the CLI ``main()`` entry. + try: + from .cli_utils import ( + collect_bool_flag_names, + normalize_bool_flag_args, + ) + args = normalize_bool_flag_args( + args, collect_bool_flag_names(cli_group) + ) + except Exception: + pass # Mark the scrollback position where this command's output will # start. 
We DO NOT echo here — while the command runs, only the @@ -3030,6 +3801,15 @@ def _execute(line: str) -> bool: # moment of completion, without doubling up the live shimmer. output_start_index = scrollback.current_length() + # ``crawl`` is special — Twisted's reactor is a process-wide + # singleton, so we run each crawl in a fresh subprocess to make + # multiple crawls per REPL session work. The function below + # owns the full lifecycle (worker thread, status-file polling, + # _finish), so we return immediately here. + if cmd_name == "crawl": + _execute_crawl_subprocess(args, original_line, output_start_index) + return True + start = time.monotonic() status_ref = ["ok"] state.is_running = True @@ -3249,6 +4029,10 @@ def _accept(event): def _submit(event): text = input_buffer.text stripped = text.strip() + # Whatever the user typed or pasted collapses back to a single + # line — once Enter fires we drop out of multi-line mode so + # the next prompt is single-line again. + _multiline_visible[0] = False if not stripped: # ``reset()`` clears the buffer AND the history-navigation # cursor (``working_index``). A plain set_document keeps the @@ -3268,6 +4052,13 @@ def _submit(event): input_buffer.reset() event.app.exit() return + # Multi-line submission (typically a paste of several commands): + # run the first line immediately, queue the rest. ``_ticker`` + # picks them up one at a time as soon as the input lock clears. + lines = [s for s in (ln.strip() for ln in stripped.splitlines()) if s] + if len(lines) > 1: + stripped = lines[0] + _pending_commands.extend(lines[1:]) # Persist the submitted line into the FileHistory before we kick off # execution. ``append_string`` is the right call (not # ``store_string``): the latter only writes to disk, leaving the @@ -3288,6 +4079,29 @@ def _submit(event): @kb.add("c-c") def _ctrl_c(event): + # Clear any queued multi-line commands so an aborted paste + # doesn't keep firing after the user explicitly cancels. 
+ cleared_queue = False + if _pending_commands: + n_dropped = len(_pending_commands) + _pending_commands.clear() + cleared_queue = True + err_console.print( + f" [{BEE_DIM}]cancelled {n_dropped} queued command" + f"{'s' if n_dropped != 1 else ''}[/]" + ) + # If the input buffer is currently in multi-line mode (active + # paste preview), Ctrl+C clears it and drops back to single- + # line — treated as "consumed" so we don't fall through to + # ``event.app.exit()`` below. + cleared_multiline = False + if _multiline_visible[0]: + _multiline_visible[0] = False + cleared_multiline = True + try: + input_buffer.reset() + except Exception: + pass # If a worker thread is running, Ctrl+C stops that command rather # than exiting the REPL. We try two mechanisms in parallel: # @@ -3319,16 +4133,40 @@ def _cancel_all_tasks() -> None: except Exception: pass - # If a ``!shell`` command is running, terminate the subprocess - # directly — the worker thread is blocked in a C-level read() - # on the child's stdout pipe, so a Python-level - # KeyboardInterrupt won't fire until the read returns. - # ``terminate()`` sends SIGTERM; closing the pipe also frees - # the readline() loop. + # If a Scrapy crawl is running, the worker is parked inside + # Twisted's reactor (epoll/kqueue/select in C code), so + # neither asyncio cancellation nor PyThreadState_SetAsyncExc + # reaches it. ``reactor.callFromThread`` wakes the selector + # via the reactor's self-pipe and runs ``reactor.stop()`` on + # the reactor thread — the only thread-safe way to stop a + # running Twisted reactor from outside. + try: + from .crawl import stop_running_reactor + stop_running_reactor() + except Exception: + pass + + # If a ``!shell`` command or crawl subprocess is running, + # signal the child — the worker thread is blocked in a + # C-level read() on the child's stdout pipe, so a + # Python-level KeyboardInterrupt won't fire until the read + # returns. 
First Ctrl+C sends SIGTERM (lets Scrapy write + # the manifest, preserves partial output). A SECOND Ctrl+C + # within 2 s while the child is still running escalates to + # SIGKILL — useful when a long screenshot fetch keeps + # Twisted's reactor parked in select() and SIGTERM + # processing lags behind. Standard Unix Ctrl+C convention. proc = current_subprocess[0] if proc is not None: + now = time.monotonic() + last = _last_ctrl_c_time[0] + _last_ctrl_c_time[0] = now + still_running = proc.poll() is None try: - proc.terminate() + if still_running and (now - last) < 2.0: + proc.kill() + else: + proc.terminate() except Exception: pass @@ -3354,6 +4192,11 @@ def _cancel_all_tasks() -> None: # exiting; daemon worker dies with the process. event.app.exit() return + # No worker running. If we just dropped queued commands OR + # closed a multi-line paste preview, that was the user's intent + # for this Ctrl+C — don't also exit the REPL on top of it. + if cleared_queue or cleared_multiline: + return event.app.exit() @kb.add("c-d") @@ -3385,28 +4228,102 @@ def _suggestion_at_eol() -> bool: return False def _do_accept_suggestion(event): + """Accept the entire ghost-text suggestion (bound to End).""" buf = event.current_buffer sug = buf.suggestion if sug: buf.insert_text(sug.text) - kb.add("right", filter=_suggestion_at_eol, eager=True)(_do_accept_suggestion) + def _do_accept_suggestion_word(event): + """Accept the next word of the ghost-text suggestion (bound to + Right arrow). Splits at the first space — so on a suggestion + ``scrape https://www.scrapingbee.com --premium-proxy true``, + successive Right presses accept ``scrape `` → ``https://… `` → + ``--premium-proxy `` → ``true``. End remains the shortcut for + accepting the whole thing in one keystroke. 
+ """ + buf = event.current_buffer + sug = buf.suggestion + if not sug or not sug.text: + return + text = sug.text + space_idx = text.find(" ") + if space_idx == -1: + buf.insert_text(text) + else: + buf.insert_text(text[: space_idx + 1]) + + kb.add("right", filter=_suggestion_at_eol, eager=True)(_do_accept_suggestion_word) kb.add("end", filter=_suggestion_at_eol, eager=True)(_do_accept_suggestion) - kb.add( - "tab", - filter=~has_completions & _suggestion_at_eol, - eager=True, - )(_do_accept_suggestion) _not_first_run = Condition(lambda: not _first_run_needs_key[0]) - @kb.add("tab", filter=~has_completions & ~_suggestion_at_eol & _not_first_run) + @kb.add("tab", filter=~has_completions & _not_first_run) def _tab_open(event): - # Tab opens the completion popup when no ghost suggestion is - # visible. Shift+Tab is the mode toggle. Suppressed during the - # first-run API key prompt — command-name completions are - # irrelevant there. - event.current_buffer.start_completion(select_first=False) + # Bash-style Tab behaviour with a ghost-text fallback: + # • exactly one completion match → accept inline (no popup), + # with a trailing space when the match is a command name. + # • multiple matches → open the popup WITHOUT modifying the + # buffer. We deliberately don't auto-insert the common + # prefix because doing so wipes any active ghost-text + # suggestion (the prefix change invalidates the ghost's + # attachment point). Users can pick from the popup with + # arrow keys, accept the ghost word with Right, or just + # keep typing. + # • zero matches BUT a ghost-text suggestion is visible → + # accept the next word of the suggestion (same as Right + # arrow does in our isolated suggestion handler). + # • zero matches AND no suggestion → open an empty popup + # (visual no-op). + # Wrapped in try/except so a flaky completer can't kill the + # binding handler. 
+ buf = event.current_buffer + try: + from prompt_toolkit.completion import CompleteEvent as _CE + cmps = list(buf.completer.get_completions(buf.document, _CE())) + except Exception: + buf.start_completion(select_first=False) + return + # Helper: accept the next word (up to & including next space) of + # the ghost-text suggestion, mirroring what Right arrow does. + def _accept_ghost_word() -> bool: + sug = buf.suggestion + if not sug or not sug.text: + return False + text = sug.text + space_idx = text.find(" ") + buf.insert_text(text if space_idx == -1 else text[: space_idx + 1]) + return True + + if len(cmps) == 1: + c = cmps[0] + # Is the single match REDUNDANT with what's already typed? + # E.g. typing 'scrape' then Tab — the completer yields + # Completion(text='scrape', start_position=-6), which would + # replace 'scrape' with 'scrape' (net zero text change, just + # adds a trailing space). When that happens AND a ghost + # suggestion is showing, prefer advancing into the ghost — + # that's what the user actually wants progress on. + typed_before = buf.document.text_before_cursor + replaced = ( + typed_before[c.start_position:] if c.start_position < 0 else "" + ) + if c.text == replaced and _accept_ghost_word(): + return + try: + if c.start_position < 0: + buf.delete_before_cursor(count=-c.start_position) + # Trailing space for command names; flags (start with + # ``-``) get none so ``--key=value`` is still typable. 
+ suffix = "" if c.text.startswith("-") else " " + buf.insert_text(c.text + suffix) + except Exception: + buf.start_completion(select_first=False) + return + if len(cmps) == 0: + if _accept_ghost_word(): + return + buf.start_completion(select_first=False) @kb.add("tab", filter=has_completions) def _tab_next(event): @@ -3426,12 +4343,124 @@ def _shift_tab_toggle_mode(event): def _esc(event): event.current_buffer.cancel_completion() + # ── Word-wise backward delete ───────────────────────────────────────── + # Bound to the conventional combos so muscle memory works regardless + # of OS / terminal: + # • Option+Backspace on macOS Terminal/iTerm sends ``escape`` + # followed by ``c-h`` (most common) or ``backspace`` (a few + # terminals) — we bind both. + # • Ctrl+W is the POSIX standard for ``unix-word-rubout``. + # ``find_start_of_previous_word`` returns a negative offset to the + # start of the previous word, or ``None`` when the cursor is at the + # buffer start. + def _word_delete_backward(event): + buf = event.current_buffer + pos = buf.document.find_start_of_previous_word(count=1, WORD=False) + if pos: + buf.delete_before_cursor(count=-pos) + + kb.add("escape", "backspace")(_word_delete_backward) + kb.add("escape", "c-h")(_word_delete_backward) + kb.add("c-w")(_word_delete_backward) + + # ── Disable reverse/forward incremental search ──────────────────────── + # prompt_toolkit's emacs defaults bind Ctrl+R and Ctrl+S to incremental + # history search, which writes into a hidden search buffer. Our layout + # has no SearchToolbar, so the search query renders nowhere — the user + # types into a black hole. Up/Down already walk the FileHistory, so we + # explicitly swallow the keys to avoid the broken default behaviour. 
+ @kb.add("c-r") + @kb.add("c-s") + def _disable_incremental_search(event): + pass + + # ── Manual newline insertion ─────────────────────────────────────────── + # When the user wants to compose a multi-command batch by hand + # (rather than via paste), bind Alt+Enter and Ctrl+J to "insert + # newline + flip to multi-line mode". Most terminals don't + # distinguish Shift+Enter from plain Enter, so these are the + # portable shortcuts. Plain Enter remains "submit". + def _insert_newline(event): + _multiline_visible[0] = True + event.current_buffer.insert_text("\n") + + kb.add("escape", "enter")(_insert_newline) + kb.add("c-j")(_insert_newline) + + # ── Bracketed-paste cleanup ─────────────────────────────────────────── + # Pasted text comes in two flavours: + # 1. A single command soft-wrapped by the source (IDE, doc render, + # etc.) — the wrap inserts whitespace (sometimes CR) and may + # also insert a newline. We want to flatten those back into a + # single line so the command parses correctly. + # 2. A genuine multi-command paste (e.g. ``scrape …\ncrawl …``) + # where the user intends each line to run separately via the + # ``_pending_commands`` queue. + # Heuristic: if EVERY non-empty line begins with a recognized + # command name or REPL meta-prefix (``:``, ``!``), treat as + # multi-command (keep ``\n``). Otherwise treat as soft-wrap and + # join with spaces. ``\r`` is always normalised (CR is never a + # useful separator in our buffer). + from prompt_toolkit.keys import Keys as _Keys + _command_name_set = set(command_names) + + def _looks_like_command_line(line: str) -> bool: + s = line.strip() + if not s: + return False + if s.startswith((":", "!")): + return True + first = s.split(None, 1)[0] + return first in _command_name_set + + @kb.add(_Keys.BracketedPaste) + def _bracketed_paste(event): + # Bracketed paste handler. Two modes: + # • Pasted text contains a newline → switch buffer to + # multi-line mode and insert each line. 
The user can then + # edit any line and press Enter to submit the whole batch + # — ``_submit`` already splits multi-line text and queues + # subsequent lines via ``_pending_commands``. Esc / Ctrl+C + # clear the buffer and return to single-line mode. + # • No newlines → single-line paste. Collapse runs of + # spaces/tabs to single spaces (handles soft-wrap in the + # source rendering), CR to space, and insert normally. + import re as _re + # Normalise line endings: CRLF (Windows), CR (classic Mac), and + # LF all become a single ``\n``. Treating a lone CR as a space + # would silently collapse multi-line paste into one line on + # paste from sources that use CR only. + text = event.data.replace("\r\n", "\n").replace("\r", "\n") + if "\n" in text: + non_empty = [ln.strip() for ln in text.split("\n") if ln.strip()] + if non_empty: + _multiline_visible[0] = True + # Replace any current buffer contents with the pasted + # lines (no preserving partial input — multi-line paste + # is the dominant intent). The user can edit any line + # and Enter submits the batch. + event.current_buffer.text = "\n".join(non_empty) + event.current_buffer.cursor_position = len( + event.current_buffer.text + ) + return + text = _re.sub(r"[ \t]+", " ", text) + event.current_buffer.insert_text(text) + # ── History navigation ───────────────────────────────────────────────── # Plain Up/Down navigate the FileHistory at ~/.config/scrapingbee-cli/ # .history. When the completion menu is open these keys instead # navigate the menu (prompt_toolkit's default behaviour); the # ``~has_completions`` filter ensures we don't compete. - @kb.add("up", filter=~has_completions) + # Suppressed during the first-run API key prompt — otherwise an Up + # press would inject the previous command into the (masked) API key + # field, with no visible cue that the buffer is no longer empty. 
+ # In multi-line mode (after a multi-line paste) arrow keys must + # navigate within the buffer instead of walking history, otherwise + # the user can't edit lines 2+ after pasting them. + _single_line_buffer = Condition(lambda: not _multiline_visible[0]) + + @kb.add("up", filter=~has_completions & _not_first_run & _single_line_buffer) def _history_back(event): buf = event.current_buffer # prompt_toolkit loads history asynchronously via a background @@ -3465,7 +4494,10 @@ def _history_back(event): pass buf.history_backward() - @kb.add("down", filter=~has_completions) + @kb.add( + "down", + filter=~has_completions & _not_first_run & _single_line_buffer, + ) def _history_forward(event): event.current_buffer.history_forward() @@ -3588,7 +4620,7 @@ def _toggle_mouse_mode(_event): async def _ticker(): import asyncio - from .theme import has_progress_state, tick_progress_render + from .theme import has_progress_state idle_counter = 0 # Track terminal width and trigger a fresh invalidate on resize. @@ -3600,15 +4632,31 @@ async def _ticker(): while True: await asyncio.sleep(0.1) - # Re-render the honeycomb progress widget while a batch is in - # flight so the boundary hex shimmers between completion - # events. ``tick_progress_render`` is a no-op when no batch - # state is set, so the cost is negligible when idle. - if has_progress_state(): + # Drain queued commands from a multi-line paste — only when + # the input lock is clear (previous command done) AND we're + # not in the API-key prompt. Pop one per tick so each + # command's footer renders before the next starts. + if ( + _pending_commands + and not is_input_locked[0] + and not _first_run_needs_key[0] + ): + next_cmd = _pending_commands.pop(0) try: - tick_progress_render() + if history is not None: + try: + history.append_string(next_cmd) + except Exception: + pass + _execute(next_cmd) except Exception: pass + # Trigger a frame redraw while progress is reporting so + # the honeycomb's boundary-hex shimmer animates. 
The fixed + # ``crawl_status_window`` reads progress state directly via + # ``_crawl_status_text`` on each invalidate — no separate + # scrollback rendering needed. + if has_progress_state(): try: app.invalidate() except Exception: @@ -3644,7 +4692,7 @@ async def _do_usage_refresh() -> None: import hashlib as _hashlib import json as _json - from .batch import write_usage_file_cache + from .batch import read_usage_file_cache, write_usage_file_cache from .client import Client, parse_usage from .config import BASE_URL, get_api_key @@ -3656,6 +4704,44 @@ async def _do_usage_refresh() -> None: # with the *same* key vs a different one, so the session counter # continues for the former and resets for the latter. key_hash = _hashlib.sha256(key.encode("utf-8")).hexdigest()[:16] + + # ── Cache-first fast path (REPL only) ────────────────────────── + # Sibling REPL sessions and batch/crawl pre-flight write to the + # same file cache. If the cache was refreshed within the TTL we + # can populate the + # toolbar without a live call — saving us a slot in the /usage + # rate limit. ``update_from_usage_response`` reads the same keys + # ``parse_usage`` writes, so we build a synthetic raw dict from + # the cache entry. ``current_concurrency`` isn't preserved in + # the cache, so the toolbar's `0/N` slot will lag by one tick; + # that's an acceptable trade for the rate-limit headroom. 
+ cached = read_usage_file_cache(key) + if cached is not None: + try: + max_credit = cached.get("max_api_credit") + credits = cached.get("credits") + used_credit = ( + int(max_credit) - int(credits) + if isinstance(max_credit, (int, float)) + and isinstance(credits, (int, float)) + else None + ) + synthetic = { + "max_concurrency": cached.get("max_concurrency"), + "max_api_credit": max_credit, + "used_api_credit": used_credit, + } + state.update_from_usage_response(synthetic, key_hash=key_hash) + try: + app.invalidate() + except Exception: + pass + return + except Exception: + # Cache was malformed in some unexpected way — fall + # through to the live call. + pass + try: async with Client(key, BASE_URL) as client: data, _hdrs, status_code = await client.usage(retries=1, backoff=1.0) diff --git a/src/scrapingbee_cli/theme.py b/src/scrapingbee_cli/theme.py index a42cb88..4d60ae6 100644 --- a/src/scrapingbee_cli/theme.py +++ b/src/scrapingbee_cli/theme.py @@ -1,14 +1,10 @@ -"""ScrapingBee CLI theme: colors, styled output, and flapping-bee spinner. - -The spinner shows a single-line coloured bee with flapping wings and rotating -fun status messages tailored to each command. -""" +"""ScrapingBee CLI theme: colours and styled output helpers used by the +REPL renderer.""" from __future__ import annotations import os import sys -import threading from rich.console import Console from rich.text import Text @@ -107,6 +103,415 @@ def emit_progress_lines(lines: list[str]) -> None: sys.stderr.flush() +# -- Bee facts (rotating trivia shown while a command is in flight) --------- +# Surfaced on the dim row above the input in the REPL. Kept short so they +# fit on a single line even on narrow terminals. + +BEE_FACTS: list[str] = [ + "Did you know? Bees can fly up to 15 mph.", + "Did you know? A bee visits 50–100 flowers per trip.", + "Did you know? Bees have 5 eyes — two compound, three simple.", + "Did you know? 
Honey never spoils — jars from ancient Egypt are still edible.", + "Did you know? Bees communicate by dancing — the famous waggle dance.", + "Did you know? A single hive can house up to 60,000 bees.", + "Did you know? Bees flap their wings about 200 times per second.", + "Did you know? Bees can recognize individual human faces.", + "Did you know? One bee makes about 1/12 of a teaspoon of honey in its life.", + "Did you know? Bees navigate using the sun's position in the sky.", + "Did you know? Bees pollinate about one third of the food we eat.", + "Did you know? A queen bee can lay up to 2,000 eggs per day.", + "Did you know? Worker bees are all female.", + "Did you know? Bees see ultraviolet patterns we can't.", + "Did you know? Honeycomb hexagons tile flat space using the least wax — a property mathematicians proved only in 1999.", + "Did you know? Worker bees in a hive are about 75% genetically related to each other — human siblings are only 50%.", + "Did you know? A bee's brain is the size of a sesame seed.", + "Did you know? Bees have been around for more than 100 million years — older than most flowering plants.", + "Did you know? The buzzing sound is the rapid beat of a bee's wings.", + "Did you know? Bees can sense the Earth's magnetic field.", + "Did you know? In ancient Babylon, newlyweds drank honey-wine for a month — the likely origin of the word 'honeymoon'.", + "Did you know? A queen bee can live up to 5 years; a worker, only 6 weeks in summer.", + "Did you know? Drones (male bees) have no stinger.", + "Did you know? Bees fan their wings to cool the hive on hot days.", + "Did you know? Bees can tell time using internal circadian rhythms.", + "Did you know? A foraging bee can carry nectar weighing nearly half her body weight.", + "Did you know? Bumblebees can fly in the rain.", + "Did you know? Honeybees evolved from ancient predatory wasps.", + "Did you know? A swarm of bees can contain over 50,000 individuals.", + "Did you know? 
Bees regulate hive temperature within a degree of 35°C / 95°F.", + "Did you know? The queen's pheromones hold a colony together.", + "Did you know? Bees can recognize the smell of TNT — they're used in landmine detection.", + "Did you know? Bees make beeswax from special glands on their abdomen.", + "Did you know? Royal jelly is what turns a regular larva into a queen.", + "Did you know? Bees do a 'cleansing flight' after winter to relieve themselves.", + "Did you know? Honey is naturally antibacterial.", + "Did you know? Bees can travel up to 6 miles from their hive in a single trip.", + "Did you know? A bee colony collectively visits about 2 million flowers to make one pound of honey.", + "Did you know? Bees have hair on their eyes to collect more pollen.", + "Did you know? Worker bees switch jobs as they age — nurse, builder, guard, then forager.", + "Did you know? The bee was a heraldic emblem of Napoleon's imperial regime.", + "Did you know? Honey has been found preserved in pharaohs' tombs.", + "Did you know? Bees can be trained to detect cancer in human breath.", + "Did you know? The phrase 'busy as a bee' first appeared in Chaucer's Canterbury Tales.", + "Did you know? Stingless bees exist — about 500 species worldwide.", + "Did you know? The mason bee is a far more efficient pollinator than honeybees.", + "Did you know? Bees produce six different products: honey, beeswax, pollen, propolis, royal jelly, and venom.", + "Did you know? 'Propolis' is Greek for 'before the city' — bees seal the hive entrance with it to keep out invaders.", + "Did you know? Bees prefer flowers with caffeine — it boosts their memory.", + "Did you know? Bees actually build round cells first — surface tension in the warm wax reshapes them into hexagons.", + "Did you know? Worker bees flap their wings to evaporate water from nectar, making honey.", + "Did you know? Bumblebees are excellent at 'buzz pollination' — vibrating flowers to release pollen.", + "Did you know? 
Honey's color depends on which flowers the bees visited.", + "Did you know? A bee's stomach holds 70 mg of nectar — nearly its own weight.", + "Did you know? Africanized 'killer' bees came from a 1957 lab accident in Brazil.", + "Did you know? Honeybees are not native to the Americas — they were brought from Europe.", + "Did you know? A bee's alarm pheromone smells like banana — isoamyl acetate, the very same compound.", + "Did you know? The smallest bee in the world is just 2 mm long (Perdita minima).", + "Did you know? The largest bee is Wallace's giant bee, about the length of a thumb.", + "Did you know? Foraging bees find efficient routes between flowers using simple flight-rule heuristics.", + "Did you know? Honey takes 7 days to ripen from nectar inside the hive.", + "Did you know? Bees were used in ancient warfare — Greeks catapulted hives over castle walls.", + "Did you know? Bees use 'undertakers' — workers whose job is to remove dead bees from the hive.", + "Did you know? Bees can count up to four.", + "Did you know? A single bee can produce only about half a gram of wax in her lifetime.", + "Did you know? Bumblebees can carry a load close to their own body weight in pollen and nectar.", + "Did you know? In Mycenaean Greece, priestesses of the goddess Demeter were called 'Melissai' — the bees.", + "Did you know? Mead — honey wine — may be humanity's oldest fermented drink.", + "Did you know? A worker bee can sting only once; the stinger is barbed.", + "Did you know? Honey contains hydrogen peroxide, produced by an enzyme bees add to nectar.", + "Did you know? Bees can be left-handed or right-handed when entering flowers.", + "Did you know? Beekeeping appears in Egyptian wall art dating back 4,500 years.", + "Did you know? The 'Queen of the Hive' is actually selected by worker bees in larval stage.", + "Did you know? Without bees, most almonds, blueberries, and apples wouldn't exist as we know them.", + "Did you know? 
A bee's wings beat fast enough to generate static electricity, which attracts pollen.", + "Did you know? Bees have two stomachs — one for eating, one for storing nectar.", + "Did you know? Killer bees are not particularly venomous — they're just very aggressive.", + "Did you know? Honey crystallization is normal — gentle warming returns it to liquid.", + "Did you know? Bees prefer blue, purple, and yellow flowers — red appears black to them.", + "Did you know? Nearly 90% of wild plants depend on animal pollinators, mostly bees.", + "Did you know? Bees take orientation flights before becoming foragers, memorizing landmarks.", + "Did you know? Some bee species are solitary — they don't form colonies at all.", + "Did you know? A bee scientist is called a melittologist.", + "Did you know? Bees were the totem of the Egyptian pharaohs.", + "Did you know? The Mayans practiced beekeeping with stingless Melipona bees.", + "Did you know? Bees use propolis to mummify intruders they can't carry out of the hive.", + "Did you know? In rural England, 'telling the bees' of a death in the family was tradition — leave them out and they'd reportedly abandon the hive.", + "Did you know? A queen bee mates with up to 20 drones in a single flight.", + "Did you know? Honey from different regions tastes completely different — manuka, acacia, clover, lavender.", + "Did you know? Bees can teach each other to use tools.", + "Did you know? Some bees sleep — even with their tongues sticking out.", + "Did you know? Honeycomb cells tilt slightly upward — about 13 degrees — so liquid honey doesn't drip out before it ripens.", + "Did you know? Drones die immediately after mating with the queen.", + "Did you know? Bee venom is being researched as a cancer treatment.", + "Did you know? In Slovenia, beekeeping is so culturally important it's on UNESCO's heritage list.", + "Did you know? Bees can be tracked individually using tiny radio tags.", + "Did you know? 
The waggle dance can encode distance, direction, and quality of a food source.", + "Did you know? Bees can perceive flower humidity to estimate nectar quality.", + "Did you know? Hive bees fan their wings in coordinated rows to ventilate the colony.", + "Did you know? Pollen is the bee's only source of protein.", + "Did you know? Bees are the only insects that produce food eaten by humans.", + "Did you know? Some orchids look and smell like female bees to trick males into pollinating them.", + "Did you know? Bees recognize their hive entrance by its exact location, not by smell alone.", + "Did you know? Aristotle wrote one of the earliest scientific treatises on beekeeping.", + "Did you know? The hum of a healthy hive is around 250 Hz.", + "Did you know? Bees prefer warm nectar — they're cold-blooded but warm their flight muscles to 35°C.", + "Did you know? Honey contains pinocembrin, an antioxidant studied for its links to brain health.", + "Did you know? In winter, honeybees cluster tightly and shiver their wing muscles to keep the hive warm.", + "Did you know? A worker bee's lifespan in winter is up to 6 months — much longer than summer bees.", + "Did you know? The queen bee produces over 30 different pheromones to manage the colony.", + "Did you know? A pound of honey requires bees to fly the equivalent of three orbits around Earth.", +] + + +def current_bee_fact(tick: int, period_ticks: int = 50) -> str: + """Pick a bee fact from the list, rotating once every ``period_ticks`` + ticks of the REPL's 10 Hz ticker. Default 50 → a new fact every 5s. + """ + if not BEE_FACTS: + return "" + return BEE_FACTS[(tick // max(1, period_ticks)) % len(BEE_FACTS)] + + +# -- Bee-themed action verbs (rotate in place of the static "running") ------ +# Used as the toolbar status label while a command is in flight. Plain +# -ing verbs so they slot grammatically into ``<verb> · 12.3s``. 
+ +BEE_VERBS: list[str] = [ + "pollinating", + "buzzing", + "foraging", + "gathering nectar", + "scouting flowers", + "waggle-dancing", + "tending the hive", + "building combs", + "harvesting honey", + "on the wing", + "working the field", + "humming along", + "fanning the hive", + "guarding the entrance", + "swarming", + "courting flowers", + "loading pollen baskets", + "patrolling petals", + "communing with clover", + "sipping nectar", + "weaving wax", + "circling the queen", + "ferrying nectar", + "cleaning cells", + "warming brood", + "deciphering scent trails", + "navigating by sun", + "feeding the queen", + "polishing the comb", + "humming homeward", + "tasting petals", + "marking flowers", + "scouting territories", + "buzzing through HTML", + "extracting honey", + "pollinating pages", + "harvesting data", + "chasing redirects", + "weaving CSS", + "decoding selectors", + "rendering blossoms", + "sniffing user agents", + "scrubbing trackers", +] + + +def current_bee_verb(tick: int, period_ticks: int = 25) -> str: + """Pick a bee verb from the list, rotating once every ``period_ticks`` + ticks. Default 25 → a new verb every 2.5s on the 10 Hz ticker — fast + enough to feel alive on quick scrapes, slow enough not to flicker. + """ + if not BEE_VERBS: + return "running" + return BEE_VERBS[(tick // max(1, period_ticks)) % len(BEE_VERBS)] + + +def current_bee_blurb(tick: int, period_ticks: int = 50) -> str: + """Pick the dim-row content while a command is in flight, alternating + between a "<verb>…" bee verb and a "Did you know? ..." fact every + ``period_ticks`` ticks (default 50 → a 5-second switch on the 10 Hz + ticker). The FIRST slot is always a verb so quick commands + (``usage``, ``docs``, fast scrapes) show a natural action label + rather than a flash of trivia. Subsequent slots alternate + verb → fact → verb → fact for the user to read while they wait. 
+ + The fact index and verb index are independent, so the rotation + doesn't cycle the same fact/verb pair together — the lists have + different lengths and advance on their own slot counters. + """ + slot = tick // max(1, period_ticks) + if slot % 2 == 0: + if not BEE_VERBS: + return "" + verb_idx = (slot // 2) % len(BEE_VERBS) + return BEE_VERBS[verb_idx] + "…" + if not BEE_FACTS: + return "" + fact_idx = (slot // 2) % len(BEE_FACTS) + return BEE_FACTS[fact_idx] + + +# -- Crawl live-status state (current URL, fetched count, phase) ------------ +# The Scrapy spider's signal handlers push updates here from the worker +# thread; the REPL's ticker reads them on the main thread to repaint the +# dim row above the input. ``_crawl_status`` is intentionally a plain +# dict mutation since (a) Python dict assignments are atomic and (b) the +# update pattern is single-key writes from one writer at a time, so no +# explicit lock is needed. + +_crawl_status: dict | None = None + + +def update_crawl_status( + *, + current_url: str | None = None, + fetched: int | None = None, + queued: int | None = None, + saved: int | None = None, + phase: str | None = None, +) -> None: + """Update one or more fields of the crawl status. Any field left as + ``None`` keeps its previous value (so a per-signal handler can update + just the field it knows about). + + Subprocess crawl mode: the REPL parent runs each crawl in a child + Python process so it gets a fresh Twisted reactor. The child has no + way to push into the parent's in-memory ``_crawl_status``, so when + the env var ``SCRAPINGBEE_CRAWL_STATUS_FILE`` is set we *also* + mirror the current dict to that JSON file. The parent's ticker + polls the file and forwards updates back into its own + ``_crawl_status`` so the layout window keeps showing live progress. 
+ """ + global _crawl_status # noqa: PLW0603 + if _crawl_status is None: + _crawl_status = { + "current_url": None, + "fetched": 0, + "queued": 0, + "saved": 0, + "phase": "starting", + } + if current_url is not None: + _crawl_status["current_url"] = current_url + if fetched is not None: + _crawl_status["fetched"] = fetched + if queued is not None: + _crawl_status["queued"] = queued + if saved is not None: + _crawl_status["saved"] = saved + if phase is not None: + _crawl_status["phase"] = phase + _maybe_mirror_to_status_file() + + +def _maybe_mirror_to_status_file() -> None: + """Atomic write of ``_crawl_status`` + progress state to + ``$SCRAPINGBEE_CRAWL_STATUS_FILE`` so a polling parent process sees + updates without read/write races. Atomic-rename pattern (write to + ``.tmp``, ``os.replace``) keeps the parent from ever reading a + half-flushed JSON file. + + Progress data (``_progress_state``) rides on the same payload — + that's how the parent learns about a known total (sitemap mode, + ``--max-pages N``) and can show the honeycomb bar above the URL + line in its fixed widget. 
+ """ + sf = os.environ.get("SCRAPINGBEE_CRAWL_STATUS_FILE") + if not sf: + return + if _crawl_status is None and _progress_state is None: + return + try: + import json as _json + payload: dict = {} + if _crawl_status is not None: + payload.update(_crawl_status) + if _progress_state is not None: + payload["progress_completed"] = _progress_state.get("completed") + payload["progress_total"] = _progress_state.get("total") + payload["progress_rps"] = _progress_state.get("rps") + payload["progress_eta"] = _progress_state.get("eta") + payload["progress_failure_pct"] = _progress_state.get("failure_pct") + tmp = sf + ".tmp" + with open(tmp, "w", encoding="utf-8") as fh: + _json.dump(payload, fh) + os.replace(tmp, sf) + except Exception: + pass + + +def get_crawl_status() -> dict | None: + return _crawl_status + + +def has_crawl_status() -> bool: + return _crawl_status is not None + + +def clear_crawl_status() -> None: + global _crawl_status # noqa: PLW0603 + _crawl_status = None + sf = os.environ.get("SCRAPINGBEE_CRAWL_STATUS_FILE") + if sf: + try: + os.unlink(sf) + except Exception: + pass + + +def tick_crawl_render() -> None: + """Re-render the dedicated crawl status widget in scrollback. Same + in-place mechanism as the batch honeycomb (``emit_progress_lines`` + replaces the last N lines), but rendering the crawl-specific + content: a status line with ``: (X fetched[/Y])`` + plus, when a total is known (sitemap mode), the honeycomb + progress bar above it. + + Safe to call when no crawl is in flight — early-exits if + ``_crawl_status`` is None. + """ + if _crawl_status is None: + return + import io + from rich.console import Console as _RC + + lines_text: list[Text] = [] + progress = _progress_state + if progress is not None: + # Sitemap-mode batch-style bar, identical to the batch widget. 
+ rows = format_honeycomb_grid( + completed=progress["completed"], + total=progress["total"], + rps=progress.get("rps"), + eta=progress.get("eta"), + failure_pct=progress.get("failure_pct"), + animate=True, + ) + lines_text.extend(rows) + + # Always include the live URL / fetched-count line below the bar. + status_text = Text() + status_text.append(" ") + phase = _crawl_status.get("phase") or "fetching" + url = _crawl_status.get("current_url") + fetched = _crawl_status.get("fetched") or 0 + saved = _crawl_status.get("saved") or 0 + if url and len(url) > 80: + url = url[:48] + "…" + url[-25:] + status_text.append(f"{phase}: ", style=f"bold {BEE_YELLOW}") + if url: + status_text.append(url, style=BEE_WHITE) + else: + status_text.append("…", style="dim") + status_text.append(f" ({fetched} fetched", style="dim") + if saved: + status_text.append(f", {saved} saved", style="dim") + status_text.append(")", style="dim") + lines_text.append(status_text) + + rendered: list[str] = [] + for row in lines_text: + buf = io.StringIO() + _c = _RC( + file=buf, force_terminal=True, color_system="truecolor", + highlight=False, width=200, + ) + _c.print(row, end="") + rendered.append(buf.getvalue()) + emit_progress_lines(rendered) + + +def crawl_status_line() -> str | None: + """Build a single-line status string. Kept around for any caller + that wants a one-line crawl summary; the live in-scrollback widget + uses ``tick_crawl_render`` instead. + """ + if _crawl_status is None: + return None + phase = _crawl_status.get("phase") or "fetching" + url = _crawl_status.get("current_url") + fetched = _crawl_status.get("fetched") or 0 + saved = _crawl_status.get("saved") or 0 + # Trim very long URLs so the line fits on narrow terminals — keep the + # prefix (scheme + host + start of path) and the tail (last 25 chars) + # so users can still recognise the page. 
+ if url and len(url) > 80: + url = url[:48] + "…" + url[-25:] + if url: + suffix = f" ({fetched} fetched" + if saved: + suffix += f", {saved} saved" + suffix += ")" + return f"{phase}: {url}{suffix}" + return f"{phase}… ({fetched} fetched)" + + # -- Shared progress state for the REPL ticker animation --------------------- # batch.py calls ``update_progress_state`` on each completion to record # latest counts/rates. The REPL ticker calls ``tick_progress_render`` at @@ -133,6 +538,25 @@ def update_progress_state( "eta": eta, "failure_pct": failure_pct, } + # In the crawl subprocess we hand state to the parent via the + # status file (``_maybe_mirror_to_status_file`` reads + # ``_progress_state`` alongside ``_crawl_status``). Rendering here + # would emit honeycomb rows via ``emit_progress_lines`` → the + # stderr fallback (no ``_progress_renderer`` is installed in the + # child), and the parent would then ingest those rows into + # scrollback as duplicates because each Scrapy log line displaces + # the ``replace_last_n_lines`` anchor. + if os.environ.get("SCRAPINGBEE_CRAWL_STATUS_FILE"): + _maybe_mirror_to_status_file() + return + # In the REPL parent during a crawl (``_crawl_status`` non-None), + # the fixed crawl_status widget reads ``_progress_state`` directly + # and renders the honeycomb in place. Rendering through + # ``tick_progress_render`` here would ALSO write to scrollback + # (the batch path), giving the same duplicate-rows problem the + # child fix already solved. + if _crawl_status is not None: + return tick_progress_render() @@ -228,164 +652,6 @@ def _render_inline_bee(frame_idx: int) -> Text: return text -# -- Spinner ----------------------------------------------------------------- - - -# Hex bloom — a "honey crystallising" cycle expressed as a 3-cell-wide -# animation that radiates from the centre outward. Pure geometry, no -# mascot: dot grows into a honeycomb cell, peaks at a four-pointed -# sparkle (the moment crystals form), then drains back. 
-# -# The middle cell is the focal point and stays anchored; "halo" cells -# appear and disappear symmetrically so the bloom feels like it's growing -# in all directions, not rightward. -# -# Each frame pairs a 3-character composition with a colour from a -# dim→bright→warm gradient so the eye reads a glowing, breathing shape. -# -# Frames (centre + halo, always 3 cells wide): -# " · " dust (dim grey) -# " • " speck (dim amber) -# "·⬡·" outline + halo (amber) -# "·⬢·" honeycomb + halo (bright yellow) -# "⬡✦⬡" sparkle + halo (warm yellow-orange — PEAK / crystallised) -# "·⬢·" descending -# "·⬡·" -# " • " -_HEX_BLOOM_FRAMES: list[tuple[str, str]] = [ - (" · ", "#555555"), - (" • ", "#886600"), - ("·⬡·", "#BAA000"), - ("·⬢·", "#FFCD23"), - ("⬡✦⬡", "#FFB13D"), - ("·⬢·", "#FFCD23"), - ("·⬡·", "#BAA000"), - (" • ", "#886600"), -] - -# Per-command verbs that rotate during the pulse — keep them short and active. -_PHRASES: dict[str, list[str]] = { - "scrape": ["Fetching", "Rendering", "Extracting"], - "crawl": ["Crawling", "Following links", "Discovering"], - "google": ["Searching", "Querying"], - "fast-search": ["Searching"], - "amazon-product": ["Fetching product"], - "amazon-search": ["Searching Amazon"], - "walmart-product": ["Fetching product"], - "walmart-search": ["Searching Walmart"], - "youtube-search": ["Searching"], - "youtube-metadata": ["Fetching metadata"], - "chatgpt": ["Querying", "Thinking"], - "usage": ["Checking credits"], - "sitemap": ["Fetching sitemap"], -} - -_FRAME_INTERVAL = 0.08 # seconds per frame ⇒ ~12 fps, smooth bloom -_PHRASE_DURATION_FRAMES = 30 # rotate verb every ~2.4s -_SHIMMER_DIVISOR = 2 # shimmer advances every N bloom frames - -# Shimmer palette — one bright "peak" cell sweeps across the verb, with two -# flank cells receiving softer highlights so the glim feels like a wave -# instead of a hard cursor. 
-_SHIMMER_PEAK = "#FFFFFF" -_SHIMMER_FLANK = "#FFE780" - - -def _shimmer_text(text: str, position: int, base_color: str) -> Text: - """Render `text` with a glimmer of light at `position`. - - The character at `position` is bright white; characters at ±1 are warm - light yellow; everything else uses `base_color`. Combined with a position - that advances each frame, this reads as a glow sweeping across the word. - """ - out = Text() - for i, ch in enumerate(text): - distance = abs(i - position) - if distance == 0: - style = f"bold {_SHIMMER_PEAK}" - elif distance == 1: - style = f"bold {_SHIMMER_FLANK}" - else: - style = f"bold {base_color}" - out.append(ch, style=style) - return out - - -class MiniBeeSpinner: - """Single-line pulsing-asterisk spinner with a rotating command verb. - - Usage:: - - with MiniBeeSpinner("scrape"): - await do_request() - - Renders one line: a Claude-style asterisk that blooms (· → ✻ → ·), a - short verb that rotates every ~2.4s ("Fetching" / "Rendering" / ...), - and an elapsed-time counter once the operation passes 0.5s. - """ - - def __init__(self, message: str = "") -> None: - self._label = message - # Resolve the verb cycle: per-command phrases if known, else just the - # label as a single static verb. 
- self._phrases = _PHRASES.get(message, [message] if message else ["Working"]) - self._stop = threading.Event() - self._thread: threading.Thread | None = None - - def _animate(self) -> None: - import time - - start = time.monotonic() - idx = 0 - while not self._stop.is_set(): - glyph, color = _HEX_BLOOM_FRAMES[idx % len(_HEX_BLOOM_FRAMES)] - phrase = self._phrases[(idx // _PHRASE_DURATION_FRAMES) % len(self._phrases)] - shimmer_pos = (idx // _SHIMMER_DIVISOR) % max(1, len(phrase)) - elapsed = time.monotonic() - start - - line = Text() - line.append(" ") - line.append(glyph, style=f"bold {color}") - line.append(" ") - line.append_text(_shimmer_text(phrase, shimmer_pos, BEE_YELLOW)) - if elapsed >= 0.5: - line.append(f" · {elapsed:.1f}s", style="dim") - - with err_console.capture() as capture: - err_console.print(line, end="") - sys.stderr.write("\r\033[K" + capture.get()) - sys.stderr.flush() - - idx += 1 - self._stop.wait(_FRAME_INTERVAL) - - # Clear the spinner line. - sys.stderr.write("\r\033[K") - sys.stderr.flush() - - def start(self) -> None: - # Disabled inside the REPL: the spinner's `\r`-rewrites would flow - # through patch_stdout and trigger a bottom-strip redraw on every - # frame, causing visible flicker. The REPL's toolbar conveys the - # "running" state instead. 
- if _repl_mode: - return - if not sys.stderr.isatty(): - return - self._thread = threading.Thread(target=self._animate, daemon=True) - self._thread.start() - - def stop(self) -> None: - self._stop.set() - if self._thread is not None: - self._thread.join(timeout=1) - - def __enter__(self) -> MiniBeeSpinner: - self.start() - return self - - def __exit__(self, *_: object) -> None: - self.stop() # -- Styled output helpers --------------------------------------------------- @@ -455,113 +721,6 @@ def format_progress_line( return text -# -- Live credit tracker (polls usage API during batch/crawl) ---------------- - - -class LiveCreditTracker: - """Background thread that polls the usage API every 20 seconds and prints - an updating honeycomb credit line to stderr. Only active in REPL mode. - - Usage:: - - with LiveCreditTracker(api_key, initial_remaining=33_000_000, total=50_000_000): - run_batch(...) - """ - - _POLL_INTERVAL = 20 # seconds (safe: 3× per minute, limit is 6×) - - def __init__( - self, - api_key: str, - *, - initial_remaining: int | None = None, - total: int | None = None, - ) -> None: - self._api_key = api_key - self._remaining = initial_remaining - self._total = total - self._start_remaining = initial_remaining - self._stop = threading.Event() - self._thread: threading.Thread | None = None - - # -- internal ------------------------------------------------------------ - - def _fetch(self) -> tuple[int, int] | None: - """Return (remaining, total) or None on error.""" - import asyncio - import json as _json - - from .client import Client - from .config import BASE_URL - - try: - async def _go() -> tuple[int, int] | None: - async with Client(self._api_key, BASE_URL, timeout=10) as c: - body, _, code = await c.usage() - if code == 200: - raw = _json.loads(body) - used = raw.get("used_api_credit", 0) or 0 - total = raw.get("max_api_credit", 0) or 0 - return total - used, total - return None - - return asyncio.run(_go()) - except Exception: - return None - - def 
_print_meter(self) -> None: - if self._remaining is None or self._total is None: - return - line = Text() - line.append(" ⬡ Credits: ", style=f"bold {BEE_YELLOW}") - line.append_text(format_honeycomb_meter( - self._total - self._remaining, self._total - )) - if self._start_remaining is not None: - consumed = self._start_remaining - self._remaining - if consumed > 0: - line.append(f" (−{consumed:,} this session)", style="dim") - err_console.print(line) - - def _run(self) -> None: - while not self._stop.wait(self._POLL_INTERVAL): - if self._stop.is_set(): - break - result = self._fetch() - if result: - self._remaining, self._total = result - self._print_meter() - - # -- public -------------------------------------------------------------- - - def start(self) -> None: - # Disabled inside the REPL. The REPL's bottom toolbar already shows - # credits + a usage gauge; running this thread additionally would - # repaint the bottom strip every ~0.5s via `\r`-rewrites that flow - # through patch_stdout, which is exactly what we see as flicker - # during a scrape. (Direct CLI mode — `scrapingbee scrape ...` outside - # the REPL — still gets the live meter on stderr as before.) 
- if _repl_mode: - return - # Print initial meter immediately if we have data - if self._remaining is not None: - self._print_meter() - self._thread = threading.Thread(target=self._run, daemon=True) - self._thread.start() - - def stop(self) -> None: - self._stop.set() - if self._thread is not None: - self._thread.join(timeout=2) - - def __enter__(self) -> LiveCreditTracker: - self.start() - return self - - def __exit__(self, *_: object) -> None: - self.stop() - - # -- Honeycomb credit meter -------------------------------------------------- From 51a66fd26acdedc10f73c1b1adc1c507d7ad9204 Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Wed, 20 May 2026 09:41:54 +0530 Subject: [PATCH 13/15] chore: clear ruff + ty diagnostics on REPL branch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename _CrawlerReactorAlreadyUsed → _CrawlerReactorAlreadyUsedError (N818) - Drop CamelCase import aliases and lowercase in-function constants flagged by N806/N813/N814 across interactive.py, cli_utils.py, theme.py - Route dynamic attribute access through getattr/setattr for twisted's reactor (callFromThread/stop), scrapy Spider._crawler, sys.stdout/err .buffer adapter install, and rich Console.file rebind so ty stops flagging unresolved-attribute / invalid-assignment - Import lxml.etree / lxml.html via importlib so ty resolves the compiled submodules - Pass loop_factory to asyncio.run via **kwargs (3.12+ signature) and install the wrapper via setattr to satisfy ty - Guard sys.__stdout__ None case and tighten _set_text null-check - Remove unused type:ignore comments and the now-unused shutil.get_ terminal_size assignment in interactive.py - Delete stale tests in test_crawl.py that referenced helpers removed by the pool-based screenshot crawl rewrite (_parse_discovery_links_only, _NON_HTML_URL_EXTENSIONS) --- src/scrapingbee_cli/batch.py | 3 - src/scrapingbee_cli/cli_utils.py | 9 +- src/scrapingbee_cli/crawl.py | 22 ++- 
src/scrapingbee_cli/interactive.py | 233 ++++++++++++++++------------- src/scrapingbee_cli/theme.py | 12 +- tests/unit/test_crawl.py | 163 -------------------- 6 files changed, 160 insertions(+), 282 deletions(-) diff --git a/src/scrapingbee_cli/batch.py b/src/scrapingbee_cli/batch.py index 21d27ca..f7e5b0b 100644 --- a/src/scrapingbee_cli/batch.py +++ b/src/scrapingbee_cli/batch.py @@ -20,8 +20,6 @@ from .config import BASE_URL, get_api_key from .theme import ( echo_warning, - err_console, - format_honeycomb_trail, is_repl_mode, notify_completion, print_completion_summary, @@ -809,7 +807,6 @@ def write_batch_output_to_dir( def _save_batch_meta(output_dir: str, total: int, succeeded: int, failed: int) -> None: """Save batch metadata for --resume discovery.""" import json as _json - import sys from datetime import datetime, timezone meta_path = os.path.join(output_dir, _BATCH_META_FILE) diff --git a/src/scrapingbee_cli/cli_utils.py b/src/scrapingbee_cli/cli_utils.py index 60a166f..9edc0fc 100644 --- a/src/scrapingbee_cli/cli_utils.py +++ b/src/scrapingbee_cli/cli_utils.py @@ -20,7 +20,6 @@ styled_echo, ) - _REPL_PREVIEW_MAX_LINES = 30 _REPL_PREVIEW_MAX_BYTES = 4000 @@ -137,19 +136,19 @@ def normalize_bool_flag_args( ``--verbose`` → unchanged ``--no-verbose`` → unchanged (Click's own ``--no-x`` form) """ - _TRUE = {"true", "1", "yes", "on"} - _FALSE = {"false", "0", "no", "off"} + _true = {"true", "1", "yes", "on"} + _false = {"false", "0", "no", "off"} out: list[str] = [] i = 0 while i < len(args): tok = args[i] if tok in flag_names and i + 1 < len(args): next_lv = args[i + 1].strip().lower() - if next_lv in _TRUE: + if next_lv in _true: out.append(tok) i += 2 continue - if next_lv in _FALSE: + if next_lv in _false: # Skip the flag entirely; default value applies. 
i += 2 continue diff --git a/src/scrapingbee_cli/crawl.py b/src/scrapingbee_cli/crawl.py index 0ee5d73..451e383 100644 --- a/src/scrapingbee_cli/crawl.py +++ b/src/scrapingbee_cli/crawl.py @@ -30,7 +30,7 @@ MIDDLEWARE_PRIORITY = 725 -class _CrawlerReactorAlreadyUsed(RuntimeError): +class _CrawlerReactorAlreadyUsedError(RuntimeError): """Raised when Twisted's reactor has already been started + stopped in this Python process and can't be re-used for another crawl. The REPL surfaces a friendly message asking the user to restart the @@ -53,13 +53,21 @@ def stop_running_reactor() -> bool: thread, where ``reactor.stop()`` can run safely. """ try: - from twisted.internet import reactor # type: ignore[import-not-found] + from twisted.internet import reactor except Exception: return False if not getattr(reactor, "running", False): return False try: - reactor.callFromThread(reactor.stop) + # ``callFromThread`` / ``stop`` are populated dynamically when + # the reactor is installed; the static module stub doesn't + # carry them. ``getattr`` keeps the type checker quiet without + # rerouting the runtime hot path. + cft = getattr(reactor, "callFromThread", None) + stop = getattr(reactor, "stop", None) + if cft is None or stop is None: + return False + cft(stop) return True except Exception: return False @@ -93,7 +101,7 @@ def _ensure_reactor_usable() -> None: if reactor is None: return # No reactor has been installed yet, nothing to check. if getattr(reactor, "_startedBefore", False): - raise _CrawlerReactorAlreadyUsed( + raise _CrawlerReactorAlreadyUsedError( "Crawls in this REPL session have ended. Twisted's reactor " "is single-shot per process — please run ``:q`` and relaunch " "scrapingbee to crawl again." 
@@ -458,13 +466,15 @@ def from_crawler(cls, crawler, *args, **kwargs): spider = super().from_crawler(crawler, *args, **kwargs) try: from scrapy import signals as _scrapy_signals + from .theme import is_repl_mode # Stash the crawler so signal handlers can dispatch new # requests via ``crawler.engine.crawl`` (needed from # ``spider_idle`` to flush the pool when discovery exhausts - # without saturating). - spider._crawler = crawler + # without saturating). ``Spider`` doesn't declare this slot + # so we use ``setattr`` to keep the type checker happy. + setattr(spider, "_crawler", crawler) # The pool-based discovery flow needs to flush queued URLs # at spider_idle (when discovery exhausts before reaching diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index 526ceac..b8cb600 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -32,7 +32,8 @@ import sys import threading import time -from typing import TYPE_CHECKING, Any, Iterable +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any from rich.text import Text @@ -145,9 +146,7 @@ def closed(self) -> bool: try: from prompt_toolkit.auto_suggest import AutoSuggest as _PTKAutoSuggest except Exception: # pragma: no cover — prompt_toolkit should always be present - _PTKAutoSuggest = object # type: ignore[assignment,misc] - - + _PTKAutoSuggest = object # type: ignore[misc,assignment] class BeeAutoSuggest(_PTKAutoSuggest): """Context-aware ghost-text autosuggest for the REPL prompt. 
@@ -339,9 +338,9 @@ def _make_capped_history(filename: str, max_entries: int = 10_000): try: with open(tmp, "wb") as f: for s in keep_oldest_first: - f.write(f"\n# {now}\n".encode("utf-8")) + f.write(f"\n# {now}\n".encode()) for line in s.split("\n"): - f.write(f"+{line}\n".encode("utf-8")) + f.write(f"+{line}\n".encode()) _os.replace(tmp, filename) except Exception: try: @@ -415,8 +414,12 @@ def __init__(self) -> None: self.scroll_offset = 0 self._lock = threading.Lock() - def append_fragments(self, fragments: list[tuple[str, str]]) -> None: - """Append one rendered line (already styled) as the final entry.""" + def append_fragments(self, fragments: list) -> None: + """Append one rendered line (already styled) as the final entry. + ``fragments`` is the prompt_toolkit ``StyleAndTextTuples`` + shape — either ``(style, text)`` or ``(style, text, handler)``. + Typed loosely so callers using either variant are accepted. + """ with self._lock: self.lines.append(list(fragments)) if len(self.lines) > self.MAX_LINES: @@ -425,7 +428,7 @@ def append_fragments(self, fragments: list[tuple[str, str]]) -> None: drop = self.MAX_LINES // 10 del self.lines[:drop] - def replace_last_line(self, fragments: list[tuple[str, str]]) -> None: + def replace_last_line(self, fragments: list) -> None: """Overwrite the most recent line. Used for in-place progress updates via the standard terminal ``\\r`` idiom — write ``\\r\\n`` and the previous line gets replaced rather @@ -437,9 +440,7 @@ def replace_last_line(self, fragments: list[tuple[str, str]]) -> None: else: self.lines.append(list(fragments)) - def replace_last_n_lines( - self, n: int, lines: list[list[tuple[str, str]]] - ) -> None: + def replace_last_n_lines(self, n: int, lines: list) -> None: """Replace the most recent ``n`` lines with the given ``lines``. If fewer than ``n`` lines exist, the remainder is appended. Used for multi-line in-place progress widgets (e.g. 
the @@ -586,7 +587,7 @@ def at_bottom(self) -> bool: with self._lock: return self.scroll_offset == 0 - def insert_line(self, index: int, fragments: list[tuple[str, str]]) -> None: + def insert_line(self, index: int, fragments: list) -> None: """Insert a single line at ``index`` (clamped to current length). Used to retroactively splice the command-echo line in front of a @@ -1077,9 +1078,9 @@ def render() -> list[tuple[str, str]]: # multi-step ``--js-scenario`` JSON blob) are truncated so a # single chip never overflows the toolbar line. if state.settings: - _MAX_CHIP_VALUE = 28 + _max_chip_value = 28 for k, v in state.settings.items(): - display_v = v if len(v) <= _MAX_CHIP_VALUE else v[: _MAX_CHIP_VALUE - 1] + "…" + display_v = v if len(v) <= _max_chip_value else v[: _max_chip_value - 1] + "…" fields.append([("class:toolbar.chip", f" {k}={display_v} ")]) # Hint chunk pinned bottom-right. Always shows the active mouse @@ -1099,9 +1100,9 @@ def render() -> list[tuple[str, str]]: if state.is_running: hint_chunk.append(("class:toolbar.hint", " · Ctrl+C to stop")) - LEADING = " " - SEP = " · " - PAGE_SECONDS = 5 # how long each page is displayed before rotating + _leading = " " + _sep = " · " + _page_seconds = 5 # how long each page is displayed before rotating def _seg_len(chunk: list[tuple[str, str]]) -> int: return sum(len(t) for _, t in chunk) @@ -1116,19 +1117,19 @@ def _seg_len(chunk: list[tuple[str, str]]) -> int: # Reserve room for hint + separator on every page. If the hint alone # is wider than the budget, we'll still try to render it (final # hard-truncate at the bottom of this function will clip). - field_budget = max(0, budget - hint_len - len(SEP)) + field_budget = max(0, budget - hint_len - len(_sep)) # Greedy-pack the non-hint fields into pages, each ≤ field_budget. 
pages: list[list[list[tuple[str, str]]]] = [] cur: list[list[tuple[str, str]]] = [] - cur_len = len(LEADING) + cur_len = len(_leading) for chunk in fields: chunk_len = _seg_len(chunk) - added = chunk_len + (len(SEP) if cur else 0) + added = chunk_len + (len(_sep) if cur else 0) if cur and cur_len + added > field_budget: pages.append(cur) cur = [chunk] - cur_len = len(LEADING) + chunk_len + cur_len = len(_leading) + chunk_len else: cur.append(chunk) cur_len += added @@ -1143,14 +1144,14 @@ def _seg_len(chunk: list[tuple[str, str]]) -> int: if len(pages) == 1: page_idx = 0 else: - page_idx = int(time.monotonic() / PAGE_SECONDS) % len(pages) + page_idx = int(time.monotonic() / _page_seconds) % len(pages) page = pages[page_idx] # Compose the chosen page. - segs: list[tuple[str, str]] = [("class:toolbar", LEADING)] + segs: list[tuple[str, str]] = [("class:toolbar", _leading)] for i, chunk in enumerate(page): if i > 0: - segs.append(("class:toolbar", SEP)) + segs.append(("class:toolbar", _sep)) segs.extend(chunk) # Page indicator (e.g. "1/3") trailing — only when rotating. @@ -1210,7 +1211,7 @@ def _build_application(state: SessionState, completer: Any, history_path: str): try: history = FileHistory(history_path) except Exception: - history = None # type: ignore[assignment] + history = None buffer = Buffer( history=history, @@ -1581,7 +1582,12 @@ def _open_pager(path: str) -> None: stripped = raw_text.lstrip() if stripped.startswith("<"): try: - from lxml import etree as _etree, html as _lxml_html + # lxml's compiled submodules aren't visible to static + # type checkers; import via ``importlib`` so the + # checker doesn't try to resolve them. 
+ import importlib + _etree = importlib.import_module("lxml.etree") + _lxml_html = importlib.import_module("lxml.html") tree = _lxml_html.fromstring(raw_text) pretty_text = _etree.tostring( tree, pretty_print=True, encoding="unicode", method="html" @@ -1596,7 +1602,7 @@ def _open_pager(path: str) -> None: def _set_text(s: str) -> None: buffer.set_document(Document(text=s, cursor_position=0), bypass_readonly=True) - _set_text(pretty_text if mode[0] == "pretty" else raw_text) + _set_text(pretty_text if (mode[0] == "pretty" and pretty_text is not None) else raw_text) def _current_line_count() -> int: return buffer.document.line_count @@ -2134,11 +2140,8 @@ def run_repl(cli_group: Any, version: str, *, keep_bg: bool = False) -> None: import click from prompt_toolkit.application import Application from prompt_toolkit.application.run_in_terminal import run_in_terminal - from prompt_toolkit.auto_suggest import AutoSuggestFromHistory from prompt_toolkit.buffer import Buffer - from prompt_toolkit.document import Document from prompt_toolkit.filters import Condition, has_completions - from prompt_toolkit.history import FileHistory from prompt_toolkit.key_binding import KeyBindings from prompt_toolkit.layout import Layout from prompt_toolkit.layout.containers import ConditionalContainer, HSplit, Window @@ -2189,17 +2192,23 @@ def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): # Same guard on the cleanup side — only clear the worker-loop # ref if THIS call was a worker-thread call. If we're on the main # thread we never touched the ref in the first place. + # ``loop_factory`` was added to ``asyncio.run`` in Python 3.12; + # we pass it through ``**kwargs`` so the call works on both 3.11 + # (no kwarg) and 3.12+, and so the type checker doesn't reject + # the kwarg against the older stub. 
try: - return _original_asyncio_run( - main, - debug=debug, - loop_factory=loop_factory or _tracking_loop_factory, - ) + kwargs: dict = {"debug": debug} + factory = loop_factory or _tracking_loop_factory + kwargs["loop_factory"] = factory + return _original_asyncio_run(main, **kwargs) finally: if _threading_mod.current_thread() is not _main_thread: _active_worker_loop[0] = None - _asyncio_mod.run = _tracking_asyncio_run + # Monkey-patch asyncio.run via setattr so the type checker doesn't + # complain about the wrapper's slightly broader signature (it also + # accepts ``loop_factory`` for forward-compat with Python 3.12+). + setattr(_asyncio_mod, "run", _tracking_asyncio_run) # ── Click tree introspection ──────────────────────────────────────────── command_help, command_flags, bool_flags, choice_flags = _walk_click_tree(cli_group) @@ -2220,7 +2229,7 @@ def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): try: history = _make_capped_history(history_path, max_entries=10_000) except Exception: - history = None # type: ignore[assignment] + history = None completer = _make_completer( command_names, command_flags, bool_flags, choice_flags, command_help @@ -2252,10 +2261,10 @@ def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): # large window isn't disrupted. try: _cur_cols, _cur_rows = shutil.get_terminal_size((80, 24)) - _MIN_COLS, _MIN_ROWS = 150, 50 - if _cur_cols < _MIN_COLS or _cur_rows < _MIN_ROWS: - _new_cols = max(_cur_cols, _MIN_COLS) - _new_rows = max(_cur_rows, _MIN_ROWS) + _min_cols, _min_rows = 150, 50 + if _cur_cols < _min_cols or _cur_rows < _min_rows: + _new_cols = max(_cur_cols, _min_cols) + _new_rows = max(_cur_rows, _min_rows) sys.stdout.write(f"\033[8;{_new_rows};{_new_cols}t") sys.stdout.flush() except Exception: @@ -2270,7 +2279,6 @@ def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): # surface visible (their explicit ask: "when scraping banner should # not disappear"). 
scrollback = ScrollbackBuffer() - rows = shutil.get_terminal_size((80, 24)).lines # kept for API-key prompt sizing # ── Multi-line in-place progress renderer ─────────────────────────────── # Wired so batch operations (``scrape --input-file ...``) can update a @@ -2303,11 +2311,12 @@ def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): # path doesn't work yet — patch_stdout isn't installed until # ``app.run()`` starts). try: - from io import StringIO as _SIO - from rich.console import Console as _RC + from io import StringIO + + from rich.console import Console - _buf = _SIO() - _c = _RC( + _buf = StringIO() + _c = Console( file=_buf, force_terminal=True, color_system="truecolor", highlight=False, width=shutil.get_terminal_size((80, 24)).columns, ) @@ -2473,11 +2482,10 @@ def _bee_fact_text() -> list[tuple[str, str]]: # where their mouse pointer is — without this, mouse events that # land on the fixed widgets get dropped because those windows # don't have their own scroll handler. + from prompt_toolkit.layout.controls import FormattedTextControl from prompt_toolkit.mouse_events import ( - MouseEventType as _MET, - MouseModifier as _MM, + MouseEventType, ) - from prompt_toolkit.layout.controls import FormattedTextControl as _PTFTC # ── Path detection for Ctrl/Alt+Click open ─────────────────────────────── # Matches just the *start* of a path candidate — absolute (``/``), @@ -2490,8 +2498,8 @@ def _bee_fact_text() -> list[tuple[str, str]]: # disk. This is what lets real-world paths with spaces work — # ``/Applications/Some App.app``, ``~/Library/Application Support/...``, # ``/var/folders/.../Screenshot 2026-05-18 at 11.44.12 PM.png``. - _PATH_START_RE = re.compile(r"(? '\"\t" + _path_start_re = re.compile(r"(? 
'\"\t" def _resolve_path_str(raw: str) -> str: if raw.startswith("~/"): @@ -2519,7 +2527,7 @@ def _open_path(path: str) -> None: if system == "Darwin": subprocess.Popen(["open", path]) elif system == "Windows": - os.startfile(path) # type: ignore[attr-defined] + getattr(os, "startfile")(path) # noqa: B009 else: subprocess.Popen(["xdg-open", path]) except Exception: @@ -2530,7 +2538,7 @@ def _open_path(path: str) -> None: # re-running expensive layout calculations. _last_scrollback_view: dict[str, list] = {"rows": []} - class _ScrollForwardingFTC(_PTFTC): + class _ScrollForwardingFTC(FormattedTextControl): """Wheel forwarder + optional modifier+click → path opener. ``click_handler`` is invoked on MOUSE_DOWN events that carry a @@ -2545,21 +2553,21 @@ def set_click_handler(self, handler) -> None: def mouse_handler(self, mouse_event): et = mouse_event.event_type - if et == _MET.SCROLL_UP: + if et == MouseEventType.SCROLL_UP: scrollback.scroll_up(1) try: app.invalidate() except Exception: pass return None - if et == _MET.SCROLL_DOWN: + if et == MouseEventType.SCROLL_DOWN: scrollback.scroll_down(1) try: app.invalidate() except Exception: pass return None - if et == _MET.MOUSE_DOWN and self._click_handler is not None: + if et == MouseEventType.MOUSE_DOWN and self._click_handler is not None: # Plain click opens highlighted paths. The scrollback is # read-only so a click has no other purpose there. The # click handler returns NotImplemented when the click @@ -2580,12 +2588,12 @@ def mouse_handler(self, mouse_event): # 30 s — long enough to be cheap, short enough that a file written # during a crawl shows up as clickable within half a minute. 
_path_exists_cache: dict[str, tuple[float, bool]] = {} - _PATH_EXISTS_TTL = 30.0 + _path_exists_ttl = 30.0 def _path_exists_cached(path: str) -> bool: now = time.monotonic() hit = _path_exists_cache.get(path) - if hit is not None and (now - hit[0]) < _PATH_EXISTS_TTL: + if hit is not None and (now - hit[0]) < _path_exists_ttl: return hit[1] try: exists = os.path.exists(path) @@ -2611,7 +2619,7 @@ def _find_path_at(text: str, start: int) -> tuple[int, str | None]: while end < len(text) and text[end] not in '\n\r"\'<>|`': end += 1 while end > start: - candidate = text[start:end].rstrip(_PATH_TRIM_CHARS) + candidate = text[start:end].rstrip(_path_trim_chars) if len(candidate) < 2: return (start, None) resolved = _resolve_path_str(candidate) @@ -2641,7 +2649,7 @@ def _existing_paths_in(text: str): """ i = 0 while i < len(text): - m = _PATH_START_RE.search(text, i) + m = _path_start_re.search(text, i) if not m: break start = m.start() @@ -2823,7 +2831,7 @@ def _crawl_status_text() -> list[tuple[str, str]]: frags.append((f"{BEE_DIM}", suffix)) return frags - def _crawl_status_height() -> "D": + def _crawl_status_height() -> D: """Compute widget height based on what's shown. 
Cases: • crawl only (no progress) → 1 row (URL line) @@ -2983,19 +2991,22 @@ def _active_job_in_progress() -> bool: except Exception: return False - def _text_to_fragments(t: "Text") -> list[tuple[str, str]]: + def _text_to_fragments(t: Text) -> list: """Render a rich Text object to the (style, text) fragment list prompt_toolkit's ``FormattedTextControl`` expects.""" try: + from io import StringIO + from prompt_toolkit.formatted_text import ( ANSI as _ANSI, + ) + from prompt_toolkit.formatted_text import ( to_formatted_text as _tft, ) - from io import StringIO as _SIO - from rich.console import Console as _RC + from rich.console import Console - buf = _SIO() - _c = _RC( + buf = StringIO() + _c = Console( file=buf, force_terminal=True, color_system="truecolor", highlight=False, width=200, ) @@ -3004,7 +3015,7 @@ def _text_to_fragments(t: "Text") -> list[tuple[str, str]]: except Exception: return [("", t.plain)] - def _banner_height() -> "D": + def _banner_height() -> D: # Compact one-liner while a crawl / batch is active; full ASCII # banner otherwise. if _active_job_in_progress(): @@ -3215,15 +3226,18 @@ def _finish() -> None: state.run_start = None # Splice the dim echo line above the streamed output. try: + from io import StringIO + from prompt_toolkit.formatted_text import ( ANSI as _ANSI, + ) + from prompt_toolkit.formatted_text import ( to_formatted_text as _tft, ) - from io import StringIO as _SIO - from rich.console import Console as _RC + from rich.console import Console - _buf = _SIO() - _c = _RC( + _buf = StringIO() + _c = Console( file=_buf, force_terminal=True, color_system="truecolor", highlight=False, width=200, ) @@ -3350,6 +3364,7 @@ def _poll_status_file() -> None: learns to show the honeycomb above the URL line. """ import json as _json + from .theme import update_crawl_status, update_progress_state last_mtime = 0.0 @@ -3475,15 +3490,18 @@ def _finish() -> None: pass # Splice the dim echo line above the streamed output. 
try: + from io import StringIO + from prompt_toolkit.formatted_text import ( ANSI as _ANSI, + ) + from prompt_toolkit.formatted_text import ( to_formatted_text as _tft, ) - from io import StringIO as _SIO - from rich.console import Console as _RC + from rich.console import Console - _buf = _SIO() - _c = _RC( + _buf = StringIO() + _c = Console( file=_buf, force_terminal=True, color_system="truecolor", highlight=False, width=200, ) @@ -3597,8 +3615,14 @@ def _execute(line: str) -> bool: # main-screen scrollback. if line.strip().lower().startswith(":view"): try: - sys.__stdout__.write("\x1b[?1049h") - sys.__stdout__.flush() + # ``sys.__stdout__`` is ``Optional[TextIO]`` in the + # stdlib stubs; in practice it's set for any TTY + # invocation we care about. Guard against the rare + # None case rather than juggling type-ignores. + out = sys.__stdout__ + if out is not None: + out.write("\x1b[?1049h") + out.flush() except Exception: pass try: @@ -3613,14 +3637,17 @@ def _execute(line: str) -> bool: # printed during its run. Fall back to appending if the # rich-render or insert path fails. try: + from io import StringIO + from prompt_toolkit.formatted_text import ( ANSI as _ANSI, + ) + from prompt_toolkit.formatted_text import ( to_formatted_text as _tft, ) - from io import StringIO as _SIO - from rich.console import Console as _RC - _buf = _SIO() - _c = _RC( + from rich.console import Console + _buf = StringIO() + _c = Console( file=_buf, force_terminal=True, color_system="truecolor", highlight=False, width=200, ) @@ -3679,14 +3706,17 @@ def _execute(line: str) -> bool: return True # Echo the typed line above whatever error we just printed. 
try: + from io import StringIO + from prompt_toolkit.formatted_text import ( ANSI as _ANSI, + ) + from prompt_toolkit.formatted_text import ( to_formatted_text as _tft, ) - from io import StringIO as _SIO - from rich.console import Console as _RC - _buf = _SIO() - _c = _RC( + from rich.console import Console + _buf = StringIO() + _c = Console( file=_buf, force_terminal=True, color_system="truecolor", highlight=False, width=200, ) @@ -3869,15 +3899,18 @@ def _finish() -> None: # rows past that index belong to this command. Inserting at # that index puts the echo right above its output. try: + from io import StringIO + from prompt_toolkit.formatted_text import ( ANSI as _ANSI, + ) + from prompt_toolkit.formatted_text import ( to_formatted_text as _tft, ) - from io import StringIO as _SIO - from rich.console import Console as _RC + from rich.console import Console - _buf = _SIO() - _c = _RC( + _buf = StringIO() + _c = Console( file=_buf, force_terminal=True, color_system="truecolor", highlight=False, width=200, ) @@ -4017,7 +4050,7 @@ def _worker() -> None: return True # ── Key bindings ──────────────────────────────────────────────────────── - _QUIT_TOKENS = {":q", ":quit", "exit", "quit", "q"} + _quit_tokens = {":q", ":quit", "exit", "quit", "q"} kb = KeyBindings() @@ -4048,7 +4081,7 @@ def _submit(event): input_buffer.reset() _handle_first_run_key(stripped, text) return - if stripped.lower() in _QUIT_TOKENS: + if stripped.lower() in _quit_tokens: input_buffer.reset() event.app.exit() return @@ -4279,8 +4312,8 @@ def _tab_open(event): # binding handler. 
buf = event.current_buffer try: - from prompt_toolkit.completion import CompleteEvent as _CE - cmps = list(buf.completer.get_completions(buf.document, _CE())) + from prompt_toolkit.completion import CompleteEvent + cmps = list(buf.completer.get_completions(buf.document, CompleteEvent())) except Exception: buf.start_completion(select_first=False) return @@ -4788,7 +4821,7 @@ def _signal_refresh_from_thread() -> None: scheduled 30s tick. """ try: - loop = app.loop # type: ignore[attr-defined] + loop = app.loop if loop is not None: loop.call_soon_threadsafe(_refresh_event.set) except Exception: @@ -4830,19 +4863,19 @@ def _on_buffer_write() -> None: sb_writer = ScrollbackWriter(scrollback, on_write=_on_buffer_write) original_stdout, original_stderr = sys.stdout, sys.stderr - sys.stdout = sb_writer # type: ignore[assignment] - sys.stderr = sb_writer # type: ignore[assignment] + sys.stdout = sb_writer + sys.stderr = sb_writer # Some callers (cli_utils.write_output) call ``sys.stdout.buffer.write(bytes)``. # Expose a binary-decoding adapter so those routes still land in our # scrollback as text. Truly binary output is decoded with errors=replace. if not hasattr(sys.stdout, "buffer"): - sys.stdout.buffer = _BinaryAdapter(sys.stdout) # type: ignore[attr-defined] + setattr(sys.stdout, "buffer", _BinaryAdapter(sys.stdout)) if not hasattr(sys.stderr, "buffer"): - sys.stderr.buffer = _BinaryAdapter(sys.stderr) # type: ignore[attr-defined] + setattr(sys.stderr, "buffer", _BinaryAdapter(sys.stderr)) # err_console (rich.Console used by theme.py) caches a file= reference # at module import time — point it at our buffer too. 
_orig_err_console_file = err_console.file - err_console.file = sb_writer # type: ignore[assignment] + setattr(err_console, "file", sb_writer) try: app.run(pre_run=_pre_run) finally: diff --git a/src/scrapingbee_cli/theme.py b/src/scrapingbee_cli/theme.py index 4d60ae6..32926bf 100644 --- a/src/scrapingbee_cli/theme.py +++ b/src/scrapingbee_cli/theme.py @@ -74,7 +74,7 @@ def is_repl_mode() -> bool: # in REPL mode it overwrites the previous frame; outside the REPL it # falls back to printing the lines normally. -_progress_renderer = None # type: ignore[var-annotated] +_progress_renderer = None def set_progress_renderer(fn) -> None: @@ -439,7 +439,8 @@ def tick_crawl_render() -> None: if _crawl_status is None: return import io - from rich.console import Console as _RC + + from rich.console import Console lines_text: list[Text] = [] progress = _progress_state @@ -478,7 +479,7 @@ def tick_crawl_render() -> None: rendered: list[str] = [] for row in lines_text: buf = io.StringIO() - _c = _RC( + _c = Console( file=buf, force_terminal=True, color_system="truecolor", highlight=False, width=200, ) @@ -585,12 +586,13 @@ def tick_progress_render() -> None: animate=True, ) import io - from rich.console import Console as _RC + + from rich.console import Console rendered: list[str] = [] for row in rows: buf = io.StringIO() - _c = _RC( + _c = Console( file=buf, force_terminal=True, color_system="truecolor", highlight=False, width=200, ) diff --git a/tests/unit/test_crawl.py b/tests/unit/test_crawl.py index 0f93d1b..e5cc582 100644 --- a/tests/unit/test_crawl.py +++ b/tests/unit/test_crawl.py @@ -3,7 +3,6 @@ from __future__ import annotations from scrapingbee_cli.crawl import ( - _NON_HTML_URL_EXTENSIONS, _body_from_json_response, _extract_hrefs_from_body, _extract_hrefs_from_response, @@ -187,83 +186,6 @@ def test_html_links_via_css(self): assert "https://other.com/b" in hrefs -class TestSpiderDiscovery: - """Tests for the double-fetch discovery mechanism in 
GenericScrapingBeeSpider.""" - - def _make_response(self, url: str, body: bytes, depth: int = 0): - """Create a Scrapy HtmlResponse with request meta attached.""" - from scrapy.http import HtmlResponse, Request - - response = HtmlResponse(url, body=body, encoding="utf-8") - response.request = Request(url, meta={"depth": depth}) - return response - - def test_parse_yields_discovery_request_when_no_links(self): - """parse() must yield exactly one discovery request when the body has no links.""" - from scrapy_scrapingbee import ScrapingBeeRequest - - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - start_urls=["https://example.com"], - scrape_params={"return_page_text": True}, - output_dir=None, - ) - response = self._make_response("https://example.com/page", b"Plain text, no links") - requests = list(spider.parse(response)) - - assert len(requests) == 1 - assert isinstance(requests[0], ScrapingBeeRequest) - assert requests[0].callback == spider._parse_discovery_links_only - assert requests[0].dont_filter is True - - def test_parse_does_not_yield_discovery_when_links_found(self): - """parse() must not yield a discovery request when the body already has links.""" - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - start_urls=["https://example.com"], - scrape_params={}, - output_dir=None, - ) - spider.seen_urls.add("https://example.com") - - response = self._make_response( - "https://example.com", - b'link1link2', - ) - requests = list(spider.parse(response)) - - # No request should target the discovery callback - for req in requests: - assert req.callback != spider._parse_discovery_links_only - - def test_parse_discovery_links_only_follows_links_but_does_not_save(self, tmp_path): - """_parse_discovery_links_only must yield follow requests but never write files.""" - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - 
start_urls=["https://example.com"], - scrape_params={"return_page_text": True}, - output_dir=str(tmp_path), - ) - spider.seen_urls.add("https://example.com") - - response = self._make_response( - "https://example.com", - b'p1p2', - ) - requests = list(spider._parse_discovery_links_only(response)) - - # Should yield follow requests (not empty) - assert len(requests) > 0 - # Each follow request must use the main parse callback (not discovery again) - for req in requests: - assert req.callback == spider.parse - # Nothing written — discovery does not save - assert list(tmp_path.iterdir()) == [] - - class TestSpiderSaveResponse: """Tests for _save_response manifest field extraction.""" @@ -427,91 +349,6 @@ def test_return_page_markdown_does_not_require_discovery(self): assert _requires_discovery_phase({"return_page_markdown": "true"}) is False -class TestNonHtmlUrlExtensions: - """Tests for the _NON_HTML_URL_EXTENSIONS set and its use in parse().""" - - def test_image_extensions_are_binary(self): - for ext in ("jpg", "jpeg", "png", "gif", "webp", "svg", "ico"): - assert ext in _NON_HTML_URL_EXTENSIONS, f"{ext!r} should be in _NON_HTML_URL_EXTENSIONS" - - def test_download_extensions_are_binary(self): - for ext in ("pdf", "zip"): - assert ext in _NON_HTML_URL_EXTENSIONS - - def test_web_asset_extensions_are_binary(self): - for ext in ("css", "js"): - assert ext in _NON_HTML_URL_EXTENSIONS - - def test_html_like_extensions_not_in_set(self): - # These can contain links and must NOT be skipped - for ext in ("html", "htm", "asp", "aspx", "php", "xml", "md", "txt", "json"): - assert ext not in _NON_HTML_URL_EXTENSIONS, ( - f"{ext!r} must not be in _NON_HTML_URL_EXTENSIONS" - ) - - def _make_response(self, url: str, body: bytes, depth: int = 0): - from scrapy.http import HtmlResponse, Request - - response = HtmlResponse(url, body=body, encoding="utf-8") - response.request = Request(url, meta={"depth": depth}) - return response - - def 
test_parse_skips_discovery_for_image_url(self): - """parse() must NOT yield a discovery request when the URL is a known binary type.""" - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - start_urls=["https://example.com"], - scrape_params={"extract_rules": '{"price": ".price"}'}, - output_dir=None, - ) - # Simulate fetching a JPEG URL that returns no links (binary body) - response = self._make_response( - "https://example.com/hero.jpg", - b"\xff\xd8\xff\xe0", # JPEG magic bytes - ) - requests = list(spider.parse(response)) - # Must yield nothing — no discovery re-request for binary URLs - assert requests == [], f"Expected no requests for binary URL, got {requests}" - - def test_parse_still_fires_discovery_for_html_url_with_no_links(self): - """parse() must still yield a discovery request for HTML-like URLs with no links.""" - from scrapy_scrapingbee import ScrapingBeeRequest - - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - start_urls=["https://example.com"], - scrape_params={"extract_rules": '{"price": ".price"}'}, - output_dir=None, - ) - # JSON response body (from extract_rules) has no links - response = self._make_response( - "https://example.com/product", # no binary extension → should fire discovery - b'{"price": "$9.99"}', - ) - requests = list(spider.parse(response)) - assert len(requests) == 1 - assert isinstance(requests[0], ScrapingBeeRequest) - assert requests[0].callback == spider._parse_discovery_links_only - - def test_parse_skips_discovery_for_css_url(self): - """CSS files never contain HTML links — discovery must be skipped.""" - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - start_urls=["https://example.com"], - scrape_params={}, - output_dir=None, - ) - response = self._make_response( - "https://example.com/styles/main.css", - b"body { color: red; }", - ) - requests = 
list(spider.parse(response)) - assert requests == [] - - class TestExtractHrefsExceptionHandling: """Tests that _extract_hrefs_from_response handles non-HTML gracefully.""" From 5c3cada02e6dd5e082d4d7d9e09affa66011765c Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Wed, 20 May 2026 09:49:55 +0530 Subject: [PATCH 14/15] chore: apply ruff format on src/tests CI runs `ruff format --check src tests` in addition to `ruff check`; 8 files were flagged. Apply ruff format so the Lint job passes. --- src/scrapingbee_cli/batch.py | 16 +- src/scrapingbee_cli/cli.py | 1 + src/scrapingbee_cli/cli_utils.py | 29 +- src/scrapingbee_cli/commands/auth.py | 1 + src/scrapingbee_cli/commands/scrape.py | 4 +- src/scrapingbee_cli/crawl.py | 68 ++-- src/scrapingbee_cli/interactive.py | 454 ++++++++++++------------- src/scrapingbee_cli/theme.py | 19 +- 8 files changed, 291 insertions(+), 301 deletions(-) diff --git a/src/scrapingbee_cli/batch.py b/src/scrapingbee_cli/batch.py index f7e5b0b..fff83e0 100644 --- a/src/scrapingbee_cli/batch.py +++ b/src/scrapingbee_cli/batch.py @@ -414,13 +414,11 @@ def get_batch_usage(api_key_flag: str | None) -> dict: key = get_api_key(api_key_flag) try: from .theme import is_repl_mode + _in_repl = is_repl_mode() except Exception: _in_repl = False - cache_opt_in = ( - _in_repl - or os.environ.get("SCRAPINGBEE_USAGE_CACHE") == "1" - ) + cache_opt_in = _in_repl or os.environ.get("SCRAPINGBEE_USAGE_CACHE") == "1" if cache_opt_in: cached = read_usage_file_cache(key) if cached is not None: @@ -571,6 +569,7 @@ async def run_batch_async( if is_repl_mode() and show_progress and total > 0: try: from .theme import update_progress_state + update_progress_state(0, total, rps=None, eta=None, failure_pct=None) except Exception: pass @@ -628,9 +627,13 @@ async def run_one(i: int, inp: str) -> tuple[int, BatchResult]: # AND the REPL ticker will keep re-rendering at ~10 Hz # so the boundary hex shimmers between completions. 
from .theme import update_progress_state + update_progress_state( - completed, total, - rps=rps_val, eta=eta_val, failure_pct=fail_pct, + completed, + total, + rps=rps_val, + eta=eta_val, + failure_pct=fail_pct, ) else: parts = [f"[{completed}/{total}]"] @@ -655,6 +658,7 @@ async def run_one(i: int, inp: str) -> tuple[int, BatchResult]: if is_repl_mode(): try: from .theme import clear_progress_state + clear_progress_state() except Exception: pass diff --git a/src/scrapingbee_cli/cli.py b/src/scrapingbee_cli/cli.py index 4b97aee..b4d5cc5 100644 --- a/src/scrapingbee_cli/cli.py +++ b/src/scrapingbee_cli/cli.py @@ -196,6 +196,7 @@ def main() -> None: # options behave like the scraping-side ones (--render-js, etc.). try: from .cli_utils import collect_bool_flag_names, normalize_bool_flag_args + _bool_flags = collect_bool_flag_names(cli) sys.argv[1:] = normalize_bool_flag_args(sys.argv[1:], _bool_flags) except Exception: diff --git a/src/scrapingbee_cli/cli_utils.py b/src/scrapingbee_cli/cli_utils.py index 9edc0fc..3719b7e 100644 --- a/src/scrapingbee_cli/cli_utils.py +++ b/src/scrapingbee_cli/cli_utils.py @@ -71,16 +71,13 @@ def _maybe_repl_preview(data: bytes) -> tuple[bytes, str | None, str | None]: full_path = None line_count = data.count(b"\n") + 1 - if ( - len(data) <= _REPL_PREVIEW_MAX_BYTES - and line_count <= _REPL_PREVIEW_MAX_LINES - ): + if len(data) <= _REPL_PREVIEW_MAX_BYTES and line_count <= _REPL_PREVIEW_MAX_LINES: # Small enough to print inline — but the cache is still fresh. return data, None, None text = data.decode("utf-8", errors="replace") lines = text.split("\n") - line_preview = "\n".join(lines[: _REPL_PREVIEW_MAX_LINES]) + line_preview = "\n".join(lines[:_REPL_PREVIEW_MAX_LINES]) # Decide whether to truncate by lines or by chars. 
Single-line minified # HTML/JSON would have line_preview == text but len > byte cap; truncate by @@ -89,20 +86,15 @@ def _maybe_repl_preview(data: bytes) -> tuple[bytes, str | None, str | None]: preview = text[:_REPL_PREVIEW_MAX_BYTES] more_chars = len(text) - len(preview) truncation_note = ( - f"showing first {_REPL_PREVIEW_MAX_BYTES:,} chars · " - f"+{more_chars:,} more chars" + f"showing first {_REPL_PREVIEW_MAX_BYTES:,} chars · +{more_chars:,} more chars" ) else: preview = line_preview more_lines = max(0, len(lines) - _REPL_PREVIEW_MAX_LINES) shown = min(_REPL_PREVIEW_MAX_LINES, len(lines)) - truncation_note = ( - f"showing {shown}/{len(lines):,} lines · +{more_lines:,} more lines" - ) + truncation_note = f"showing {shown}/{len(lines):,} lines · +{more_lines:,} more lines" - summary = ( - f"… preview truncated · {_format_bytes(len(data))} · {truncation_note}" - ) + summary = f"… preview truncated · {_format_bytes(len(data))} · {truncation_note}" return preview.encode("utf-8"), summary, full_path @@ -126,9 +118,7 @@ def collect_bool_flag_names(cli_group: click.Group) -> set[str]: return flags -def normalize_bool_flag_args( - args: list[str], flag_names: set[str] -) -> list[str]: +def normalize_bool_flag_args(args: list[str], flag_names: set[str]) -> list[str]: """Pre-parse boolean flags so they accept an explicit true/false value in addition to the bare flag form: ``--verbose true`` → ``--verbose`` (value dropped, flag kept) @@ -1737,9 +1727,7 @@ def write_output( from scrapingbee_cli.credits import ESTIMATED_CREDITS if command in ESTIMATED_CREDITS: - echo_key_value( - "Credit Cost (estimated)", str(ESTIMATED_CREDITS[command]) - ) + echo_key_value("Credit Cost (estimated)", str(ESTIMATED_CREDITS[command])) echo_separator() else: click.echo(f"HTTP Status: {status_code}", err=True) @@ -1793,8 +1781,7 @@ def write_output( # must not have extra bytes appended. 
if preview_data and not preview_data.endswith(b"\n"): is_text = ( - preview_data[:1] in (b"{", b"[", b"<", b"#") - or b"\x00" not in preview_data[:512] + preview_data[:1] in (b"{", b"[", b"<", b"#") or b"\x00" not in preview_data[:512] ) if is_text: click.echo() diff --git a/src/scrapingbee_cli/commands/auth.py b/src/scrapingbee_cli/commands/auth.py index ac6ea2b..0cc879c 100644 --- a/src/scrapingbee_cli/commands/auth.py +++ b/src/scrapingbee_cli/commands/auth.py @@ -97,6 +97,7 @@ def _run_check() -> tuple[int, bytes]: try: asyncio.get_running_loop() from concurrent.futures import ThreadPoolExecutor + with ThreadPoolExecutor(max_workers=1) as pool: status, data = pool.submit(_run_check).result() except RuntimeError: diff --git a/src/scrapingbee_cli/commands/scrape.py b/src/scrapingbee_cli/commands/scrape.py index b19881a..3b04f1c 100644 --- a/src/scrapingbee_cli/commands/scrape.py +++ b/src/scrapingbee_cli/commands/scrape.py @@ -725,9 +725,7 @@ async def _single() -> None: verbose=obj["verbose"], ) else: - data, resp_headers, status_code = await client.scrape( - scrape_url, **scrape_kwargs - ) + data, resp_headers, status_code = await client.scrape(scrape_url, **scrape_kwargs) if not scrape_kwargs.get("transparent_status_code") and status_code >= 400: if is_repl_mode(): echo_error(f"Error: HTTP {status_code}") diff --git a/src/scrapingbee_cli/crawl.py b/src/scrapingbee_cli/crawl.py index 451e383..cfba286 100644 --- a/src/scrapingbee_cli/crawl.py +++ b/src/scrapingbee_cli/crawl.py @@ -97,6 +97,7 @@ def _ensure_reactor_usable() -> None: in a subprocess; that's a follow-up.) """ import sys as _sys + reactor = _sys.modules.get("twisted.internet.reactor") if reactor is None: return # No reactor has been installed yet, nothing to check. 
@@ -139,6 +140,7 @@ def _target_url_from_request(request) -> str: if "app.scrapingbee.com" in raw and "url=" in raw: try: from urllib.parse import parse_qs, unquote, urlparse + qs = parse_qs(urlparse(raw).query) target = qs.get("url", [None])[0] if target: @@ -174,6 +176,7 @@ def _install_signal_handlers() -> bool: """ try: from .theme import is_repl_mode + return not is_repl_mode() except Exception: return True @@ -199,6 +202,7 @@ def _maybe_set_repl_log_file(settings) -> str | None: """ try: from .theme import is_repl_mode + in_repl = is_repl_mode() or os.environ.get("SCRAPINGBEE_FROM_REPL") == "1" if not in_repl: return None @@ -209,6 +213,7 @@ def _maybe_set_repl_log_file(settings) -> str | None: settings.set("LOG_FILE_APPEND", False) # fresh log per run try: import logging as _logging + _logging.getLogger("py.warnings").setLevel(_logging.ERROR) except Exception: pass @@ -216,10 +221,12 @@ def _maybe_set_repl_log_file(settings) -> str | None: except Exception: return None + # 0 means unlimited DEFAULT_MAX_DEPTH = 0 DEFAULT_MAX_PAGES = 0 + def _normalize_url(url: str) -> str: """Strip fragment and trailing slash for deduplication.""" parsed = urlparse(url) @@ -480,9 +487,7 @@ def from_crawler(cls, crawler, *args, **kwargs): # at spider_idle (when discovery exhausts before reaching # ``max_pages``). Wire this regardless of REPL mode — it's # a credit-saving optimisation, not a UI feature. - crawler.signals.connect( - spider._on_spider_idle, signal=_scrapy_signals.spider_idle - ) + crawler.signals.connect(spider._on_spider_idle, signal=_scrapy_signals.spider_idle) # Register signal handlers when running inside the REPL # (legacy in-process path) OR when the parent REPL spawned @@ -490,9 +495,7 @@ def from_crawler(cls, crawler, *args, **kwargs): # new subprocess-per-crawl path). The handlers themselves # call ``update_crawl_status`` which atomically mirrors # state to the file if the env var is set. 
- _want_status = is_repl_mode() or bool( - os.environ.get("SCRAPINGBEE_CRAWL_STATUS_FILE") - ) + _want_status = is_repl_mode() or bool(os.environ.get("SCRAPINGBEE_CRAWL_STATUS_FILE")) if _want_status: crawler.signals.connect( spider._on_spider_opened, signal=_scrapy_signals.spider_opened @@ -520,8 +523,13 @@ def from_crawler(cls, crawler, *args, **kwargs): def _on_spider_opened(self, spider) -> None: try: from .theme import update_crawl_status, update_progress_state + update_crawl_status( - current_url=None, fetched=0, queued=0, saved=0, phase="discovering", + current_url=None, + fetched=0, + queued=0, + saved=0, + phase="discovering", ) # If we already know the total (sitemap mode), seed the # progress widget at 0/total so the user sees the bar from @@ -535,6 +543,7 @@ def _on_request_scheduled(self, request, spider) -> None: try: self._queued_count += 1 from .theme import update_crawl_status + update_crawl_status(queued=self._queued_count) except Exception: pass @@ -542,6 +551,7 @@ def _on_request_scheduled(self, request, spider) -> None: def _on_request_reached(self, request, spider) -> None: try: from .theme import update_crawl_status + # Scrapy sees the outgoing proxy URL # (``app.scrapingbee.com/api/v1/?api_key=…&url=…``) — that's # leaky (API key) and not what the user thinks of as "their" @@ -556,6 +566,7 @@ def _on_response_received(self, response, request, spider) -> None: try: self._response_count += 1 from .theme import update_crawl_status, update_progress_state + update_crawl_status( fetched=self._response_count, saved=self._save_count, @@ -572,6 +583,7 @@ def _on_response_received(self, response, request, spider) -> None: def _on_spider_closed(self, spider, reason) -> None: try: from .theme import clear_crawl_status, clear_progress_state + clear_crawl_status() clear_progress_state() except Exception: @@ -611,9 +623,7 @@ def _on_spider_idle(self, spider) -> None: return self._discovery_done = True budget = ( - min(self.max_pages, len(self._save_queue)) 
- if self.max_pages - else len(self._save_queue) + min(self.max_pages, len(self._save_queue)) if self.max_pages else len(self._save_queue) ) for url in self._save_queue[:budget]: self._save_pending += 1 @@ -624,6 +634,7 @@ def _on_spider_idle(self, spider) -> None: if self._save_pending > 0: self._save_pending -= 1 from scrapy.exceptions import DontCloseSpider + raise DontCloseSpider def _push_saved_status(self) -> None: @@ -648,11 +659,13 @@ def _push_saved_status(self) -> None: """ try: from .theme import update_crawl_status + update_crawl_status(saved=self._save_count) except Exception: pass if self.max_pages != 0 and self._save_count >= self.max_pages: from scrapy.exceptions import CloseSpider + raise CloseSpider("max_pages") def _on_request_error(self, failure) -> None: @@ -808,6 +821,7 @@ def _iter_follow_urls(self, response: Response) -> Any: if self.max_depth != 0 and depth >= self.max_depth: return from urllib.parse import unquote as _unquote + for href in _extract_hrefs_from_response(response): if not href or href.startswith(("#", "mailto:", "javascript:")): continue @@ -857,10 +871,7 @@ def _iter_follow_requests( """ # max_pages = max saved pages. Stop queueing follow-ups once # the budget (already-saved + in-flight saves) is committed. - if ( - self.max_pages != 0 - and self._save_count + self._save_pending >= self.max_pages - ): + if self.max_pages != 0 and self._save_count + self._save_pending >= self.max_pages: return for full_url, next_depth in self._iter_follow_urls(response): yield ScrapingBeeRequest( @@ -915,9 +926,7 @@ def parse(self, response: Response, **kwargs: object) -> Any: except Exception: hrefs = [] if hrefs: - yield from self._iter_follow_requests( - response, dict(self.scrape_params), self.parse - ) + yield from self._iter_follow_requests(response, dict(self.scrape_params), self.parse) def _parse_crawl_and_save(self, response: Response, **kwargs: object) -> Any: """Discovery-first callback. 
Two flows live here: @@ -945,12 +954,9 @@ def _parse_crawl_and_save(self, response: Response, **kwargs: object) -> Any: if not binary_mode: # ── HTML save-pattern flow (unchanged) ─────────────────── - save_this = (self._save_re is None) or bool( - self._save_re.search(response.url) - ) + save_this = (self._save_re is None) or bool(self._save_re.search(response.url)) within_cap = ( - self.max_pages == 0 - or self._save_count + self._save_pending < self.max_pages + self.max_pages == 0 or self._save_count + self._save_pending < self.max_pages ) if save_this and within_cap: try: @@ -1065,9 +1071,7 @@ def _on_save_error(self, failure) -> None: and self._save_queue_next < len(self._save_queue) and self._save_count + self._save_pending < self.max_pages ): - engine = getattr( - getattr(self, "_crawler", None), "engine", None - ) + engine = getattr(getattr(self, "_crawler", None), "engine", None) if engine is not None: url = self._save_queue[self._save_queue_next] self._save_queue_next += 1 @@ -1231,6 +1235,7 @@ def run_project_spider( autothrottle_enabled=autothrottle_enabled, ) from .theme import is_repl_mode as _is_repl_mode + _repl_log_active = _is_repl_mode() or os.environ.get("SCRAPINGBEE_FROM_REPL") == "1" if _repl_log_active: # Verbose file log, quiet stream — see run_urls_spider for why. @@ -1238,14 +1243,14 @@ def run_project_spider( log_path = _maybe_set_repl_log_file(settings) if log_path: click.echo( - f"REPL mode: full crawl log → {log_path} " - f"(use `:view crawl` to scroll through it)", + f"REPL mode: full crawl log → {log_path} (use `:view crawl` to scroll through it)", err=True, ) _ensure_reactor_usable() process = CrawlerProcess(settings) if _repl_log_active: import logging as _logging + for _h in _logging.getLogger().handlers: if isinstance(_h, _logging.FileHandler): continue @@ -1321,6 +1326,7 @@ def run_urls_spider( # the handlers (see below). 
Outside REPL there's no file log, so the # stream handler picks up LOG_LEVEL directly — keep that at WARNING. from .theme import is_repl_mode as _is_repl_mode + _repl_log_active = _is_repl_mode() or os.environ.get("SCRAPINGBEE_FROM_REPL") == "1" settings.set("LOG_LEVEL", "INFO" if _repl_log_active else "WARNING") if max_pages > 0: @@ -1333,16 +1339,13 @@ def run_urls_spider( # ``max_pages × N`` where N depends on how many hrefs a typical # page exposes. Set the framework cap to a generous multiple # so it never fires before the spider's own cap stops queuing. - use_discovery_flow = bool(save_pattern) or _requires_discovery_phase( - scrape_params or {} - ) + use_discovery_flow = bool(save_pattern) or _requires_discovery_phase(scrape_params or {}) framework_cap = max_pages * 20 if use_discovery_flow else max_pages settings.set("CLOSESPIDER_PAGECOUNT", framework_cap) log_path = _maybe_set_repl_log_file(settings) if log_path: click.echo( - f"REPL mode: full crawl log → {log_path} " - f"(use `:view crawl` to scroll through it)", + f"REPL mode: full crawl log → {log_path} (use `:view crawl` to scroll through it)", err=True, ) _ensure_reactor_usable() @@ -1355,6 +1358,7 @@ def run_urls_spider( # file stays verbose while stderr stays clean. 
if _repl_log_active: import logging as _logging + for _h in _logging.getLogger().handlers: if isinstance(_h, _logging.FileHandler): continue diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py index b8cb600..a0dd8b8 100644 --- a/src/scrapingbee_cli/interactive.py +++ b/src/scrapingbee_cli/interactive.py @@ -47,39 +47,39 @@ # Refined palette # --------------------------------------------------------------------------- -_AMBER = "#E5A800" # frame border / soft accent -_GREEN = "#22C55E" # success -_DIM2 = "#555555" # darker chrome (toolbar labels, hint) -_BG_CHIP = "#1a1400" # chip background (settings) -_URL_CYAN = "#7DD3FC" # URLs in input lexer +_AMBER = "#E5A800" # frame border / soft accent +_GREEN = "#22C55E" # success +_DIM2 = "#555555" # darker chrome (toolbar labels, hint) +_BG_CHIP = "#1a1400" # chip background (settings) +_URL_CYAN = "#7DD3FC" # URLs in input lexer _STYLE_DICT = { # Top/bottom horizontal rules around the input - "rule": _AMBER, + "rule": _AMBER, # Prompt mark inside the input area - "promptmark": f"{BEE_YELLOW} bold", + "promptmark": f"{BEE_YELLOW} bold", # Lexer (input syntax highlighting). Specific categories have explicit # colours; unstyled tokens fall through to the application's default # style (key `""`), which is set per-session in `_style_dict_for`. 
- "lexer.cmd": f"{BEE_YELLOW} bold", - "lexer.flag": _AMBER, - "lexer.url": _URL_CYAN, - "lexer.string": _GREEN, + "lexer.cmd": f"{BEE_YELLOW} bold", + "lexer.flag": _AMBER, + "lexer.url": _URL_CYAN, + "lexer.string": _GREEN, # Bottom toolbar - "toolbar": f"{BEE_DIM}", - "toolbar.label": _DIM2, - "toolbar.value": f"{BEE_YELLOW} bold", - "toolbar.ok": f"{_GREEN} bold", - "toolbar.fail": f"{BEE_RED} bold", - "toolbar.hint": _DIM2, - "toolbar.chip": f"bg:{_BG_CHIP} {BEE_YELLOW}", - "toolbar.gauge": f"{BEE_YELLOW}", + "toolbar": f"{BEE_DIM}", + "toolbar.label": _DIM2, + "toolbar.value": f"{BEE_YELLOW} bold", + "toolbar.ok": f"{_GREEN} bold", + "toolbar.fail": f"{BEE_RED} bold", + "toolbar.hint": _DIM2, + "toolbar.chip": f"bg:{_BG_CHIP} {BEE_YELLOW}", + "toolbar.gauge": f"{BEE_YELLOW}", # Completion menu - "completion-menu": f"bg:{_BG_CHIP}", - "completion-menu.completion": f"bg:{_BG_CHIP} {BEE_YELLOW}", - "completion-menu.completion.current": f"bg:{BEE_YELLOW} #000000 bold", - "completion-menu.meta.completion": f"bg:{_BG_CHIP} #886600", - "completion-menu.meta.completion.current": f"bg:{BEE_YELLOW} #000000", + "completion-menu": f"bg:{_BG_CHIP}", + "completion-menu.completion": f"bg:{_BG_CHIP} {BEE_YELLOW}", + "completion-menu.completion.current": f"bg:{BEE_YELLOW} #000000 bold", + "completion-menu.meta.completion": f"bg:{_BG_CHIP} #886600", + "completion-menu.meta.completion.current": f"bg:{BEE_YELLOW} #000000", "auto-suggestion": "fg:#777777 italic", } @@ -147,6 +147,8 @@ def closed(self) -> bool: from prompt_toolkit.auto_suggest import AutoSuggest as _PTKAutoSuggest except Exception: # pragma: no cover — prompt_toolkit should always be present _PTKAutoSuggest = object # type: ignore[misc,assignment] + + class BeeAutoSuggest(_PTKAutoSuggest): """Context-aware ghost-text autosuggest for the REPL prompt. 
@@ -240,9 +242,8 @@ def get_suggestion(self, buffer, document): # or a valid PREFIX of one — otherwise we'd risk surfacing # history junk for a clear typo (the user's explicit ask). first_is_known = first in self._command_flags - first_is_prefix = ( - not first_is_known - and any(c.startswith(first) for c in self._command_names) + first_is_prefix = not first_is_known and any( + c.startswith(first) for c in self._command_names ) if not (first_is_known or first_is_prefix): return None @@ -253,7 +254,7 @@ def get_suggestion(self, buffer, document): self._refresh_history() for line in self._cached_lines: if line.startswith(text) and line != text: - return Suggestion(line[len(text):]) + return Suggestion(line[len(text) :]) # 2) No matching history line. Suggest from the structured # options (command names, flags, choice values). @@ -262,14 +263,11 @@ def get_suggestion(self, buffer, document): on_first = (len(words) == 1) and not has_trailing_space if on_first: - cands = [ - c for c in self._command_names - if c.startswith(last) and c != last - ] + cands = [c for c in self._command_names if c.startswith(last) and c != last] if not cands: return None best = self._rank_by_recency(cands)[0] - return Suggestion(best[len(last):]) + return Suggestion(best[len(last) :]) # Multi-word — need a recognised command to suggest structure. 
if not first_is_known: @@ -283,23 +281,22 @@ def get_suggestion(self, buffer, document): if not cands: return None best = self._rank_by_recency(cands)[0] - return Suggestion(best[len(last):]) + return Suggestion(best[len(last) :]) if len(words) >= 2: prev = words[-2] if prev in self._choice_flags: cands = [ - v for v in self._choice_flags[prev] - if v.startswith(last) and v != last + v for v in self._choice_flags[prev] if v.startswith(last) and v != last ] if not cands: return None best = self._rank_by_recency(cands)[0] - return Suggestion(best[len(last):]) + return Suggestion(best[len(last) :]) if prev in self._bool_flags: for v in ("true", "false"): if v.startswith(last.lower()) and v != last.lower(): - return Suggestion(v[len(last):]) + return Suggestion(v[len(last) :]) return None return None except Exception: @@ -449,7 +446,7 @@ def replace_last_n_lines(self, n: int, lines: list) -> None: with self._lock: if len(self.lines) >= n and n > 0: # Replace tail in place — same count, no shift. - self.lines[len(self.lines) - n:] = [list(f) for f in lines] + self.lines[len(self.lines) - n :] = [list(f) for f in lines] else: # Not enough prior lines to replace; append. for f in lines: @@ -509,9 +506,7 @@ def flush_pending(self) -> None: fragments = [("", pending)] self.append_fragments(fragments) - def get_visible_window( - self, height: int - ) -> list[list[tuple[str, str]]]: + def get_visible_window(self, height: int) -> list[list[tuple[str, str]]]: """Backwards-compatible: visible slice in *logical* lines.""" with self._lock: total = len(self.lines) @@ -524,9 +519,7 @@ def get_visible_window( start = max(0, end - height) return [list(line) for line in self.lines[start:end]] - def get_visible_visual( - self, height: int, width: int - ) -> list[list[tuple[str, str]]]: + def get_visible_visual(self, height: int, width: int) -> list[list[tuple[str, str]]]: """Return visible content in *visual rows* (post-wrap). 
Long single lines that wrap to multiple terminal rows are @@ -566,9 +559,7 @@ def scroll_up(self, n: int = 1) -> None: # Soft cap — get_visible_window will further clamp based on # the actual rendered height, but capping here at total-1 # avoids letting offset grow unboundedly between renders. - self.scroll_offset = min( - max(0, len(self.lines) - 1), self.scroll_offset + n - ) + self.scroll_offset = min(max(0, len(self.lines) - 1), self.scroll_offset + n) def scroll_down(self, n: int = 1) -> None: with self._lock: @@ -659,7 +650,7 @@ def writable(self) -> bool: # Used for the live "running command" line above the input. A bright white # "peak" cell sweeps across the line, flanked by warm-yellow cells, with the # rest in brand yellow — reads as a glow running along the command text. -_SHIMMER_PEAK_PT = "#FFFFFF" +_SHIMMER_PEAK_PT = "#FFFFFF" _SHIMMER_FLANK_PT = "#FFE780" @@ -688,9 +679,9 @@ def _shimmer_pt(text: str, position: int, base_color: str) -> list[tuple[str, st # --------------------------------------------------------------------------- -def _walk_click_tree(cli_group: Any) -> tuple[ - dict[str, str], dict[str, list[str]], set[str], dict[str, list[str]] -]: +def _walk_click_tree( + cli_group: Any, +) -> tuple[dict[str, str], dict[str, list[str]], set[str], dict[str, list[str]]]: """Return (command_help, command_flags, bool_flags, choice_flags).""" import click @@ -735,12 +726,12 @@ class SessionState: def __init__(self) -> None: self.last_command: str | None = None - self.last_status: str | None = None # "ok" | "fail" + self.last_status: str | None = None # "ok" | "fail" self.last_duration: float | None = None # Live account state — surfaced in the toolbar. None ⇒ unknown / N/A. 
- self.credits: int | None = None # available = max - used - self.credits_total: int | None = None # max_api_credit - self.used_credits: int | None = None # used_api_credit (latest) + self.credits: int | None = None # available = max - used + self.credits_total: int | None = None # max_api_credit + self.used_credits: int | None = None # used_api_credit (latest) self.used_credits_at_start: int | None = None # snapshotted after first ok refresh self.max_concurrency: int | None = None self.current_concurrency: int | None = None @@ -759,7 +750,7 @@ def __init__(self) -> None: self.running_command: str | None = None self.running_command_text: str | None = None # full line as typed self.run_start: float | None = None - self.tick: int = 0 # frame counter for the shimmer position + self.tick: int = 0 # frame counter for the shimmer position # Mouse mode toggle: "scroll" = mouse_support on (wheel scrolls the # virtual buffer, drag-select needs a per-terminal modifier); # "select" = mouse_support off (native drag-select works everywhere @@ -871,9 +862,7 @@ def session_credits_used(self) -> int | None: def seconds_until_next_refresh(self) -> int | None: if self.last_usage_refresh_mono is None: return None - remaining = ( - self.last_usage_refresh_mono + self.USAGE_REFRESH_INTERVAL - time.monotonic() - ) + remaining = self.last_usage_refresh_mono + self.USAGE_REFRESH_INTERVAL - time.monotonic() return max(0, int(remaining + 0.999)) # ceil so the countdown never shows -1 @@ -953,11 +942,7 @@ def get_line(lineno: int): tokens.append(("class:lexer.flag", piece)) elif piece.startswith(("http://", "https://")): tokens.append(("class:lexer.url", piece)) - elif ( - len(piece) > 1 - and piece[0] in ("'", '"') - and piece[-1] == piece[0] - ): + elif len(piece) > 1 and piece[0] in ("'", '"') and piece[-1] == piece[0]: tokens.append(("class:lexer.string", piece)) else: # Inherit the app default style (`""`), which is set @@ -1008,6 +993,7 @@ def render() -> list[tuple[str, str]]: pass if 
not width: import shutil + width = shutil.get_terminal_size((80, 24)).columns segs: list[tuple[str, str]] = [("class:toolbar", " ")] @@ -1022,10 +1008,12 @@ def render() -> list[tuple[str, str]]: if state.is_running and state.run_start is not None: elapsed = time.monotonic() - state.run_start - fields.append([ - ("class:toolbar.label", "Elapsed "), - ("class:toolbar.value", f"{elapsed:.1f}s"), - ]) + fields.append( + [ + ("class:toolbar.label", "Elapsed "), + ("class:toolbar.value", f"{elapsed:.1f}s"), + ] + ) # Available Credits avail: list[tuple[str, str]] = [("class:toolbar.label", "Available Credits ")] @@ -1042,9 +1030,7 @@ def render() -> list[tuple[str, str]]: fields.append(avail) # Used (Current Session) - used_chunk: list[tuple[str, str]] = [ - ("class:toolbar.label", "Used (Current Session) ") - ] + used_chunk: list[tuple[str, str]] = [("class:toolbar.label", "Used (Current Session) ")] scu = state.session_credits_used if state.api_key_set else None used_chunk.append( ("class:toolbar.value", _format_credits(scu) if scu is not None else "N/A") @@ -1064,10 +1050,12 @@ def render() -> list[tuple[str, str]]: if state.api_key_set: nxt = state.seconds_until_next_refresh if nxt is not None: - fields.append([ - ("class:toolbar.label", "Next Update "), - ("class:toolbar.value", f"{nxt}s"), - ]) + fields.append( + [ + ("class:toolbar.label", "Next Update "), + ("class:toolbar.value", f"{nxt}s"), + ] + ) # (Removed "last cmd" indicator — the typed command and its # ✓/✗ footer are already visible in the scrollback echo, so a @@ -1093,9 +1081,7 @@ def render() -> list[tuple[str, str]]: hint_text = "type `auth` to set API key" hint_chunk: list[tuple[str, str]] = [("class:toolbar.hint", hint_text)] else: - mode_label = ( - "Scroll mode" if state.mouse_mode == "scroll" else "Select mode" - ) + mode_label = "Scroll mode" if state.mouse_mode == "scroll" else "Select mode" hint_chunk = [("class:toolbar.value", mode_label)] if state.is_running: 
hint_chunk.append(("class:toolbar.hint", " · Ctrl+C to stop")) @@ -1346,9 +1332,7 @@ def _escape_menu(event): # Combined "SCRAPING BEE" wordmark on a single row of letterforms — 6 # lines tall, ~90 cols wide. Replaces the prior 4-row smblock SCRAPING # + 6-row BEE stack (10 logo rows) with this single 6-row version. -_SCRAPINGBEE_LOGO = [ - " " + s + " " + b for s, b in zip(_SCRAPING_LETTERS, _BEE_LETTERS) -] +_SCRAPINGBEE_LOGO = [" " + s + " " + b for s, b in zip(_SCRAPING_LETTERS, _BEE_LETTERS)] # Column at which "BEE" begins inside each combined row, used by the # pinned banner renderer to split the row into a yellow "SCRAPING" half # and a white "BEE" half. @@ -1387,9 +1371,7 @@ def _render_banner(version: str) -> str: for line in _SCRAPINGBEE_LOGO: left = line[:_BEE_OFFSET] right = line[_BEE_OFFSET:] - c.print( - f"[bold {BEE_YELLOW}]{left}[/][bold white]{right}[/]" - ) + c.print(f"[bold {BEE_YELLOW}]{left}[/][bold white]{right}[/]") c.print() # Version c.print(f" [bold {BEE_YELLOW}]v{version}[/]") @@ -1454,15 +1436,14 @@ def _print_row(cmd: str, desc: str) -> None: err_console.print() groups = { - "Pages": ["scrape", "crawl"], - "Search": ["google", "fast-search"], - "Marketplaces": ["amazon-product", "amazon-search", - "walmart-product", "walmart-search"], - "Media": ["youtube-search", "youtube-metadata"], - "AI": ["chatgpt"], - "Learn": ["tutorial"], - "Account": ["auth", "logout"], - "Tools": ["usage", "schedule", "export", "docs", "unsafe"], + "Pages": ["scrape", "crawl"], + "Search": ["google", "fast-search"], + "Marketplaces": ["amazon-product", "amazon-search", "walmart-product", "walmart-search"], + "Media": ["youtube-search", "youtube-metadata"], + "AI": ["chatgpt"], + "Learn": ["tutorial"], + "Account": ["auth", "logout"], + "Tools": ["usage", "schedule", "export", "docs", "unsafe"], } for i, (group_name, cmds) in enumerate(groups.items()): if i > 0: @@ -1473,33 +1454,36 @@ def _print_row(cmd: str, desc: str) -> None: err_console.print() 
err_console.print(f" [{BEE_DIM}]REPL[/]") for cmd, desc in [ - (":help, :?", "Show this command list"), - (":clear", "Clear the screen"), - (":view", "Scroll the last command's output (auto-picks crawl.log after crawl; pass a path to view any file)"), + (":help, :?", "Show this command list"), + (":clear", "Clear the screen"), + ( + ":view", + "Scroll the last command's output (auto-picks crawl.log after crawl; pass a path to view any file)", + ), (":set K=V ...", "Set one or more session defaults"), - (":unset K", "Remove a session default ('all' or '*' clears every)"), - (":reset", "Clear every session default"), + (":unset K", "Remove a session default ('all' or '*' clears every)"), + (":reset", "Clear every session default"), (":show, :list", "Show current session defaults"), - ("!", "Run a shell command (requires unsafe mode)"), - (":q, :quit", "Quit the REPL"), + ("!", "Run a shell command (requires unsafe mode)"), + (":q, :quit", "Quit the REPL"), ]: _print_row(cmd, desc) err_console.print() err_console.print(f" [{BEE_DIM}]Shortcuts[/]") for cmd, desc in [ - ("Tab", "Complete (inline if 1 match, popup if many, ghost word otherwise)"), - ("Shift+Tab", "Cycle popup back / toggle Scroll ↔ Select mode"), - ("Esc", "Close the completion popup"), - ("→", "Accept the next word of the ghost suggestion"), - ("End", "Accept the whole ghost suggestion"), - ("↑ / ↓", "Walk history (single-line) / move cursor (multi-line)"), - ("PgUp / PgDn", "Scroll the scrollback buffer up / down"), - ("Ctrl+Home/End","Jump to top / bottom of scrollback"), - ("Ctrl+J", "Insert a newline (multi-line compose; also Alt/Option+Enter)"), - ("Ctrl+W", "Delete the word before the cursor (also Alt/Option+⌫)"), - ("Click", "Open a highlighted path in Finder / default app"), - ("Ctrl+C", "Stop running command / cancel queue / clear multi-line / exit when idle"), - ("Ctrl+D", "Exit the REPL (when no command is running)"), + ("Tab", "Complete (inline if 1 match, popup if many, ghost word 
otherwise)"), + ("Shift+Tab", "Cycle popup back / toggle Scroll ↔ Select mode"), + ("Esc", "Close the completion popup"), + ("→", "Accept the next word of the ghost suggestion"), + ("End", "Accept the whole ghost suggestion"), + ("↑ / ↓", "Walk history (single-line) / move cursor (multi-line)"), + ("PgUp / PgDn", "Scroll the scrollback buffer up / down"), + ("Ctrl+Home/End", "Jump to top / bottom of scrollback"), + ("Ctrl+J", "Insert a newline (multi-line compose; also Alt/Option+Enter)"), + ("Ctrl+W", "Delete the word before the cursor (also Alt/Option+⌫)"), + ("Click", "Open a highlighted path in Finder / default app"), + ("Ctrl+C", "Stop running command / cancel queue / clear multi-line / exit when idle"), + ("Ctrl+D", "Exit the REPL (when no command is running)"), ]: _print_row(cmd, desc) err_console.print() @@ -1570,9 +1554,7 @@ def _open_pager(path: str) -> None: # matches neither, pretty is unavailable and we stick with raw. pretty_text: str | None try: - pretty_text = json.dumps( - json.loads(raw_text), indent=2, ensure_ascii=False - ) + pretty_text = json.dumps(json.loads(raw_text), indent=2, ensure_ascii=False) except Exception: pretty_text = None if pretty_text is None: @@ -1586,6 +1568,7 @@ def _open_pager(path: str) -> None: # type checkers; import via ``importlib`` so the # checker doesn't try to resolve them. import importlib + _etree = importlib.import_module("lxml.etree") _lxml_html = importlib.import_module("lxml.html") tree = _lxml_html.fromstring(raw_text) @@ -1624,11 +1607,11 @@ def _status_line(): # `r` toggles raw on/off. Hidden when there's no pretty version # available (non-JSON content) — there'd be nothing to toggle to. 
toggle_hint = ( - ("r: pretty" if mode[0] == "raw" else "r: raw") - if pretty_text is not None else "" + ("r: pretty" if mode[0] == "raw" else "r: raw") if pretty_text is not None else "" ) right_hint = ( - "↑↓ PgUp/PgDn scroll" + (f" · {toggle_hint}" if toggle_hint else "") + "↑↓ PgUp/PgDn scroll" + + (f" · {toggle_hint}" if toggle_hint else "") + " · q to exit" ) return [ @@ -1710,10 +1693,10 @@ def _right(_e): style = Style.from_dict( { - "pager.bar": f"bg:{_BG_CHIP} {BEE_DIM}", + "pager.bar": f"bg:{_BG_CHIP} {BEE_DIM}", "pager.value": f"bg:{_BG_CHIP} {BEE_YELLOW} bold", "pager.label": f"bg:{_BG_CHIP} {BEE_DIM}", - "pager.hint": f"bg:{_BG_CHIP} {_DIM2}", + "pager.hint": f"bg:{_BG_CHIP} {_DIM2}", } ) @@ -1863,9 +1846,7 @@ def _handle_meta( else: err_console.print() for k, v in state.settings.items(): - err_console.print( - f" [bold {BEE_YELLOW}]{k:<20}[/] [dim]{v}[/]" - ) + err_console.print(f" [bold {BEE_YELLOW}]{k:<20}[/] [dim]{v}[/]") err_console.print() return "ok" if head_low == ":view": @@ -1908,8 +1889,7 @@ def _handle_meta( except Exception as e: err_console.print(f" [bold {BEE_RED}]pager error:[/] {e}") err_console.print( - f" [{BEE_DIM}]full output saved at[/] " - f"[bold {BEE_YELLOW}]{target_path}[/]" + f" [{BEE_DIM}]full output saved at[/] [bold {BEE_YELLOW}]{target_path}[/]" ) return "ok" @@ -1921,9 +1901,7 @@ def _handle_meta( if head_low == ":unset": target = rest.strip() if not target: - err_console.print( - f" [bold {BEE_RED}]usage:[/] :unset KEY | :unset * | :reset" - ) + err_console.print(f" [bold {BEE_RED}]usage:[/] :unset KEY | :unset * | :reset") return "ok" if target in {"*", "all"}: n = len(state.settings) @@ -1935,20 +1913,14 @@ def _handle_meta( for key in keys: if key in state.settings: del state.settings[key] - err_console.print( - f" [{BEE_DIM}]unset[/] [bold {BEE_YELLOW}]{key}[/]" - ) + err_console.print(f" [{BEE_DIM}]unset[/] [bold {BEE_YELLOW}]{key}[/]") else: err_console.print(f" [{BEE_DIM}]not set:[/] {key}") return "ok" if 
head_low == ":set": if not rest.strip(): - err_console.print( - f" [bold {BEE_RED}]usage:[/] :set KEY=VALUE [KEY=VALUE ...]" - ) - err_console.print( - f" [{BEE_DIM}] or:[/] :set --KEY VALUE [--KEY VALUE ...]" - ) + err_console.print(f" [bold {BEE_RED}]usage:[/] :set KEY=VALUE [KEY=VALUE ...]") + err_console.print(f" [{BEE_DIM}] or:[/] :set --KEY VALUE [--KEY VALUE ...]") return "ok" parsed = _parse_set_args(rest) if isinstance(parsed, str): @@ -1961,8 +1933,7 @@ def _handle_meta( for key, value in parsed: if key not in valid_keys: err_console.print( - f" [bold {BEE_RED}]unknown option:[/] " - f"[bold {BEE_YELLOW}]--{key}[/]" + f" [bold {BEE_RED}]unknown option:[/] [bold {BEE_YELLOW}]--{key}[/]" ) suggestion = _suggest(key, valid_keys, threshold=2) if suggestion: @@ -1979,29 +1950,28 @@ def _handle_meta( f" [bold {BEE_RED}]invalid value for[/] " f"[bold {BEE_YELLOW}]--{key}[/][bold {BEE_RED}]:[/] {value}" ) - err_console.print( - f" [{BEE_DIM}] choices:[/] " - + ", ".join(choice_flags[flag]) - ) + err_console.print(f" [{BEE_DIM}] choices:[/] " + ", ".join(choice_flags[flag])) rejected.append(key) continue # Validate bool values if flag in bool_flags and value.lower() not in ( - "true", "false", "yes", "no", "1", "0", "on", "off" + "true", + "false", + "yes", + "no", + "1", + "0", + "on", + "off", ): - err_console.print( - f" [bold {BEE_RED}]--{key} expects a bool, got:[/] {value}" - ) + err_console.print(f" [bold {BEE_RED}]--{key} expects a bool, got:[/] {value}") rejected.append(key) continue state.settings[key] = value applied.append((key, value)) for key, value in applied: - err_console.print( - f" [{BEE_DIM}]set[/] [bold {BEE_YELLOW}]{key}[/] = " - f"[dim]{value}[/]" - ) + err_console.print(f" [{BEE_DIM}]set[/] [bold {BEE_YELLOW}]{key}[/] = [dim]{value}[/]") return "ok" return None @@ -2021,8 +1991,17 @@ def _make_completer( from prompt_toolkit.completion import Completer, Completion meta_cmds = [ - ":help", ":?", ":clear", ":view", ":set", ":unset", ":reset", 
":show", - ":list", ":q", ":quit", + ":help", + ":?", + ":clear", + ":view", + ":set", + ":unset", + ":reset", + ":show", + ":list", + ":q", + ":quit", ] # Precompute the union of every flag known to any command. Used as a @@ -2030,9 +2009,7 @@ def _make_completer( # recognised (typo, in-progress rename, etc.) — without this the # completer would silently stop suggesting anything as soon as the # first word is unknown, which is confusing UX. - _all_known_flags: list[str] = sorted({ - f for flags in command_flags.values() for f in flags - }) + _all_known_flags: list[str] = sorted({f for flags in command_flags.values() for f in flags}) class BeeCompleter(Completer): def get_completions(self, document, complete_event): @@ -2046,9 +2023,7 @@ def get_completions(self, document, complete_event): pool.extend((m, "REPL meta") for m in meta_cmds) for cmd, meta in sorted(pool): if cmd.startswith(partial): - yield Completion( - cmd, start_position=-len(partial), display_meta=meta - ) + yield Completion(cmd, start_position=-len(partial), display_meta=meta) return cmd_name = words[0] @@ -2057,9 +2032,7 @@ def get_completions(self, document, complete_event): # Display "(unknown command)" so they know completions may # not actually apply to what they typed. 
cmd_known = cmd_name in command_flags - flags_for_cmd = ( - command_flags[cmd_name] if cmd_known else _all_known_flags - ) + flags_for_cmd = command_flags[cmd_name] if cmd_known else _all_known_flags ends_with_space = text.endswith(" ") last_word = words[-1] if words else "" # When the buffer ends with a space the user has *finished* @@ -2105,9 +2078,7 @@ def get_completions(self, document, complete_event): meta_label = "" if cmd_known else "(unknown command)" for flag in flags_for_cmd: if flag.startswith(last): - yield Completion( - flag, start_position=-len(last), display_meta=meta_label - ) + yield Completion(flag, start_position=-len(last), display_meta=meta_label) return BeeCompleter() @@ -2292,6 +2263,7 @@ def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): # the live honeycomb directly from ``_progress_state``, so the # scrollback path is no longer needed in REPL mode. from .theme import set_progress_renderer as _set_progress_renderer + _set_progress_renderer(lambda _lines: None) # ── First-run API key state ───────────────────────────────────────────── @@ -2317,8 +2289,11 @@ def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): _buf = StringIO() _c = Console( - file=_buf, force_terminal=True, color_system="truecolor", - highlight=False, width=shutil.get_terminal_size((80, 24)).columns, + file=_buf, + force_terminal=True, + color_system="truecolor", + highlight=False, + width=shutil.get_terminal_size((80, 24)).columns, ) _c.print( f" [{BEE_DIM}]Welcome! 
Enter your API key to get started — " @@ -2522,6 +2497,7 @@ def _open_path(path: str) -> None: """ import platform import subprocess + system = platform.system() try: if system == "Darwin": @@ -2601,9 +2577,7 @@ def _path_exists_cached(path: str) -> bool: exists = False _path_exists_cache[path] = (now, exists) if len(_path_exists_cache) > 512: - cutoff = sorted( - _path_exists_cache.items(), key=lambda kv: kv[1][0] - )[:128] + cutoff = sorted(_path_exists_cache.items(), key=lambda kv: kv[1][0])[:128] for k, _ in cutoff: _path_exists_cache.pop(k, None) return exists @@ -2616,7 +2590,7 @@ def _find_path_at(text: str, start: int) -> tuple[int, str | None]: prefix resolves to an existing path. """ end = start - while end < len(text) and text[end] not in '\n\r"\'<>|`': + while end < len(text) and text[end] not in "\n\r\"'<>|`": end += 1 while end > start: candidate = text[start:end].rstrip(_path_trim_chars) @@ -2748,6 +2722,7 @@ def _styled_with_links( def _has_crawl_status_safe() -> bool: try: from .theme import has_crawl_status + return has_crawl_status() except Exception: return False @@ -2761,6 +2736,7 @@ def _has_active_job_status() -> bool: return True try: from .theme import has_progress_state + return has_progress_state() except Exception: return False @@ -2778,6 +2754,7 @@ def _crawl_status_text() -> list[tuple[str, str]]: """ from . 
import theme as _theme # live module reference from .theme import BEE_WHITE, format_honeycomb_grid, get_crawl_status + cs = get_crawl_status() ps = getattr(_theme, "_progress_state", None) if cs is None and ps is None: @@ -2841,6 +2818,7 @@ def _crawl_status_height() -> D: cs_set = _has_crawl_status_safe() try: from .theme import has_progress_state + ps_set = has_progress_state() except Exception: ps_set = False @@ -2889,6 +2867,7 @@ def _scrollback_render() -> list[tuple[str, str]]: cs_set = _has_crawl_status_safe() try: from .theme import has_progress_state + ps_set = has_progress_state() except Exception: ps_set = False @@ -2987,6 +2966,7 @@ def _active_job_in_progress() -> bool: return True try: from .theme import has_progress_state + return has_progress_state() except Exception: return False @@ -3007,8 +2987,11 @@ def _text_to_fragments(t: Text) -> list: buf = StringIO() _c = Console( - file=buf, force_terminal=True, color_system="truecolor", - highlight=False, width=200, + file=buf, + force_terminal=True, + color_system="truecolor", + highlight=False, + width=200, ) _c.print(t, end="") return list(_tft(_ANSI(buf.getvalue()))) @@ -3138,9 +3121,7 @@ def _handle_first_run_key(key_raw: str, raw_with_ws: str) -> None: try: save_api_key_to_dotenv(key) except Exception as e: - err_console.print( - f" [bold {BEE_RED}]Could not save:[/] [{BEE_DIM}]{e}[/]" - ) + err_console.print(f" [bold {BEE_RED}]Could not save:[/] [{BEE_DIM}]{e}[/]") os.environ[ENV_API_KEY] = key state.api_key_set = True _first_run_needs_key[0] = False @@ -3200,9 +3181,7 @@ def _run() -> None: current_subprocess[0] = None if code != 0: status_ref[0] = "fail" - err_console.print( - f" [{BEE_DIM}]exit code {code}[/]" - ) + err_console.print(f" [{BEE_DIM}]exit code {code}[/]") except KeyboardInterrupt: # Ctrl+C: stop the child if it's still running, then mark # the command as cancelled in the footer. 
@@ -3238,8 +3217,11 @@ def _finish() -> None: _buf = StringIO() _c = Console( - file=_buf, force_terminal=True, color_system="truecolor", - highlight=False, width=200, + file=_buf, + force_terminal=True, + color_system="truecolor", + highlight=False, + width=200, ) _echo_t = Text() _echo_t.append("❯ ", style=BEE_DIM) @@ -3310,9 +3292,7 @@ def _worker() -> None: # reads the file every 100 ms and forwards updates into the # parent's own ``_crawl_status`` so the layout-window crawl status # display keeps showing live URL / fetched count. - def _execute_crawl_subprocess( - crawl_args: list[str], original_line: str, echo_idx: int - ) -> None: + def _execute_crawl_subprocess(crawl_args: list[str], original_line: str, echo_idx: int) -> None: import os as _os import subprocess @@ -3325,8 +3305,7 @@ def _execute_crawl_subprocess( state.run_start = start status_file = ( - Path.home() / ".cache" / "scrapingbee-cli" - / f"crawl-status-{_os.getpid()}.json" + Path.home() / ".cache" / "scrapingbee-cli" / f"crawl-status-{_os.getpid()}.json" ) try: status_file.parent.mkdir(parents=True, exist_ok=True) @@ -3344,8 +3323,13 @@ def _execute_crawl_subprocess( # waiting for the child to fire its first signal. 
try: from .theme import update_crawl_status + update_crawl_status( - current_url=None, fetched=0, queued=0, saved=0, phase="starting", + current_url=None, + fetched=0, + queued=0, + saved=0, + phase="starting", ) except Exception: pass @@ -3446,9 +3430,7 @@ def _run() -> None: status_ref[0] = "stopped" else: status_ref[0] = "fail" - err_console.print( - f" [{BEE_DIM}]exit code {code}[/]" - ) + err_console.print(f" [{BEE_DIM}]exit code {code}[/]") except KeyboardInterrupt: proc = current_subprocess[0] if proc is not None: @@ -3480,6 +3462,7 @@ def _finish() -> None: pass try: from .theme import clear_crawl_status, clear_progress_state + clear_crawl_status() clear_progress_state() except Exception: @@ -3502,8 +3485,11 @@ def _finish() -> None: _buf = StringIO() _c = Console( - file=_buf, force_terminal=True, color_system="truecolor", - highlight=False, width=200, + file=_buf, + force_terminal=True, + color_system="truecolor", + highlight=False, + width=200, ) _echo_t = Text() _echo_t.append("❯ ", style=BEE_DIM) @@ -3518,6 +3504,7 @@ def _finish() -> None: state.last_status = status_ref[0] state.last_duration = duration is_input_locked[0] = False + # Buffer mutations have to run on the prompt_toolkit main # loop thread — this ``_finish`` is on the worker thread, # and calling ``input_buffer.reset()`` from here directly @@ -3604,7 +3591,12 @@ def _execute(line: str) -> bool: # order: command, then its output. 
meta_echo_idx = scrollback.current_length() meta = _handle_meta( - line, state, command_help, all_known_flags, bool_flags, choice_flags, + line, + state, + command_help, + all_known_flags, + bool_flags, + choice_flags, scrollback=scrollback, ) if meta == "ok": @@ -3646,10 +3638,14 @@ def _execute(line: str) -> bool: to_formatted_text as _tft, ) from rich.console import Console + _buf = StringIO() _c = Console( - file=_buf, force_terminal=True, color_system="truecolor", - highlight=False, width=200, + file=_buf, + force_terminal=True, + color_system="truecolor", + highlight=False, + width=200, ) _echo_t = Text() _echo_t.append("❯ ", style=BEE_DIM) @@ -3678,10 +3674,7 @@ def _execute(line: str) -> bool: shell_cmd = line[1:].strip() shell_echo_idx = scrollback.current_length() if not shell_cmd: - err_console.print( - f" [{BEE_DIM}]usage: ![/]" - f"[bold {BEE_YELLOW}][/]" - ) + err_console.print(f" [{BEE_DIM}]usage: ![/][bold {BEE_YELLOW}][/]") else: from .exec_gate import ( is_command_whitelisted, @@ -3715,10 +3708,14 @@ def _execute(line: str) -> bool: to_formatted_text as _tft, ) from rich.console import Console + _buf = StringIO() _c = Console( - file=_buf, force_terminal=True, color_system="truecolor", - highlight=False, width=200, + file=_buf, + force_terminal=True, + color_system="truecolor", + highlight=False, + width=200, ) _echo_t = Text() _echo_t.append("❯ ", style=BEE_DIM) @@ -3732,7 +3729,7 @@ def _execute(line: str) -> bool: # Tolerate users typing `scrapingbee ...` out of muscle memory. 
if line.lower().startswith("scrapingbee "): - line = line[len("scrapingbee "):].strip() + line = line[len("scrapingbee ") :].strip() original_line = line # what to echo after completion @@ -3773,9 +3770,7 @@ def _execute(line: str) -> bool: input_buffer.reset() except Exception: pass - err_console.print( - f" [{BEE_DIM}]Enter your API key below.[/]" - ) + err_console.print(f" [{BEE_DIM}]Enter your API key below.[/]") try: app.invalidate() except Exception: @@ -3803,9 +3798,7 @@ def _execute(line: str) -> bool: # Only inject session defaults that the target command actually # accepts; otherwise ``:set --json-response true`` would also # apply to ``usage``, which rejects it as an unknown option. - args = state.apply_settings_to_args( - args, accepted=set(command_flags.get(cmd_name, [])) - ) + args = state.apply_settings_to_args(args, accepted=set(command_flags.get(cmd_name, []))) # Let users type ``--verbose true|false`` (etc.) in the REPL # too — same normalisation as the CLI ``main()`` entry. 
try: @@ -3813,9 +3806,8 @@ def _execute(line: str) -> bool: collect_bool_flag_names, normalize_bool_flag_args, ) - args = normalize_bool_flag_args( - args, collect_bool_flag_names(cli_group) - ) + + args = normalize_bool_flag_args(args, collect_bool_flag_names(cli_group)) except Exception: pass @@ -3911,8 +3903,11 @@ def _finish() -> None: _buf = StringIO() _c = Console( - file=_buf, force_terminal=True, color_system="truecolor", - highlight=False, width=200, + file=_buf, + force_terminal=True, + color_system="truecolor", + highlight=False, + width=200, ) _echo_t = Text() _echo_t.append("❯ ", style=BEE_DIM) @@ -3930,6 +3925,7 @@ def _finish() -> None: state.last_duration = duration state.refresh_credits_from_cache() is_input_locked[0] = False + # State mutations triggered by auth/logout need to be visible to # the asyncio loop's _usage_refresher and the toolbar render — # both run on the main loop thread while we're in the worker @@ -4154,6 +4150,7 @@ def _ctrl_c(event): if state.is_running and worker is not None and worker.is_alive(): loop = _active_worker_loop[0] if loop is not None: + def _cancel_all_tasks() -> None: try: for task in _asyncio_mod.all_tasks(loop): @@ -4161,6 +4158,7 @@ def _cancel_all_tasks() -> None: task.cancel() except Exception: pass + try: loop.call_soon_threadsafe(_cancel_all_tasks) except Exception: @@ -4175,6 +4173,7 @@ def _cancel_all_tasks() -> None: # running Twisted reactor from outside. try: from .crawl import stop_running_reactor + stop_running_reactor() except Exception: pass @@ -4217,9 +4216,7 @@ def _cancel_all_tasks() -> None: # thread, the docs say to undo it — otherwise we leave a # dangling pending exception on an unrelated thread. if res > 1: - ctypes.pythonapi.PyThreadState_SetAsyncExc( - ctypes.c_ulong(tid), None - ) + ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_ulong(tid), None) except Exception: # ctypes path failed (PyPy? embedded?) — fall back to # exiting; daemon worker dies with the process. 
@@ -4313,10 +4310,12 @@ def _tab_open(event): buf = event.current_buffer try: from prompt_toolkit.completion import CompleteEvent + cmps = list(buf.completer.get_completions(buf.document, CompleteEvent())) except Exception: buf.start_completion(select_first=False) return + # Helper: accept the next word (up to & including next space) of # the ghost-text suggestion, mirroring what Right arrow does. def _accept_ghost_word() -> bool: @@ -4338,9 +4337,7 @@ def _accept_ghost_word() -> bool: # suggestion is showing, prefer advancing into the ghost — # that's what the user actually wants progress on. typed_before = buf.document.text_before_cursor - replaced = ( - typed_before[c.start_position:] if c.start_position < 0 else "" - ) + replaced = typed_before[c.start_position :] if c.start_position < 0 else "" if c.text == replaced and _accept_ghost_word(): return try: @@ -4435,6 +4432,7 @@ def _insert_newline(event): # join with spaces. ``\r`` is always normalised (CR is never a # useful separator in our buffer). from prompt_toolkit.keys import Keys as _Keys + _command_name_set = set(command_names) def _looks_like_command_line(line: str) -> bool: @@ -4459,6 +4457,7 @@ def _bracketed_paste(event): # spaces/tabs to single spaces (handles soft-wrap in the # source rendering), CR to space, and insert normally. import re as _re + # Normalise line endings: CRLF (Windows), CR (classic Mac), and # LF all become a single ``\n``. Treating a lone CR as a space # would silently collapse multi-line paste into one line on @@ -4473,9 +4472,7 @@ def _bracketed_paste(event): # is the dominant intent). The user can edit any line # and Enter submits the batch. 
event.current_buffer.text = "\n".join(non_empty) - event.current_buffer.cursor_position = len( - event.current_buffer.text - ) + event.current_buffer.cursor_position = len(event.current_buffer.text) return text = _re.sub(r"[ \t]+", " ", text) event.current_buffer.insert_text(text) @@ -4669,11 +4666,7 @@ async def _ticker(): # the input lock is clear (previous command done) AND we're # not in the API-key prompt. Pop one per tick so each # command's footer renders before the next starts. - if ( - _pending_commands - and not is_input_locked[0] - and not _first_run_needs_key[0] - ): + if _pending_commands and not is_input_locked[0] and not _first_run_needs_key[0]: next_cmd = _pending_commands.pop(0) try: if history is not None: @@ -4755,8 +4748,7 @@ async def _do_usage_refresh() -> None: credits = cached.get("credits") used_credit = ( int(max_credit) - int(credits) - if isinstance(max_credit, (int, float)) - and isinstance(credits, (int, float)) + if isinstance(max_credit, (int, float)) and isinstance(credits, (int, float)) else None ) synthetic = { diff --git a/src/scrapingbee_cli/theme.py b/src/scrapingbee_cli/theme.py index 32926bf..d8c2336 100644 --- a/src/scrapingbee_cli/theme.py +++ b/src/scrapingbee_cli/theme.py @@ -389,6 +389,7 @@ def _maybe_mirror_to_status_file() -> None: return try: import json as _json + payload: dict = {} if _crawl_status is not None: payload.update(_crawl_status) @@ -480,8 +481,11 @@ def tick_crawl_render() -> None: for row in lines_text: buf = io.StringIO() _c = Console( - file=buf, force_terminal=True, color_system="truecolor", - highlight=False, width=200, + file=buf, + force_terminal=True, + color_system="truecolor", + highlight=False, + width=200, ) _c.print(row, end="") rendered.append(buf.getvalue()) @@ -593,8 +597,11 @@ def tick_progress_render() -> None: for row in rows: buf = io.StringIO() _c = Console( - file=buf, force_terminal=True, color_system="truecolor", - highlight=False, width=200, + file=buf, + force_terminal=True, + 
color_system="truecolor", + highlight=False, + width=200, ) _c.print(row, end="") rendered.append(buf.getvalue()) @@ -654,8 +661,6 @@ def _render_inline_bee(frame_idx: int) -> Text: return text - - # -- Styled output helpers --------------------------------------------------- @@ -1112,5 +1117,3 @@ def echo_bee_error(status_code: int, fallback_msg: str = "") -> None: err_console.print(f" [dim]Tip: {tip}[/dim]") else: echo_error(fallback_msg or f"Error: HTTP {status_code}") - - From 059443ba6df7297b091f0ab1d7360c7fd1dbaf82 Mon Sep 17 00:00:00 2001 From: Sahil Sunny Date: Wed, 20 May 2026 09:56:57 +0530 Subject: [PATCH 15/15] fix(repl): don't hang on existing --output-file path confirm_overwrite() called click.confirm() when the target file existed, which reads from sys.stdin directly. In REPL mode prompt_toolkit owns the TTY (full-screen / alt-buffer) and never forwards keystrokes to stdin, so the prompt blocked forever and the REPL appeared frozen. When is_repl_mode() is true, raise a UsageError telling the user to re-run with --overwrite instead of attempting to prompt. --- src/scrapingbee_cli/cli_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/scrapingbee_cli/cli_utils.py b/src/scrapingbee_cli/cli_utils.py index 3719b7e..d4814e8 100644 --- a/src/scrapingbee_cli/cli_utils.py +++ b/src/scrapingbee_cli/cli_utils.py @@ -320,6 +320,13 @@ def confirm_overwrite(path: str | None, overwrite: bool = False) -> None: from pathlib import Path if Path(path).exists() and not overwrite: + # In REPL mode, prompt_toolkit owns the TTY (full-screen / alt-buffer), + # so click.confirm reads from sys.stdin and blocks forever. Surface + # the conflict as an error and tell the user to pass --overwrite. + if is_repl_mode(): + raise click.UsageError( + f"'{path}' already exists. Re-run with --overwrite to replace it." + ) if not click.confirm(f"'{path}' already exists. Overwrite?"): click.echo("Cancelled.", err=True) raise SystemExit(0)