Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added playwright support and passed the tests #1376

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 102 additions & 2 deletions edsl/scenarios/FileStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@
import tempfile
import mimetypes
import os
from typing import Dict, Any, IO, Optional
from typing import Dict, Any, IO, Optional, Literal, List
import requests
from urllib.parse import urlparse
import time
import subprocess
import google.generativeai as genai

from edsl import Scenario
from edsl import Scenario, ScenarioList
from edsl.utilities.decorators import add_edsl_version, remove_edsl_version
from edsl.utilities.utilities import is_notebook
import asyncio


def view_docx(docx_path):
Expand Down Expand Up @@ -541,6 +543,104 @@ def create_link(self, custom_filename=None, style=None):

return ConstructDownloadLink(self).create_link(custom_filename, style)

@classmethod
async def _async_screenshot(
    cls,
    url: str,
    full_page: bool = True,
    wait_until: Literal[
        "load", "domcontentloaded", "networkidle", "commit"
    ] = "networkidle",
    download_path: Optional[str] = None,
) -> "FileStore":
    """Capture a PNG screenshot of ``url`` with Playwright's Chromium.

    Args:
        url: Address of the page to open.
        full_page: Forwarded to ``page.screenshot``; when true the whole
            scrollable page is captured rather than just the viewport.
        wait_until: Navigation event ``page.goto`` waits for before the
            screenshot is taken.
        download_path: Where to write the PNG. When omitted, a unique
            ``screenshot_<timestamp>_<random>.png`` file is created in the
            current working directory.

    Returns:
        An instance of ``cls`` built from the written file with MIME type
        ``image/png``.

    Raises:
        ImportError: If Playwright is not installed.
    """
    import uuid

    try:
        from playwright.async_api import async_playwright
    except ImportError as err:
        raise ImportError(
            "Screenshot functionality requires additional dependencies.\n"
            "Install them with: pip install 'edsl[screenshot]'"
        ) from err

    if download_path is None:
        # A one-second timestamp alone is not unique: concurrent captures
        # started within the same second would overwrite each other's file,
        # so a random suffix is appended.
        download_path = os.path.join(
            os.getcwd(),
            f"screenshot_{int(time.time())}_{uuid.uuid4().hex[:8]}.png",
        )

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until=wait_until)
            await page.screenshot(path=download_path, full_page=full_page)
        finally:
            # Close the browser even when navigation or capture fails.
            await browser.close()

    return cls(download_path, mime_type="image/png")

@classmethod
def from_url_screenshot(cls, url: str, **kwargs) -> "FileStore":
    """Take a screenshot of ``url`` synchronously.

    Blocking wrapper around ``cls._async_screenshot``. It can be called
    repeatedly, and also from code that already runs inside an event loop
    (for example a Jupyter notebook cell).

    Args:
        url: Address of the page to capture.
        **kwargs: Forwarded unchanged to ``_async_screenshot``
            (``full_page``, ``wait_until``, ``download_path``).

    Returns:
        Whatever ``_async_screenshot`` returns for this URL.
    """
    import asyncio
    import concurrent.futures

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: asyncio.run creates a fresh
        # loop per call and tears it down, so repeated calls never reuse a
        # closed loop.
        return asyncio.run(cls._async_screenshot(url, **kwargs))

    # A loop is already running in this thread, and a running loop cannot
    # be re-entered with run_until_complete. Run the coroutine on its own
    # loop in a worker thread and block until it finishes.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(asyncio.run, cls._async_screenshot(url, **kwargs))
        return future.result()

@classmethod
def batch_screenshots(cls, urls: List[str], **kwargs) -> "ScenarioList":
    """
    Take screenshots of multiple URLs concurrently.

    Args:
        urls: List of URLs to screenshot
        **kwargs: Additional arguments passed to screenshot function
            (full_page, wait_until, etc.). If ``download_path`` is given
            and there is more than one URL, it is used as a template and
            each screenshot is written to ``<stem>_<index><ext>`` so the
            concurrent captures do not overwrite one another.

    Returns:
        ScenarioList containing FileStore objects with their corresponding
        URLs. URLs whose screenshot failed are reported on stdout and
        omitted from the result.
    """
    import asyncio
    import concurrent.futures

    from edsl import ScenarioList

    urls = list(urls)
    if not urls:
        return ScenarioList([])

    # Give every capture its own output file. Sharing one path (explicit,
    # or a same-second default timestamp) would make the concurrent
    # screenshots clobber each other.
    base_path = kwargs.pop("download_path", None)
    if base_path is None:
        base_path = os.path.join(
            os.getcwd(), f"screenshot_{int(time.time())}.png"
        )
    if len(urls) == 1:
        paths = [base_path]
    else:
        stem, ext = os.path.splitext(base_path)
        paths = [f"{stem}_{index}{ext}" for index in range(len(urls))]

    async def _capture_all():
        # return_exceptions=True keeps one failing URL from aborting the rest.
        return await asyncio.gather(
            *(
                cls._async_screenshot(url, download_path=path, **kwargs)
                for url, path in zip(urls, paths)
            ),
            return_exceptions=True,
        )

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread: use a fresh loop for this call.
        results = asyncio.run(_capture_all())
    else:
        # Already inside a running loop (e.g. Jupyter), which cannot be
        # re-entered; run the batch on its own loop in a worker thread.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            results = pool.submit(asyncio.run, _capture_all()).result()

    successful_results = []
    for url, result in zip(urls, results):
        # BaseException also covers asyncio.CancelledError, which is not an
        # Exception subclass on current Python versions.
        if isinstance(result, BaseException):
            print(f"Failed to screenshot {url}: {result}")
        else:
            successful_results.append(
                Scenario({"url": url, "screenshot": result})
            )

    return ScenarioList(successful_results)


class CSVFileStore(FileStore):
@classmethod
Expand Down
26 changes: 26 additions & 0 deletions edsl/utilities/setup_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import subprocess
import sys
import os


class PlaywrightInstallPlugin:
    """Installs the Chromium browser that Playwright drives."""

    def install_browsers(self):
        """Run ``python -m playwright install chromium`` and print the outcome.

        The command is executed with the current interpreter. Failures are
        reported on stdout; exceptions derived from ``Exception`` are caught
        and printed rather than propagated.
        """
        install_cmd = [sys.executable, "-m", "playwright", "install", "chromium"]

        print("Installing Playwright browsers...")
        try:
            completed = subprocess.run(
                install_cmd, check=True, capture_output=True, text=True
            )
            print("Successfully installed Playwright browsers")
            if completed.stdout:
                print(f"Output: {completed.stdout}")
        except subprocess.CalledProcessError as err:
            # Non-zero exit status: show whatever the installer printed.
            print(f"Failed to install Playwright browsers: {err}")
            if err.stdout:
                print(f"Output: {err.stdout}")
            if err.stderr:
                print(f"Error: {err.stderr}")
        except Exception as err:
            print(f"Unexpected error during Playwright installation: {err}")
42 changes: 41 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ mistralai = "^1.0.2"
urllib3 = ">=1.25.4,<1.27"
google-generativeai = "^0.8.2"
tabulate = "^0.9.0"
playwright = { version = "^1.40.0", optional = true }

[tool.poetry.dependencies.black]
extras = ["jupyter"]
Expand Down Expand Up @@ -81,3 +82,15 @@ uvicorn = "^0.30.6"

[tool.tomlsort.overrides."tool.poetry.dependencies"]
table_keys = false

[tool.poetry.group.screenshot]
optional = true

[tool.poetry.group.screenshot.dependencies]
playwright = "^1.40.0"

[tool.poetry.plugins."poetry.application.plugin"]
install-playwright-browsers = "edsl.utilities.setup_utils:PlaywrightInstallPlugin"

[tool.poetry.extras]
screenshot = ["playwright"]
Loading